The EnvStats
package should be installed for functions that are called directly. Generating simulation datasets sampled from skewed distributions requires the sn
package. ggplot2
is used for plotting of results.
require(ggplot2)
require(sn)
require(parallel)
The below function to subsample each population (for subsamples where n ≥ 3) and use a modified Two One-Sided T-test (TOST) procedure to test for statistical equivalence between the subsample and source populations. If the total number of unique subsamples nsub
is ≤ 10 000 (i.e. when n is low), all combinations are tested. If nsub
is > 10 000, 10 000 random combinations are tested. First a chenTTest
(from the EnvStats
package) is tried, which returns NA if skew is not detected. Where skew is not detected a standard t-test is used instead. Equivalence between variances is calculated using two one-sided chi-squared tests (EnvStats::varTest
).
As demonstrated in the below examples, argument x
should be either a vector of trait values or a single numeric index for simulated populations, for which the argument set
must also be defined. The equiv_margin
argument should be a single number indicating the desired equivalence margin for testing (e.g. equiv_margin = 0.25
for ‘negligble error’, ±0.25 °C, or equiv_margin = 1
for ‘moderate error’, ±1.00 °C).
TOSTer <- function(x, equiv_margin, set = NULL) {
if (is.null(set))
{
spacdat <- x
} #if used on empirical data (i.e. one vector of CT data)
if (is.numeric(set)) {
spacdat <- skew_dists[[set]][x][[1]]
}
# if used on simulated data where the 'set' is required
# for subsetting
ChenTOST <- function(sub, spacdat, full_mn, full_sd, equiv_margin) {
# modified CHEN TOST procedure run all combinations
# if < 10 000 possible combinations
if (choose(length(spacdat), sub) < 10000) {
nsub <- choose(length(spacdat), sub)
comX <- combn(1:length(spacdat), sub)
smpl_mn <- split(comX, rep(1:ncol(comX), each = nrow(comX)))
} else {
# if there are more than 10 000 possible
# combinations, run a random subset of 10 000
nsub <- 10000
smpl_mn <- lapply(1:nsub, function(x) {
sample(1:length(spacdat), sub)
})
}
equiv <- sapply(smpl_mn, function(x) {
tryCatch({
# EnvStats::chenTTest #two one-sided
# t-tests for mean equivalence try a Chen
# t-test on one end of distribution
mn_upr <- EnvStats::chenTTest(spacdat[x], alternative = "greater",
mu = full_mn - equiv_margin)$p.value[3]
# if data are not sufficiently skewed for
# Chen t-test, run a standard t-test
# instead
if (is.na(mn_upr)) {
mn_upr <- t.test(spacdat[x], alternative = "greater",
mu = full_mn - equiv_margin)$p.value
}
mn_lwr <- t.test(spacdat[x], alternative = "less",
mu = full_mn + equiv_margin)$p.value #standard t-test for other end
mn_rez <- c(mn_upr, mn_lwr)
# EnvStats::varTest
var_upr <- EnvStats::varTest(spacdat[x], alternative = "greater",
sigma.squared = ifelse((full_sd - equiv_margin) >
0, (full_sd - equiv_margin)^2, 1e-05), conf.level = 0.95)$p.value
var_lwr <- EnvStats::varTest(spacdat[x], alternative = "less",
sigma.squared = (full_sd + equiv_margin)^2,
conf.level = 0.95)$p.value
var_rez <- c(var_upr, var_lwr)
mnvar_equiv <- c(NA, NA)
ifelse(sum(is.na(mn_rez)) > 0, {
mnvar_equiv[1] <- NA
}, {
mnvar_equiv[1] <- max(mn_rez) < 0.05
})
ifelse(sum(is.na(var_rez)) > 0, {
mnvar_equiv[2] <- NA
}, {
mnvar_equiv[2] <- max(var_rez) < 0.05
})
return(mnvar_equiv)
# error handling if TOST procedure fails
# (e.g. if data contain too many non-unique
# values)\t
}, error = function(z) {
return(c(NA, NA))
})
})
mn_eq <- equiv[1, ]
var_eq <- equiv[2, ]
# calculate proportion of succesful tests
return(c(length(mn_eq[which(mn_eq == T)])/sum(is.finite(mn_eq)),
length(var_eq[which(var_eq == T)])/sum(is.finite(var_eq))))
}
res <- do.call(cbind, lapply(3:length(spacdat), ChenTOST,
spacdat = spacdat, full_mn = mean(spacdat), full_sd = sd(spacdat),
equiv_margin = equiv_margin)) #only doing subsets where n > 3
res2return <- cbind(1:length(spacdat), t(cbind(matrix(NA,
nrow = 2, ncol = 2), res)))
colnames(res2return) <- c("n", "mn_eq", "var_eq")
return(data.frame(res2return))
}
Example using CTmax data from Desoria trispinata, passed as a vector of individual CT values.
desoria_ctmax <- c(37.5, 37.5, 37.5, 37.5, 37.5, 37.5, 37.5,
37.5, 37.5, 37.9, 37.9, 37.9, 37.9, 37.9, 38.2, 38.2, 38.2,
37.7, 37.7, 37.7, 37.8, 37.8, 37.8, 37.8, 37.8, 37.8, 37.8,
37.8, 37.8, 37.8, 37.8, 37.8, 37.8, 37.8, 37.8, 37.6, 37.6,
37.6, 37.6, 37.6, 37.6, 37.6, 37.6, 37.6, 38, 38, 38, 38.4,
38.4, 38.4, 38.4)
des_max_TOSTed_0p25 <- TOSTer(desoria_ctmax, equiv_margin = 0.25,
set = NULL)
# run TOSTer function on vector of CT values with a 0.25 °C
# equivalence margin.
head(des_max_TOSTed_0p25)
## n mn_eq var_eq
## 1 1 NA NA
## 2 2 NA NA
## 3 3 0.1138915 0.1458140
## 4 4 0.2454481 0.4070013
## 5 5 0.4122007 0.4655915
## 6 6 0.5349605 0.4944483
ggplot(data = des_max_TOSTed_0p25, aes(x = n, y = mn_eq)) + geom_point() +
labs(x = "Subsample size (n)", y = "Proportion equivalent") +
ggtitle("Equivalence of means")
ggplot(data = des_max_TOSTed_0p25, aes(x = n, y = var_eq)) +
geom_point() + labs(x = "Subsample size (n)", y = "Proportion equivalent") +
ggtitle("Equivalence of variances")
Example generating and using simulated data. These data are parameterised using mean, variance, and skew values from the published literature, which can be replaced/updated using preliminary studies on the species and trait of interest. For demonstration purposes, seeds are generated for 50 reproducible randomly generated populations.
pop_seeds <- c(809560L, 854479L, 16380L, 856738L, 42793L, 949638L,
349154L, 680772L, 904048L, 32917L, 846553L, 23701L, 673766L,
538887L, 373079L, 888918L, 244202L, 598545L, 671795L, 530864L,
944901L, 489945L, 21730L, 380536L, 854547L, 482305L, 628216L,
892896L, 198276L, 988513L, 296083L, 262567L, 209831L, 76535L,
257664L, 198760L, 523672L, 174167L, 129254L, 916874L, 288318L,
408110L, 212368L, 18343L, 963653L, 102179L, 652356L, 779200L,
109124L, 253919L)
Set parameters for source population(s) and equivalence testing. Change as required.
mn <- 40.0296951 #mean of population(s)
stdev <- 1 #standard deviation of population(s)
skews <- c(0, 1, 2, 10, 50) #skew of population(s)
equiv_margin <- 0.25 #equivalence margin for equivalence testing
Generate randomly drawn population(s) comprising 100 individuals from normal/skewed distributions using the sn
package.
skew_dists <- lapply(skews, function(skw, stdev, seeds) {
lapply(seeds, function(seed) {
set.seed(seed)
rsn(100, xi = mn, omega = stdev, alpha = skw)
})
}, stdev = stdev, seeds = pop_seeds)
Run the TOSTer
function across all simulated populations. Simulation and testing will take a considerable amount of time (~16hrs on a 2.6Ghz quad-core desktop). The process can be distributed across cores and run in parallel using parallel::mclapply
if desired (substitute in mclapply
for one of the lapply
loops in the TOSTer
function). Results will be outputted to a list, with each element containing results for each skew variant.
tost_res <- lapply(1:length(skews), function(x) {
do.call(rbind, lapply(1:length(pop_seeds), TOSTer, equiv_margin = equiv_margin,
set = x))
})
# name each element in the list for future reference
names(tost_res) <- paste0("sd", sub("\\.", "p", stdev), "_skw",
sub("\\.", "p", skews), "_eqm", sub("\\.", "p", equiv_margin))
Summarise and format data for plotting.
plot_dat <- do.call(rbind, lapply(1:length(skews), function(x) {
data.frame(nsamp = 1:100, mn = sapply(1:100, function(n) {
mean(tost_res[[x]][which(tost_res[[x]][, "n"] == n),
"mn_eq"])
}), stdev = sapply(1:100, function(n) {
sd(tost_res[[x]][which(tost_res[[x]][, "n"] == n), "mn_eq"])
}), grp = skews[x])
}))
Create plot from plot_dat
.
ggplot(data = plot_dat, mapping = aes(x = nsamp, y = mn, ymin = mn -
stdev, ymax = mn + stdev, fill = as.factor(grp))) + geom_ribbon(alpha = 0.5) +
geom_line() + labs(x = "Subsample size (n)", y = "Proportion equivalent",
fill = "skew")