Jean Baptiste, a botanist, seeks to develop a method for retrieving a black tulip from the Azores islands that can survive a specified duration at sea and be presented to his beloved. He requires your assistance in formulating the optimal procedure, and has provided you with his castle and gardens for the task.
PURPOSE Determine the parameters to fit our model
OUTCOME The variance was found to be 13.
EXPLANATION
The pilot data requested was 10*4 due to 2 gardens and 2 types of roses (10 each). The pilot data will be used to estimate variance in the control group.
data <- read.csv("G6.pilot.data.csv")
totvar = var(data$tot.vase.days)
#Convert garden and species to factor variables
fac_col <- c("garden", "species")
data[, fac_col] <- lapply(data[fac_col], factor)
sum_data <- data %>% group_by(garden, species) %>%
summarise(items = length(flowerID),
mean = mean(tot.vase.days),
max = max(tot.vase.days),
min = min(tot.vase.days),
sd = sd(tot.vase.days),
var = var(tot.vase.days)) %>% as.data.frame()
sum_data
| garden | species | items | mean | max | min | sd | var |
|---|---|---|---|---|---|---|---|
| 1 | 1 | 10 | 15.5 | 20 | 6 | 4.4Â | 19.4Â |
| 1 | 2 | 10 | 16.3 | 25 | 11 | 3.83 | 14.7Â |
| 2 | 1 | 10 | 13.7 | 20 | 8 | 3.5Â | 12.2Â |
| 2 | 2 | 10 | 14.4 | 19 | 11 | 2.22 | 4.93 |
tt=ttheme_minimal(base_size = 9)
tab_general <- data %>%
summarise(items = length(flowerID),
mean = mean(tot.vase.days),
max = max(tot.vase.days),
min = min(tot.vase.days),
sd = round(sd(tot.vase.days),3),
var = round(var(tot.vase.days),3)) %>% as.data.frame() %>% gridExtra::tableGrob(, theme=tt)
tab_garden <- data %>% group_by(garden) %>%
summarise(items = length(flowerID),
mean = mean(tot.vase.days),
max = max(tot.vase.days),
min = min(tot.vase.days),
sd = round(sd(tot.vase.days),3)) %>% as.data.frame() %>% gridExtra::tableGrob(, theme=tt)
tab_species <- data %>% group_by(species) %>%
summarise(items = length(flowerID),
mean = mean(tot.vase.days),
max = max(tot.vase.days),
min = min(tot.vase.days),
sd=round(sd(tot.vase.days),3)) %>% as.data.frame() %>% gridExtra::tableGrob(, theme=tt)
attach(data)
plot_gen <- ggplot(data, aes(x = tot.vase.days, fill = "violetred"))+
geom_histogram(color="#e9ecef", alpha=0.7, position = 'identity', binwidth = 1) +
scale_x_continuous(breaks=seq(6,25,1)) +
theme(legend.position = "none") +
xlab("Shelf life") +
ggtitle("Shelf life of roses")
plot_gard <- ggplot(data, aes(x = tot.vase.days, fill = garden))+
geom_histogram(color="#e9ecef", alpha=0.7, position = 'identity', binwidth = 1) +
scale_x_continuous(breaks=seq(6,25,1)) +
theme(legend.position = c(0.9,0.9)) +
xlab("Shelf life") +
ggtitle("Shelf life per garden")+
facet_grid(rows = vars(garden))+
scale_fill_manual(name='garden',values=c("violetred","cyan3"),labels=c("North","South"))
plot_specie <- ggplot(data, aes(x = tot.vase.days, fill = species))+
geom_histogram(color="#e9ecef", alpha=0.7, position = 'identity', binwidth = 1) +
scale_x_continuous(breaks=seq(6,25,1)) +
theme(legend.position = c(0.9,0.9)) +
xlab("Shelf life") +
ggtitle("Shelf life per species")+
facet_grid(rows = vars(species))+
scale_fill_manual(name='species',values=c("lightskyblue","lightsalmon"),labels=c("Fl","HT"))
## Box plots
bx_garden <- ggplot(data, aes(x = factor(garden), y = tot.vase.days)) +
geom_boxplot(fill = "darkgoldenrod1") +
labs(x = "Garden", y = "Shelf life (days)", title = "Distribution of by garden")
bx_species <- ggplot(data, aes(x = factor(species), y = tot.vase.days)) +
geom_boxplot(fill = "peachpuff") +
labs(x = "Species", y = "Shelf life (days)", title = "Distribution by species")
gridExtra::grid.arrange(plot_gen, tab_general, nrow = 1)
gridExtra::grid.arrange(arrangeGrob(plot_specie, plot_gard,
tab_species, tab_garden,
nrow = 2))
gridExtra::grid.arrange(grob =
bx_species, bx_garden, nrow =1, ncol=2)
We are going to use the variance 12.8455128 to calculate our sample size.
PURPOSE See if the data follows a Poisson distribution.
OUTCOME The data follows a Poisson distribution.
EXPLANATION The null hypothesis of the goodness-of-fit test for a Poisson distribution is that the observed data follows a Poisson distribution. The alternative hypothesis is that the observed data does not follow a Poisson distribution.
In this case, we do not have enough evidence to reject the null hypothesis that the data follows a poisson distribution.
gf <- goodfit(data$tot.vase.days, type = "poisson")
summary(gf)
Goodness-of-fit test for poisson distribution
X^2 df P(> X^2)
Likelihood Ratio 19.47772 12 0.07763355
plot(gf, type = "standing", scale = "raw")
Fitting the model we can conclude that the species and gardens do not affect the outcome.
model <- glm(formula = tot.vase.days ~ garden + species + rater, family = "poisson", data = data)
summary(model)
Call:
glm(formula = tot.vase.days ~ garden + species + rater, family = "poisson",
data = data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.7404701 0.1268337 21.607 <2e-16 ***
garden2 -0.1237099 0.0819240 -1.510 0.131
species2 0.0500623 0.0820294 0.610 0.542
rater 0.0002348 0.0508516 0.005 0.996
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 34.568 on 39 degrees of freedom
Residual deviance: 31.906 on 36 degrees of freedom
AIC: 220.91
Number of Fisher Scoring iterations: 4
The variables with high p-values indicate that they are not significantly related to the response variable (tot.vase.days) in the model. However, it is important to consider whether these variables are theoretically or practically important to include in the model.
The fixed effects represent the effects of variables that are assumed to have a constant effect on the outcome variable, while the random effects represent the effects of variables that have a varying effect on the outcome variable across groups or individuals.
model <- glmer(tot.vase.days ~ garden + species + (1|rater), data=data, family="poisson")
summary(model)
Generalized linear mixed model fit by maximum likelihood (Laplace
Approximation) [glmerMod]
Family: poisson ( log )
Formula: tot.vase.days ~ garden + species + (1 | rater)
Data: data
AIC BIC logLik deviance df.resid
220.9 227.7 -106.5 212.9 36
Scaled residuals:
Min 1Q Median 3Q Max
-2.41333 -0.45883 -0.07386 0.62507 2.15547
Random effects:
Groups Name Variance Std.Dev.
rater (Intercept) 0 0
Number of obs: 40, groups: rater, 3
Fixed effects:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.74096 0.07000 39.157 <2e-16 ***
garden2 -0.12370 0.08187 -1.511 0.131
species2 0.05009 0.08174 0.613 0.540
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Correlation of Fixed Effects:
(Intr) gardn2
garden2 -0.549
species2 -0.599 0.000
optimizer (Nelder_Mead) convergence code: 0 (OK)
boundary (singular) fit: see help('isSingular')
Using power_Poisson to compute the sample size for a test of two sample means with Poisson distribution.
# Effect size
d = 2
# Significance level
alpha = 0.05
# Power level
power = 0.80
#Variance
lb1 = 13
# Calculate sample size
p_test <- power_Poisson(n1 = NULL, n2 = NULL,
power = 0.8,
sig.level = 0.05,
lambda1 = lb1, lambda2 = lb1 +d,
equal.sample = TRUE,
alternative = "one.sided")
p_test
Two-sample Poisson Ratio Tests (Equal Sizes)
N = 44.30537
lambda1 = 13
lambda2 = 15
sig.level = 0.05
power = 0.8
alternative = one.sided
NOTE: N is number in *each* group
Rounding up to the nearest whole number, we need a sample size of 44.3053746 roses in each group (control and each treatment) for this experiment. This is the value
In this case, lambda is the mean number of events per unit of
observation (in this case, per rose) and is used to define the Poisson
distribution from which the number of events is simulated.
Nvec is the range of sample sizes to consider.
CODE FOR SIMULATIONS
##FUNCTION
Poisson_sample <- function(n_grid,
lambda0 = 13,
lambda1 = 15,
ngroups = 14,
alpha =0.05,
test = "two_sided",
n_sims = 10000,
seed_nr = 1234,
method){
#Where the power calculation is going to be stored.
power_vec <- matrix(nrow = 1, ncol = length(n_grid))
for (j in 1:length(n_grid)){
# 1. Choose sample size per group
N <- n_grid[j]
alpha = alpha
# 2. Select parameters
lambda.control = lambda0
for (group_id in 1:ngroups){
lambda.group = paste0("lambda.treated",group_id)
assign(lambda.group, lambda1)}
# 3. Simulate experiments and test
numberSimulation <- n_sims
pval <- matrix(0, nrow = numberSimulation, ncol = ngroups)
zval <- matrix(0, nrow = numberSimulation, ncol = ngroups)
set.seed(seed_nr)
for (i in 1:numberSimulation){
# We simulate from Poisson distribution
controlGroup <- rpois(N, lambda = lambda.control)
treatedGroup <- vector()
for (group_id in 1:ngroups){
lambda.group = paste0("lambda.treated",group_id)
treatedGroup = c(treatedGroup, rpois(N, lambda = get(lambda.group)))}
simData <- data.frame(response = c(controlGroup, treatedGroup),
treatment = rep(c(0, 1:ngroups), each = N))
# We use a GLM model for Poisson regression to test effect of treatment
# (Wald test)
glm_fit <- summary(glm(response ~ as.factor(treatment), data = simData, family=poisson()))
pval[i,] <- glm_fit$coeff[-1, "Pr(>|z|)"]
zval[i,] <- glm_fit$coeff[-1, "z value"]
#The sign of the z-value indicates the direction of the effect of the treatment compared to the control group. A positive z-value indicates that the treatment group has a higher outcome than the control group, while a negative z-value indicates that the treatment group has a lower outcome than the control group.
# If test is set to "greater" and the z-value is greater than 0, it means that we are testing if the treatment has a greater effect than the control group, and the p-value needs to be divided by 2 because it is a one-tailed test
for (k in 1:ngroups){
if (test == "greater" & zval[i,k] > 0){pval[i,k] <- pval[i,k]/2}
if (test == "greater" & zval[i,k] < 0){pval[i,k] <- 1 - (pval[i,k]/2)}
if (test == "less" & zval[i,k] < 0){pval[i,k] <- pval[i,k]/2 }
if (test == "less" & zval[i,k] > 0){pval[i,k] <- 1 - (pval[i,k]/2)}
}
# Multiplicity adjustment(s)
if (method == "Bonferroni"){pval[i,] = p.adjust(pval[i,], method = "bonferroni")}
if (method == "Holm"){ pval[i,] = p.adjust(pval[i,], method = "holm")}
if (method == "Benjamini-Hochberg"){pval[i,] = p.adjust(pval[i,], method = "BH")}
}
# 4. Estimate power
power_vec[j] = mean(apply(pval, 2, FUN = function(x){sum(x < alpha)}))/numberSimulation
}
df_power<- data.frame(SS = n_grid, Power = t(power_vec))
return(df_power)
}
ss_at_80 <- function(sampleSize, target_power = 0.8){
# define the power level to interpolate
target_power <- target_power
# interpolate the sample size at the target power level
ss_at_target_power <- approx(sampleSize$Power, sampleSize$SS, xout = target_power)$y
return(round(ss_at_target_power, 2))
}
powerplot <- function(sampleSize, extra = "", power = 0.8){
x.extra = ss_at_80(sampleSize)
y.extra = power
size_line = 1
label = paste("Sample size vs Power ", extra)
sublabel = paste("Sample size ", x.extra)
plotCurve <- ggplot(sampleSize, aes(x=SS,y=Power)) +
geom_line( color = "black", size = size_line) +
geom_point(color = "blue")+
geom_point(aes(x=x.extra, y = y.extra), color = "orange")+
geom_hline(yintercept = 0.8)+
ggtitle(label ) +
labs(title = label,
subtitle = sublabel)+
ylab("Power") +
xlab("Sample Size") +
theme_ipsum()
return(plotCurve)
}
Fixed parameters
n_sim <- 10000
#number of groups
ngroups <- 14
#Type of test
test = "greater"
#Parameters
ES = 2 #Effect Size
lambda0 = 13
lambda1 = lambda0 + ES
alpha = 0.05
n_grid <- seq(from = 10, to = 100, by = 10)
samplesizeBon <- Poisson_sample(n_grid,lambda0,lambda1,ngroups,alpha,test = "greater",n_sims = 10000, seed_nr = 1234, method = "bonferroni")
powerplot(samplesizeBon, extra = 'Method: Bonferroni')
samplesizeBon <-data.frame(format(round(t(samplesizeBon),2),nsmall = 2))
samplesizeBon
| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 |
|---|---|---|---|---|---|---|---|---|---|
| 10.00 | 20.00 | 30.00 | 40.00 | 50.00 | 60.00 | 70.00 | 80.00 | 90.00 | 100.00 |
| 0.33 | 0.52 | 0.66 | 0.77 | 0.85 | 0.90 | 0.93 | 0.96 | 0.97 | 0.98 |
n_grid <- seq(from = 50, to = 100, by = 10)
samplesizeHolmBon <- Poisson_sample(n_grid,lambda0,lambda1,ngroups,alpha,test = "greater",n_sims = 10000, seed_nr = 1234, method = "Holm")
powerplot(samplesizeHolmBon, "Method: Holm - Bonferroni")
samplesizeHolmBon <-as.data.frame(format(round(t(samplesizeHolmBon),2),nsmall = 2))
samplesizeHolmBon
| V1 | V2 | V3 | V4 | V5 | V6 |
|---|---|---|---|---|---|
| 50.00 | 60.00 | 70.00 | 80.00 | 90.00 | 100.00 |
| 0.62 | 0.72 | 0.81 | 0.87 | 0.92 | 0.95 |
n_grid <- seq(from = 10, to = 100, by = 10)
samplesizeBH <- Poisson_sample(n_grid,lambda0,lambda1,ngroups,alpha,test = "greater",n_sims = 10000, seed_nr = 1234, method = "BH")
powerplot(samplesizeBH, "Method: BH")
samplesizeBH <-data.frame(format(round(t(samplesizeBH),2),nsmall = 2))
samplesizeBH
| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 |
|---|---|---|---|---|---|---|---|---|---|
| 10.00 | 20.00 | 30.00 | 40.00 | 50.00 | 60.00 | 70.00 | 80.00 | 90.00 | 100.00 |
| 0.33 | 0.52 | 0.66 | 0.77 | 0.85 | 0.90 | 0.93 | 0.96 | 0.97 | 0.98 |