Jean Baptiste, a botanist, seeks to develop a method for retrieving a black tulip from the Azores islands that can survive a specified duration at sea and be presented to his beloved. He requires your assistance in formulating the optimal procedure, and has provided you with his castle and gardens for the task.
The pilot data requested was 10*4 due to 2 gardens and 2 types of roses (10 each). The pilot data will be used to estimate variance in the control group.
data <- read.csv("G6.pilot.data.csv")
totvar = var(data$tot.vase.days)
#Convert garden and species to factor variables
fac_col <- c("garden", "species")
data[, fac_col] <- lapply(data[fac_col], factor)
sum_data <- data %>% group_by(garden, species) %>%
summarise(items = length(flowerID),
mean = mean(tot.vase.days),
max = max(tot.vase.days),
min = min(tot.vase.days),
sd = sd(tot.vase.days),
var = var(tot.vase.days)) %>% as.data.frame()
sum_data
| garden | species | items | mean | max | min | sd | var |
|---|---|---|---|---|---|---|---|
| 1 | 1 | 10 | 15.5 | 20 | 6 | 4.4 | 19.4 |
| 1 | 2 | 10 | 16.3 | 25 | 11 | 3.83 | 14.7 |
| 2 | 1 | 10 | 13.7 | 20 | 8 | 3.5 | 12.2 |
| 2 | 2 | 10 | 14.4 | 19 | 11 | 2.22 | 4.93 |
tt=ttheme_minimal(base_size = 9)
tab_general <- data %>%
summarise(items = length(flowerID),
mean = mean(tot.vase.days),
max = max(tot.vase.days),
min = min(tot.vase.days),
sd = round(sd(tot.vase.days),3),
var = round(var(tot.vase.days),3)) %>% as.data.frame() %>% gridExtra::tableGrob(, theme=tt)
tab_garden <- data %>% group_by(garden) %>%
summarise(items = length(flowerID),
mean = mean(tot.vase.days),
max = max(tot.vase.days),
min = min(tot.vase.days),
sd = round(sd(tot.vase.days),3)) %>% as.data.frame() %>% gridExtra::tableGrob(, theme=tt)
tab_species <- data %>% group_by(species) %>%
summarise(items = length(flowerID),
mean = mean(tot.vase.days),
max = max(tot.vase.days),
min = min(tot.vase.days),
sd=round(sd(tot.vase.days),3)) %>% as.data.frame() %>% gridExtra::tableGrob(, theme=tt)
attach(data)
plot_gen <- ggplot(data, aes(x = tot.vase.days, fill = "violetred"))+
geom_histogram(color="#e9ecef", alpha=0.7, position = 'identity', binwidth = 1) +
scale_x_continuous(breaks=seq(6,25,1)) +
theme(legend.position = "none") +
xlab("Shelf life") +
ggtitle("Shelf life of roses")
plot_gard <- ggplot(data, aes(x = tot.vase.days, fill = garden))+
geom_histogram(color="#e9ecef", alpha=0.7, position = 'identity', binwidth = 1) +
scale_x_continuous(breaks=seq(6,25,1)) +
theme(legend.position = c(0.9,0.9)) +
xlab("Shelf life") +
ggtitle("Shelf life per garden")+
facet_grid(rows = vars(garden))+
scale_fill_manual(name='garden',values=c("violetred","cyan3"),labels=c("North","South"))
plot_specie <- ggplot(data, aes(x = tot.vase.days, fill = species))+
geom_histogram(color="#e9ecef", alpha=0.7, position = 'identity', binwidth = 1) +
scale_x_continuous(breaks=seq(6,25,1)) +
theme(legend.position = c(0.9,0.9)) +
xlab("Shelf life") +
ggtitle("Shelf life per species")+
facet_grid(rows = vars(species))+
scale_fill_manual(name='species',values=c("lightskyblue","lightsalmon"),labels=c("Fl","HT"))
## Box plots
bx_garden <- ggplot(data, aes(x = factor(garden), y = tot.vase.days)) +
geom_boxplot(fill = "darkgoldenrod1") +
labs(x = "Garden", y = "Shelf life (days)", title = "Distribution of by garden")
bx_species <- ggplot(data, aes(x = factor(species), y = tot.vase.days)) +
geom_boxplot(fill = "peachpuff") +
labs(x = "Species", y = "Shelf life (days)", title = "Distribution by species")
gridExtra::grid.arrange(plot_gen, tab_general, nrow = 1)
gridExtra::grid.arrange(arrangeGrob(plot_specie, plot_gard,
tab_species, tab_garden,
nrow = 2))
gridExtra::grid.arrange(grob =
bx_species, bx_garden, nrow =1, ncol=2)
Performing a two way ANOVA The purpose of performing a two-way ANOVA on a dataset is to investigate the simultaneous effects of two categorical independent variables (factors) on a continuous dependent variable. In the given dataset of flower measurements, the two independent variables are garden and species, and the dependent variable is tot.vase.days, which represents the shelf life of the flowers.
By conducting a two-way ANOVA on this dataset, we can determine whether there is a significant effect of garden, species, or their interaction on the shelf life of the flowers. This can provide valuable insights into the factors that affect the longevity of flowers
anova_results <- aov(tot.vase.days ~ species + garden + species:garden, data = data)
# Finally, check the ANOVA table for the p-value
summary(anova_results)
Df Sum Sq Mean Sq F value Pr(>F)
species 1 5.6 5.63 0.439 0.512
garden 1 34.2 34.22 2.672 0.111
species:garden 1 0.0 0.03 0.002 0.965
Residuals 36 461.1 12.81
Garden influences the shelf life? To determine if the garden influences the shelf life of the flower, an analysis of variance (ANOVA) is performed.
anova_results <- aov(tot.vase.days ~ garden, data = data)
# Finally, check the ANOVA table for the p-value
summary(anova_results)
Df Sum Sq Mean Sq F value Pr(>F)
garden 1 34.2 34.22 2.786 0.103
Residuals 38 466.7 12.28
The p-value of 0.103 is greater than the significance level of 0.05, so we fail to reject the null hypothesis that the means of the shelf life of flowers are the same across different gardens. Therefore, we conclude that there is no significant effect of garden on the shelf life of flowers.
Species influences the shelf life? To determine if the species influences the shelf life of the flower, an analysis of variance (ANOVA) is performed.
anova_results <- aov(tot.vase.days ~ species, data = data)
# Finally, check the ANOVA table for the p-value
summary(anova_results)
Df Sum Sq Mean Sq F value Pr(>F)
species 1 5.6 5.625 0.432 0.515
Residuals 38 495.3 13.036
The p-value of 0.515 is greater than the significance level of 0.05, so we fail to reject the null hypothesis that the means of the shelf life of flowers are the same across different species. Therefore, we conclude that there is no significant effect of species on the shelf life of flowers.
There is no influence on species or garden. How that influences or affects out model??
The null hypothesis of the goodness-of-fit test for a Poisson distribution is that the observed data follows a Poisson distribution. The alternative hypothesis is that the observed data does not follow a Poisson distribution. Remember that If the p-value is less than or equal to the specified significance level α (p-value < alpha), the null hypothesis is rejected; otherwise, the null hypothesis is not rejected.
In this case, we do not have enough evidence to reject the null hypothesis that the data follows a poisson distribution.
gf <- goodfit(data$tot.vase.days, type = "poisson")
summary(gf)
Goodness-of-fit test for poisson distribution
X^2 df P(> X^2)
Likelihood Ratio 19.47772 12 0.07763355
plot(gf, type = "standing", scale = "raw")
Fitting the model we get the same conclusions as before. i.e that the species and gardens do not affect the outcome.
model <- glm(formula = tot.vase.days ~ garden + species + rater, family = "poisson", data = data)
summary(model)
Call:
glm(formula = tot.vase.days ~ garden + species + rater, family = "poisson",
data = data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.75996 -0.46957 -0.07294 0.60963 1.99811
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.7404701 0.1268337 21.607 <2e-16 ***
garden2 -0.1237099 0.0819240 -1.510 0.131
species2 0.0500623 0.0820294 0.610 0.542
rater 0.0002348 0.0508516 0.005 0.996
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 34.568 on 39 degrees of freedom
Residual deviance: 31.906 on 36 degrees of freedom
AIC: 220.91
Number of Fisher Scoring iterations: 4
The variables with high p-values indicate that they are not significantly related to the response variable (tot.vase.days) in the model. However, it is important to consider whether these variables are theoretically or practically important to include in the model.
Should we remove some variables? i would remove raters .
The fixed effects represent the effects of variables that are assumed to have a constant effect on the outcome variable, while the random effects represent the effects of variables that have a varying effect on the outcome variable across groups or individuals.
model <- glmer(tot.vase.days ~ garden + (1|species) + (1|rater), data=data, family="poisson")
summary(model)
Generalized linear mixed model fit by maximum likelihood (Laplace
Approximation) [glmerMod]
Family: poisson ( log )
Formula: tot.vase.days ~ garden + (1 | species) + (1 | rater)
Data: data
AIC BIC logLik deviance df.resid
221.3 228.0 -106.6 213.3 36
Scaled residuals:
Min 1Q Median 3Q Max
-2.48277 -0.54691 0.02508 0.52184 2.28214
Random effects:
Groups Name Variance Std.Dev.
rater (Intercept) 0 0
species (Intercept) 0 0
Number of obs: 40, groups: rater, 3; species, 2
Fixed effects:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.76632 0.05608 49.331 <2e-16 ***
garden2 -0.12370 0.08187 -1.511 0.131
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Correlation of Fixed Effects:
(Intr)
garden2 -0.685
optimizer (Nelder_Mead) convergence code: 0 (OK)
boundary (singular) fit: see help('isSingular')
We can compare the AIC or BIC values of the two models fitted. The model with the lower AIC or BIC value is the better fit. In this case would be the glm without the fixed effects.
Still not sure why we did the model fitting .
Using basic ‘power.t.test’.
# Effect size
d = 1 #Iwould suggest changing the effectsize
# Significance level
alpha = 0.05
# Power level
power = 0.80
# Variance
sigma2 = totvar #12.8
# Calculate sample size
p_test <- power.t.test(delta = d, sd = sqrt(sigma2), sig.level = alpha, power = power, type = "two.sample", alternative = "one.sided")
p_test
Two-sample t test power calculation
n = 159.517
delta = 1
sd = 3.584064
sig.level = 0.05
power = 0.8
alternative = one.sided
NOTE: n is number in *each* group
Rounding up to the nearest whole number, we need a sample size of 159.5169505 roses in each group (control and each treatment) for this experiment.
# 2. Select lambda and effect size
lambda.control = 15
lambda.treated = 16
alpha = (0.05) # significance level
# 3. set number of experiments and test
numberSimulation <- 5000
pval <- numeric(numberSimulation)
Nvec <- seq(100, 400, by = 50)
sampleSizePower <- numeric(length(Nvec))
names(sampleSizePower) <- Nvec
set.seed(1234)
for (j in 1:length(Nvec)){
for (i in 1:numberSimulation){
# we simulate from Poisson distribution
controlGroup <- rpois(Nvec[j], lambda = lambda.control)
treatedGroup <- rpois(Nvec[j], lambda = lambda.treated)
simData <- data.frame(response = c(controlGroup, treatedGroup), treatment = rep(c(0, 1), each = Nvec[j]))
# we use a GLM model for Poisson regression to test effect of treatment
pval[i] <- summary(glm(response ~ treatment, data = simData, family=poisson()))$coeff["treatment", "Pr(>|z|)"]
}
# 4. Estimate power
sampleSizePower[j] <-sum(pval/2 < alpha)/numberSimulation
}
sampleSizePower
100 150 200 250 300 350 400
0.5650 0.7120 0.8142 0.8844 0.9310 0.9594 0.9778
plot.data = data.frame(x=Nvec, y=sampleSizePower)
ggplot(data=plot.data, aes(x=x, y=y)) +
theme_bw(base_size=15) +
geom_point() +
geom_line() +
labs(x="Sample size, N", y="Power") +
geom_hline(yintercept=0.85, linetype=1, col="red", size=1) +
scale_x_continuous(breaks = Nvec)