Project

Introduction

Jean Baptiste, a botanist, seeks to develop a method for retrieving a black tulip from the Azores islands that can survive a specified duration at sea and be presented to his beloved. He requires your assistance in formulating the optimal procedure, and has provided you with his castle and gardens for the task.

The gardens are divided into north and south sections with distinct variations in sunlight and soil composition.
Each section contains two types of roses: Floribunda and Hybrid Tea.
Your task is to research these roses using 15 compounds that may extend the lifespan of flowers,
You have a timeline of 3 months.

DATA EXPLORATION

The pilot data requested was 10*4 due to 2 gardens and 2 types of roses (10 each). The pilot data will be used to estimate variance in the control group.

data <- read.csv("G6.pilot.data.csv")
totvar = var(data$tot.vase.days)

#Convert garden and species to factor variables
fac_col <- c("garden", "species")
data[, fac_col] <- lapply(data[fac_col], factor)

sum_data <- data %>% group_by(garden, species) %>% 
  summarise(items = length(flowerID),
            mean = mean(tot.vase.days), 
            max = max(tot.vase.days), 
            min = min(tot.vase.days),
            sd = sd(tot.vase.days),
            var = var(tot.vase.days)) %>% as.data.frame()
sum_data

garden	species	items	mean	max	min	sd	var
1	1	10	15.5	20	6	4.4	19.4
1	2	10	16.3	25	11	3.83	14.7
2	1	10	13.7	20	8	3.5	12.2
2	2	10	14.4	19	11	2.22	4.93

tt=ttheme_minimal(base_size = 9)

tab_general <- data %>% 
  summarise(items = length(flowerID),
            mean = mean(tot.vase.days), 
            max = max(tot.vase.days), 
            min = min(tot.vase.days),
            sd = round(sd(tot.vase.days),3),
            var = round(var(tot.vase.days),3)) %>% as.data.frame() %>% gridExtra::tableGrob(, theme=tt) 



tab_garden <- data %>% group_by(garden) %>% 
  summarise(items = length(flowerID),
            mean = mean(tot.vase.days), 
            max = max(tot.vase.days), 
            min = min(tot.vase.days),
            sd = round(sd(tot.vase.days),3)) %>% as.data.frame() %>% gridExtra::tableGrob(, theme=tt) 

tab_species <- data %>% group_by(species) %>% 
  summarise(items = length(flowerID),
            mean = mean(tot.vase.days), 
            max = max(tot.vase.days), 
            min = min(tot.vase.days),
            sd=round(sd(tot.vase.days),3)) %>% as.data.frame() %>% gridExtra::tableGrob(, theme=tt) 


attach(data)
plot_gen <- ggplot(data, aes(x = tot.vase.days, fill = "violetred"))+
  geom_histogram(color="#e9ecef", alpha=0.7, position = 'identity', binwidth = 1) +
  scale_x_continuous(breaks=seq(6,25,1)) +
  theme(legend.position = "none") +
  xlab("Shelf life") + 
  ggtitle("Shelf life of roses")


plot_gard <- ggplot(data, aes(x = tot.vase.days, fill = garden))+
  geom_histogram(color="#e9ecef", alpha=0.7, position = 'identity', binwidth = 1) +
  scale_x_continuous(breaks=seq(6,25,1)) +
  theme(legend.position = c(0.9,0.9)) +
  xlab("Shelf life") + 
  ggtitle("Shelf life per garden")+
  facet_grid(rows = vars(garden))+
  scale_fill_manual(name='garden',values=c("violetred","cyan3"),labels=c("North","South"))

plot_specie <- ggplot(data, aes(x = tot.vase.days, fill = species))+
  geom_histogram(color="#e9ecef", alpha=0.7, position = 'identity', binwidth = 1) +
  scale_x_continuous(breaks=seq(6,25,1)) +
  theme(legend.position = c(0.9,0.9)) +
  xlab("Shelf life") + 
  ggtitle("Shelf life per species")+
  facet_grid(rows = vars(species))+
  scale_fill_manual(name='species',values=c("lightskyblue","lightsalmon"),labels=c("Fl","HT"))

## Box plots
bx_garden <- ggplot(data, aes(x = factor(garden), y = tot.vase.days)) + 
  geom_boxplot(fill = "darkgoldenrod1") + 
  labs(x = "Garden", y = "Shelf life (days)", title = "Distribution of by garden")

bx_species <- ggplot(data, aes(x = factor(species), y = tot.vase.days)) + 
  geom_boxplot(fill = "peachpuff") + 
  labs(x = "Species", y = "Shelf life (days)", title = "Distribution by species")

gridExtra::grid.arrange(plot_gen, tab_general,  nrow = 1)

gridExtra::grid.arrange(arrangeGrob(plot_specie, plot_gard, 
                        tab_species, tab_garden, 
                        nrow = 2))

gridExtra::grid.arrange(grob =
                        bx_species, bx_garden, nrow =1, ncol=2)

VARIABLES INFLUENCE

Performing a two way ANOVA The purpose of performing a two-way ANOVA on a dataset is to investigate the simultaneous effects of two categorical independent variables (factors) on a continuous dependent variable. In the given dataset of flower measurements, the two independent variables are garden and species, and the dependent variable is tot.vase.days, which represents the shelf life of the flowers.

By conducting a two-way ANOVA on this dataset, we can determine whether there is a significant effect of garden, species, or their interaction on the shelf life of the flowers. This can provide valuable insights into the factors that affect the longevity of flowers

anova_results <- aov(tot.vase.days ~ species + garden + species:garden, data = data)

# Finally, check the ANOVA table for the p-value
summary(anova_results)

               Df Sum Sq Mean Sq F value Pr(>F)
species         1    5.6    5.63   0.439  0.512
garden          1   34.2   34.22   2.672  0.111
species:garden  1    0.0    0.03   0.002  0.965
Residuals      36  461.1   12.81

Garden influences the shelf life? To determine if the garden influences the shelf life of the flower, an analysis of variance (ANOVA) is performed.

anova_results <- aov(tot.vase.days ~ garden, data = data)

# Finally, check the ANOVA table for the p-value
summary(anova_results)

            Df Sum Sq Mean Sq F value Pr(>F)
garden       1   34.2   34.22   2.786  0.103
Residuals   38  466.7   12.28

The p-value of 0.103 is greater than the significance level of 0.05, so we fail to reject the null hypothesis that the means of the shelf life of flowers are the same across different gardens. Therefore, we conclude that there is no significant effect of garden on the shelf life of flowers.

Species influences the shelf life? To determine if the species influences the shelf life of the flower, an analysis of variance (ANOVA) is performed.

anova_results <- aov(tot.vase.days ~ species, data = data)

# Finally, check the ANOVA table for the p-value
summary(anova_results)

            Df Sum Sq Mean Sq F value Pr(>F)
species      1    5.6   5.625   0.432  0.515
Residuals   38  495.3  13.036

The p-value of 0.515 is greater than the significance level of 0.05, so we fail to reject the null hypothesis that the means of the shelf life of flowers are the same across different species. Therefore, we conclude that there is no significant effect of species on the shelf life of flowers.

Conclusions

There is no influence on species or garden. How that influences or affects out model??

MODEL

The null hypothesis of the goodness-of-fit test for a Poisson distribution is that the observed data follows a Poisson distribution. The alternative hypothesis is that the observed data does not follow a Poisson distribution. Remember that If the p-value is less than or equal to the specified significance level α (p-value < alpha), the null hypothesis is rejected; otherwise, the null hypothesis is not rejected.

In this case, we do not have enough evidence to reject the null hypothesis that the data follows a poisson distribution.

gf <- goodfit(data$tot.vase.days,  type = "poisson")
summary(gf)


     Goodness-of-fit test for poisson distribution

                      X^2 df   P(> X^2)
Likelihood Ratio 19.47772 12 0.07763355

plot(gf, type = "standing", scale = "raw")

Fitting the model we get the same conclusions as before. i.e that the species and gardens do not affect the outcome.

model <- glm(formula = tot.vase.days ~ garden + species + rater, family = "poisson", data = data)
summary(model)


Call:
glm(formula = tot.vase.days ~ garden + species + rater, family = "poisson", 
    data = data)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.75996  -0.46957  -0.07294   0.60963   1.99811  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  2.7404701  0.1268337  21.607   <2e-16 ***
garden2     -0.1237099  0.0819240  -1.510    0.131    
species2     0.0500623  0.0820294   0.610    0.542    
rater        0.0002348  0.0508516   0.005    0.996    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 34.568  on 39  degrees of freedom
Residual deviance: 31.906  on 36  degrees of freedom
AIC: 220.91

Number of Fisher Scoring iterations: 4

The coefficient for garden2 is not significantly different from zero with an estimated value of -0.12 and a p-value of 0.131, indicating that there is no significant difference in tot.vase.days between gardens 1 and 2 after controlling for the other predictor variables.
The coefficient for species2 is also not significantly different from zero with an estimated value of 0.050 and a p-value of 0.542, indicating that there is no significant difference in tot.vase.days between species 1 and 2 after controlling for the other predictor variables.
The coefficient for rater is also not significantly different from zero with an estimated value of 0.0002 and a very large p-value of 0.996, indicating that there is no significant association between rater and tot.vase.days after controlling for the other predictor variables.

The variables with high p-values indicate that they are not significantly related to the response variable (tot.vase.days) in the model. However, it is important to consider whether these variables are theoretically or practically important to include in the model.

Should we remove some variables? i would remove raters .

Fixed effects model

The fixed effects represent the effects of variables that are assumed to have a constant effect on the outcome variable, while the random effects represent the effects of variables that have a varying effect on the outcome variable across groups or individuals.

model <- glmer(tot.vase.days ~ garden + (1|species) + (1|rater), data=data, family="poisson")
summary(model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: poisson  ( log )
Formula: tot.vase.days ~ garden + (1 | species) + (1 | rater)
   Data: data

     AIC      BIC   logLik deviance df.resid 
   221.3    228.0   -106.6    213.3       36 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.48277 -0.54691  0.02508  0.52184  2.28214 

Random effects:
 Groups  Name        Variance Std.Dev.
 rater   (Intercept) 0        0       
 species (Intercept) 0        0       
Number of obs: 40, groups:  rater, 3; species, 2

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  2.76632    0.05608  49.331   <2e-16 ***
garden2     -0.12370    0.08187  -1.511    0.131    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
        (Intr)
garden2 -0.685
optimizer (Nelder_Mead) convergence code: 0 (OK)
boundary (singular) fit: see help('isSingular')

We can compare the AIC or BIC values of the two models fitted. The model with the lower AIC or BIC value is the better fit. In this case would be the glm without the fixed effects.

Still not sure why we did the model fitting .

SAMPLE SIZE

Using basic ‘power.t.test’.

# Effect size
d = 1 #Iwould suggest changing the effectsize
# Significance level
alpha = 0.05

# Power level
power = 0.80

# Variance
sigma2 = totvar #12.8

# Calculate sample size
p_test <- power.t.test(delta = d, sd = sqrt(sigma2), sig.level = alpha, power = power, type = "two.sample", alternative = "one.sided")
p_test


     Two-sample t test power calculation 

              n = 159.517
          delta = 1
             sd = 3.584064
      sig.level = 0.05
          power = 0.8
    alternative = one.sided

NOTE: n is number in *each* group

Rounding up to the nearest whole number, we need a sample size of 159.5169505 roses in each group (control and each treatment) for this experiment.

Simulations

# 2. Select lambda and effect size
lambda.control = 15   
lambda.treated = 16 
alpha = (0.05)  # significance level

# 3. set number of experiments and test
numberSimulation <- 5000
pval <- numeric(numberSimulation) 

Nvec <- seq(100, 400, by = 50)
sampleSizePower <- numeric(length(Nvec))
names(sampleSizePower) <- Nvec   

set.seed(1234) 
for (j in 1:length(Nvec)){
  for (i in 1:numberSimulation){
    # we simulate from Poisson distribution
    controlGroup <- rpois(Nvec[j], lambda = lambda.control)
    treatedGroup <- rpois(Nvec[j], lambda = lambda.treated)
    simData <- data.frame(response = c(controlGroup, treatedGroup), treatment = rep(c(0, 1), each = Nvec[j]))
    # we use a GLM model for Poisson regression to test effect of treatment
    pval[i] <- summary(glm(response ~ treatment, data = simData, family=poisson()))$coeff["treatment", "Pr(>|z|)"]
  }
  
  # 4. Estimate power
  sampleSizePower[j] <-sum(pval/2 < alpha)/numberSimulation
}
sampleSizePower

   100    150    200    250    300    350    400 
0.5650 0.7120 0.8142 0.8844 0.9310 0.9594 0.9778

plot.data = data.frame(x=Nvec, y=sampleSizePower)
ggplot(data=plot.data, aes(x=x, y=y)) + 
  theme_bw(base_size=15) + 
  geom_point() + 
  geom_line() +
  labs(x="Sample size, N", y="Power") + 
  geom_hline(yintercept=0.85, linetype=1, col="red", size=1) +
  scale_x_continuous(breaks = Nvec)