Project

Introduction

Jean Baptiste, a botanist, seeks to develop a method for retrieving a black tulip from the Azores islands that can survive a specified duration at sea and be presented to his beloved. He requires your assistance in formulating the optimal procedure, and has provided you with his castle and gardens for the task.

The gardens are divided into north and south sections with distinct variations in sunlight and soil composition.
- Each garden has subplots
Each section contains two types of roses: Floribunda and Hybrid Tea.
Your task is to research these roses using 15 compounds that may extend the lifespan of flowers,
You have a timeline of 3 months.

DATA EXPLORATION

PURPOSE Determine the parameters to fit our model

OUTCOME The variance was found to be 13.

EXPLANATION

The pilot data requested was 10*4 due to 2 gardens and 2 types of roses (10 each). The pilot data will be used to estimate variance in the control group.

data <- read.csv("G6.pilot.data.csv")
totvar = var(data$tot.vase.days)

#Convert garden and species to factor variables
fac_col <- c("garden", "species")
data[, fac_col] <- lapply(data[fac_col], factor)

sum_data <- data %>% group_by(garden, species) %>% 
  summarise(items = length(flowerID),
            mean = mean(tot.vase.days), 
            max = max(tot.vase.days), 
            min = min(tot.vase.days),
            sd = sd(tot.vase.days),
            var = var(tot.vase.days)) %>% as.data.frame()
sum_data

garden	species	items	mean	max	min	sd	var
1	1	10	15.5	20	6	4.4	19.4
1	2	10	16.3	25	11	3.83	14.7
2	1	10	13.7	20	8	3.5	12.2
2	2	10	14.4	19	11	2.22	4.93

tt=ttheme_minimal(base_size = 9)

tab_general <- data %>% 
  summarise(items = length(flowerID),
            mean = mean(tot.vase.days), 
            max = max(tot.vase.days), 
            min = min(tot.vase.days),
            sd = round(sd(tot.vase.days),3),
            var = round(var(tot.vase.days),3)) %>% as.data.frame() %>% gridExtra::tableGrob(, theme=tt) 



tab_garden <- data %>% group_by(garden) %>% 
  summarise(items = length(flowerID),
            mean = mean(tot.vase.days), 
            max = max(tot.vase.days), 
            min = min(tot.vase.days),
            sd = round(sd(tot.vase.days),3)) %>% as.data.frame() %>% gridExtra::tableGrob(, theme=tt) 

tab_species <- data %>% group_by(species) %>% 
  summarise(items = length(flowerID),
            mean = mean(tot.vase.days), 
            max = max(tot.vase.days), 
            min = min(tot.vase.days),
            sd=round(sd(tot.vase.days),3)) %>% as.data.frame() %>% gridExtra::tableGrob(, theme=tt) 


attach(data)
plot_gen <- ggplot(data, aes(x = tot.vase.days, fill = "violetred"))+
  geom_histogram(color="#e9ecef", alpha=0.7, position = 'identity', binwidth = 1) +
  scale_x_continuous(breaks=seq(6,25,1)) +
  theme(legend.position = "none") +
  xlab("Shelf life") + 
  ggtitle("Shelf life of roses")


plot_gard <- ggplot(data, aes(x = tot.vase.days, fill = garden))+
  geom_histogram(color="#e9ecef", alpha=0.7, position = 'identity', binwidth = 1) +
  scale_x_continuous(breaks=seq(6,25,1)) +
  theme(legend.position = c(0.9,0.9)) +
  xlab("Shelf life") + 
  ggtitle("Shelf life per garden")+
  facet_grid(rows = vars(garden))+
  scale_fill_manual(name='garden',values=c("violetred","cyan3"),labels=c("North","South"))

plot_specie <- ggplot(data, aes(x = tot.vase.days, fill = species))+
  geom_histogram(color="#e9ecef", alpha=0.7, position = 'identity', binwidth = 1) +
  scale_x_continuous(breaks=seq(6,25,1)) +
  theme(legend.position = c(0.9,0.9)) +
  xlab("Shelf life") + 
  ggtitle("Shelf life per species")+
  facet_grid(rows = vars(species))+
  scale_fill_manual(name='species',values=c("lightskyblue","lightsalmon"),labels=c("Fl","HT"))

## Box plots
bx_garden <- ggplot(data, aes(x = factor(garden), y = tot.vase.days)) + 
  geom_boxplot(fill = "darkgoldenrod1") + 
  labs(x = "Garden", y = "Shelf life (days)", title = "Distribution of by garden")

bx_species <- ggplot(data, aes(x = factor(species), y = tot.vase.days)) + 
  geom_boxplot(fill = "peachpuff") + 
  labs(x = "Species", y = "Shelf life (days)", title = "Distribution by species")

gridExtra::grid.arrange(plot_gen, tab_general,  nrow = 1)

gridExtra::grid.arrange(arrangeGrob(plot_specie, plot_gard, 
                        tab_species, tab_garden, 
                        nrow = 2))

gridExtra::grid.arrange(grob =
                        bx_species, bx_garden, nrow =1, ncol=2)

We are going to use the variance 12.8455128 to calculate our sample size.

MODEL

PURPOSE See if the data follows a Poisson distribution.

OUTCOME The data follows a Poisson distribution.

EXPLANATION The null hypothesis of the goodness-of-fit test for a Poisson distribution is that the observed data follows a Poisson distribution. The alternative hypothesis is that the observed data does not follow a Poisson distribution.

In this case, we do not have enough evidence to reject the null hypothesis that the data follows a poisson distribution.

gf <- goodfit(data$tot.vase.days,  type = "poisson")
summary(gf)


     Goodness-of-fit test for poisson distribution

                      X^2 df   P(> X^2)
Likelihood Ratio 19.47772 12 0.07763355

plot(gf, type = "standing", scale = "raw")

Fitting the model we can conclude that the species and gardens do not affect the outcome.

model <- glm(formula = tot.vase.days ~ garden + species + rater, family = "poisson", data = data)
summary(model)


Call:
glm(formula = tot.vase.days ~ garden + species + rater, family = "poisson", 
    data = data)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  2.7404701  0.1268337  21.607   <2e-16 ***
garden2     -0.1237099  0.0819240  -1.510    0.131    
species2     0.0500623  0.0820294   0.610    0.542    
rater        0.0002348  0.0508516   0.005    0.996    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 34.568  on 39  degrees of freedom
Residual deviance: 31.906  on 36  degrees of freedom
AIC: 220.91

Number of Fisher Scoring iterations: 4

The coefficient for garden2 is not significantly different from zero with an estimated value of -0.12 and a p-value of 0.131, indicating that there is no significant difference in tot.vase.days between gardens 1 and 2 after controlling for the other predictor variables.
The coefficient for species2 is also not significantly different from zero with an estimated value of 0.050 and a p-value of 0.542, indicating that there is no significant difference in tot.vase.days between species 1 and 2 after controlling for the other predictor variables.
The coefficient for rater is also not significantly different from zero with an estimated value of 0.0002 and a very large p-value of 0.996, indicating that there is no significant association between rater and tot.vase.days after controlling for the other predictor variables.

The variables with high p-values indicate that they are not significantly related to the response variable (tot.vase.days) in the model. However, it is important to consider whether these variables are theoretically or practically important to include in the model.

Fixed effects model

The fixed effects represent the effects of variables that are assumed to have a constant effect on the outcome variable, while the random effects represent the effects of variables that have a varying effect on the outcome variable across groups or individuals.

model <- glmer(tot.vase.days ~ garden + species + (1|rater), data=data, family="poisson")
summary(model)

Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: poisson  ( log )
Formula: tot.vase.days ~ garden + species + (1 | rater)
   Data: data

     AIC      BIC   logLik deviance df.resid 
   220.9    227.7   -106.5    212.9       36 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.41333 -0.45883 -0.07386  0.62507  2.15547 

Random effects:
 Groups Name        Variance Std.Dev.
 rater  (Intercept) 0        0       
Number of obs: 40, groups:  rater, 3

Fixed effects:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  2.74096    0.07000  39.157   <2e-16 ***
garden2     -0.12370    0.08187  -1.511    0.131    
species2     0.05009    0.08174   0.613    0.540    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
         (Intr) gardn2
garden2  -0.549       
species2 -0.599  0.000
optimizer (Nelder_Mead) convergence code: 0 (OK)
boundary (singular) fit: see help('isSingular')

SAMPLE SIZE CALCULATION

Using power_Poisson to compute the sample size for a test of two sample means with Poisson distribution.

# Effect size
d = 2
# Significance level
alpha = 0.05

# Power level
power = 0.80

#Variance
lb1 = 13 

# Calculate sample size
p_test <- power_Poisson(n1 = NULL, n2 = NULL, 
                        power = 0.8, 
                        sig.level = 0.05, 
                        lambda1 = lb1, lambda2 = lb1 +d, 
                        equal.sample = TRUE, 
                        alternative = "one.sided")
p_test


     Two-sample Poisson Ratio Tests (Equal Sizes) 

              N = 44.30537
        lambda1 = 13
        lambda2 = 15
      sig.level = 0.05
          power = 0.8
    alternative = one.sided

NOTE: N is number in *each* group

Rounding up to the nearest whole number, we need a sample size of 44.3053746 roses in each group (control and each treatment) for this experiment. This is the value

Simulations

In this case, lambda is the mean number of events per unit of observation (in this case, per rose) and is used to define the Poisson distribution from which the number of events is simulated. Nvec is the range of sample sizes to consider.

CODE FOR SIMULATIONS

##FUNCTION
Poisson_sample <- function(n_grid, 
                           lambda0 = 13, 
                           lambda1 = 15, 
                           ngroups = 14, 
                           alpha =0.05, 
                           test = "two_sided", 
                           n_sims = 10000, 
                           seed_nr = 1234, 
                           method){
    
  #Where the power calculation is going to be stored.
  power_vec <- matrix(nrow = 1, ncol = length(n_grid))
  
  for (j in 1:length(n_grid)){
    # 1. Choose sample size per group
    N <- n_grid[j]
    alpha = alpha
    
    # 2. Select parameters
    lambda.control = lambda0
    for (group_id in 1:ngroups){
      lambda.group = paste0("lambda.treated",group_id)
      assign(lambda.group, lambda1)}
  
    # 3. Simulate experiments and test
    numberSimulation <- n_sims
    pval <- matrix(0, nrow = numberSimulation, ncol = ngroups)
    zval <- matrix(0, nrow = numberSimulation, ncol = ngroups)
    
    set.seed(seed_nr)
    
    for (i in 1:numberSimulation){
  
      # We simulate from Poisson distribution
      controlGroup <- rpois(N, lambda = lambda.control)
      treatedGroup <- vector()
      
      for (group_id in 1:ngroups){
        lambda.group = paste0("lambda.treated",group_id)
        treatedGroup = c(treatedGroup, rpois(N, lambda = get(lambda.group)))}

      simData <- data.frame(response = c(controlGroup, treatedGroup),
                            treatment = rep(c(0, 1:ngroups), each = N))
      # We use a GLM model for Poisson regression to test effect of treatment
      # (Wald test)
      glm_fit <- summary(glm(response ~ as.factor(treatment), data = simData, family=poisson()))
      pval[i,] <- glm_fit$coeff[-1, "Pr(>|z|)"]
      zval[i,] <- glm_fit$coeff[-1, "z value"]
      
      #The sign of the z-value indicates the direction of the effect of the treatment compared to the control group. A positive z-value indicates that the treatment group has a higher outcome than the control group, while a negative z-value indicates that the treatment group has a lower outcome than the control group.
      
      # If test is set to "greater" and the z-value is greater than 0, it means that we are testing if the treatment has a greater effect than the control group, and the p-value needs to be divided by 2 because it is a one-tailed test
      
      for (k in 1:ngroups){
        if (test == "greater" & zval[i,k] > 0){pval[i,k] <- pval[i,k]/2}
        if (test == "greater" & zval[i,k] < 0){pval[i,k] <- 1 - (pval[i,k]/2)}
        if (test == "less" & zval[i,k] < 0){pval[i,k] <- pval[i,k]/2 }
        if (test == "less" & zval[i,k] > 0){pval[i,k] <- 1 - (pval[i,k]/2)}
      }
      
      # Multiplicity adjustment(s)
      if (method == "Bonferroni"){pval[i,] = p.adjust(pval[i,], method = "bonferroni")}
      if (method == "Holm"){ pval[i,] = p.adjust(pval[i,], method = "holm")}
      if (method == "Benjamini-Hochberg"){pval[i,] = p.adjust(pval[i,], method = "BH")}
    }
    # 4. Estimate power
    power_vec[j] = mean(apply(pval, 2, FUN = function(x){sum(x < alpha)}))/numberSimulation
  }
  
  df_power<- data.frame(SS = n_grid, Power = t(power_vec))
  return(df_power)
}

ss_at_80 <- function(sampleSize, target_power = 0.8){
  # define the power level to interpolate
  target_power <- target_power
  
  # interpolate the sample size at the target power level
  ss_at_target_power <- approx(sampleSize$Power, sampleSize$SS, xout = target_power)$y
  
  return(round(ss_at_target_power, 2))
}

powerplot <- function(sampleSize, extra = "", power = 0.8){

  x.extra = ss_at_80(sampleSize)
  y.extra = power
  
  size_line = 1
  label = paste("Sample size vs Power ", extra)
  sublabel = paste("Sample size ", x.extra)

  plotCurve <- ggplot(sampleSize, aes(x=SS,y=Power)) + 
    geom_line( color = "black", size = size_line) +
    geom_point(color = "blue")+
    geom_point(aes(x=x.extra, y = y.extra), color = "orange")+
    geom_hline(yintercept = 0.8)+
    ggtitle(label ) +
    labs(title = label,
         subtitle = sublabel)+
    ylab("Power") +
    xlab("Sample Size") +
    theme_ipsum()
  
  return(plotCurve)
}

RESULTS OF SIMULATIONS

Fixed parameters

n_sim <- 10000

#number of groups
ngroups <- 14

#Type of test
test = "greater"

#Parameters
ES = 2 #Effect Size
lambda0 = 13
lambda1 = lambda0 + ES
alpha = 0.05

n_grid <- seq(from = 10, to = 100, by = 10)

samplesizeBon <- Poisson_sample(n_grid,lambda0,lambda1,ngroups,alpha,test = "greater",n_sims = 10000, seed_nr = 1234, method = "bonferroni")

powerplot(samplesizeBon, extra = 'Method: Bonferroni')

samplesizeBon <-data.frame(format(round(t(samplesizeBon),2),nsmall = 2))
samplesizeBon

X1	X2	X3	X4	X5	X6	X7	X8	X9	X10
10.00	20.00	30.00	40.00	50.00	60.00	70.00	80.00	90.00	100.00
0.33	0.52	0.66	0.77	0.85	0.90	0.93	0.96	0.97	0.98

n_grid <- seq(from = 50, to = 100, by = 10)
samplesizeHolmBon <- Poisson_sample(n_grid,lambda0,lambda1,ngroups,alpha,test = "greater",n_sims = 10000, seed_nr = 1234, method = "Holm")

powerplot(samplesizeHolmBon, "Method: Holm - Bonferroni")

samplesizeHolmBon <-as.data.frame(format(round(t(samplesizeHolmBon),2),nsmall = 2))
samplesizeHolmBon

V1	V2	V3	V4	V5	V6
50.00	60.00	70.00	80.00	90.00	100.00
0.62	0.72	0.81	0.87	0.92	0.95

n_grid <- seq(from = 10, to = 100, by = 10)
samplesizeBH <- Poisson_sample(n_grid,lambda0,lambda1,ngroups,alpha,test = "greater",n_sims = 10000, seed_nr = 1234, method = "BH")

powerplot(samplesizeBH, "Method: BH")

samplesizeBH <-data.frame(format(round(t(samplesizeBH),2),nsmall = 2))
samplesizeBH

X1	X2	X3	X4	X5	X6	X7	X8	X9	X10
10.00	20.00	30.00	40.00	50.00	60.00	70.00	80.00	90.00	100.00
0.33	0.52	0.66	0.77	0.85	0.90	0.93	0.96	0.97	0.98