Load libraries.
library(gridExtra)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(ggfortify)
library(readxl) # package to read .xls files
# Packages for raincloud plots
library(ggdist) # for stat_halfeye()
library(ggbeeswarm) # for nicer jitter
# Packages for modeling
library(lme4)
library(car)
library(arm)
library(lmerTest)
library(ggeffects)
library(emmeans)
Load lifespan dataset and check the data.
# Read the "Newts" sheet
lifespan <- read.csv("/Users/joylynngoh/Files/Uni/Y4/ES3307/Assignment 3/Assignment 3 data/lifespan.csv")
# Quick check of data structure
head(lifespan)
## year Male.ID IR.treatment population origin date lifespan line
## 1 2015 117 20Gy C1 A30 28-Dec 11 1
## 2 2015 99 20Gy C1 A30 28-Dec 12 1
## 3 2015 104 Ctrl C1 A30 28-Dec 14 1
## 4 2015 109 Ctrl C1 A30 28-Dec 14 1
## 5 2015 108 Ctrl C1 A30 28-Dec 14 1
## 6 2015 100 Ctrl C1 A30 28-Dec 14 1
str(lifespan)
## 'data.frame': 1049 obs. of 8 variables:
## $ year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ Male.ID : chr "117" "99" "104" "109" ...
## $ IR.treatment: chr "20Gy" "20Gy" "Ctrl" "Ctrl" ...
## $ population : chr "C1" "C1" "C1" "C1" ...
## $ origin : chr "A30" "A30" "A30" "A30" ...
## $ date : chr "28-Dec" "28-Dec" "28-Dec" "28-Dec" ...
## $ lifespan : int 11 12 14 14 14 14 14 14 14 16 ...
## $ line : int 1 1 1 1 1 1 1 1 1 1 ...
Research question: how is longevity affected by preadaptation and temperature? Do these effects vary with year?
Factorise line, year, origin,
population.
lifespan <- lifespan %>%
mutate(across(c(year, origin, line, population), as.factor))
str(lifespan)
## 'data.frame': 1049 obs. of 8 variables:
## $ year : Factor w/ 2 levels "2015","2016": 1 1 1 1 1 1 1 1 1 1 ...
## $ Male.ID : chr "117" "99" "104" "109" ...
## $ IR.treatment: chr "20Gy" "20Gy" "Ctrl" "Ctrl" ...
## $ population : Factor w/ 6 levels "C1","C2","C3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ origin : Factor w/ 3 levels "A30","A36","P36": 1 1 1 1 1 1 1 1 1 1 ...
## $ date : chr "28-Dec" "28-Dec" "28-Dec" "28-Dec" ...
## $ lifespan : int 11 12 14 14 14 14 14 14 14 16 ...
## $ line : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
Crossed fixed factors are year × origin.
There is partial crossing and partial nesting for
population within origin (C1,
C2, C3 are crossed between A30
and A36 while T1, T3,
T4 are nested within P36).
# Raincloud plot
ggplot(lifespan, aes(x = origin, y = lifespan, fill = population)) +
# boxplot (median + IQR)
geom_boxplot(width = 0.55, outlier.shape = NA,
position = position_dodge(1)) +
# raw points (population shown via colour)
geom_point(position = position_jitterdodge(jitter.width = 0.15,
jitter.height = 0.25,
dodge.width = 0.9),
alpha = 0.45, size = 0.3) +
facet_wrap(~ year) +
labs(
x = "Origin (treatment)",
y = "Lifespan (days)",
color = "Population",
title = "Fig 1A: Beetle lifespan by origin, population, and year"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "right")
From Fig. 1A, the higher temperature lines (A36) have lower lifespan than the lower temperature lines (A30) and the pre-adapted lines seem to be closer in lifespan to the lower temperature lines. There does not seem to be an obvious difference between 2015 and 2016 population lifespans, but then again, the variance across all the populations and years is quite broad, so none of the lines or years seem significantly different from each other.
lifespan_MeanSE <- lifespan %>%
group_by(origin, year) %>%
summarise(
mean = mean(lifespan, na.rm = TRUE),
se = sd(lifespan, na.rm = TRUE) / sqrt(n())
)
# interaction plot
ggplot(lifespan_MeanSE, aes(x = year, y = mean, group = origin,
color = factor(origin))) +
geom_line(linewidth = 0.5) +
geom_point(size = 1) +
geom_errorbar(aes(ymin = mean - se, ymax = mean + se),
width = 0.05, linewidth = 0.5) +
labs(x = "Year", y = "Mean lifespan (days)",
color = "Origin",
title = "Fig 1B: Interaction plot of origin × year") +
theme_minimal()
Based on Fig. 1B, there seems to be strong evidence that the low
temperature (A30) lines and preadaptation line
(P36) have longer lifespan as compared to no preadaptation,
higher temperature (A36). Exposing the beetles to these
conditions for 15 generations longer (2016 vs 2015) also seems to
significantly increase lifespan for A30 and
P36 but the effect seems to be limited for
A36.
options(show.signif.stars = FALSE)
model1.1 <- lmer(lifespan ~ origin * year + (1 | population),
data = lifespan)
autoplot(lm(lifespan$lifespan ~ lifespan$origin*lifespan$year),
which = c(1:3, 5), ncol = 2, label.size = 3, label.repel = TRUE) +
theme_minimal()
The four plots show there is general homoscedasticity, normality and
linearity, and no heavily influential outliers. The residuals are
stratified because both predictors origin and
year are categorical variables, so the fitted values are
the estimated mean per level of each combination of origin
and year, while the residuals are within-group variations.
Overall, the model seems to be an acceptable fit.
model1.2 <- lmer(lifespan ~ origin * year + (1 | population),
data = lifespan, REML = F)
model1.3 <- lmer(lifespan ~ origin + year + (1 | population),
data = lifespan, REML = F)
anova(model1.2, model1.3, refit = F)
## Data: lifespan
## Models:
## model1.3: lifespan ~ origin + year + (1 | population)
## model1.2: lifespan ~ origin * year + (1 | population)
## npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
## model1.3 6 5520.2 5549.9 -2754.1 5508.2
## model1.2 8 5495.7 5535.4 -2739.8 5479.7 28.479 2 6.543e-07
Does including the origin × year
interaction significantly improve model fit? \(p < 0.0001\). There is extremely strong
evidence that interaction improves model fit. The interaction term is
must be kept.
model1.4 <- lmer(lifespan ~ origin * year + (1 | population),
data = lifespan, REML = T)
summary(model1.4)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: lifespan ~ origin * year + (1 | population)
## Data: lifespan
##
## REML criterion at convergence: 5480.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0126 -0.5977 -0.0044 0.6512 3.9868
##
## Random effects:
## Groups Name Variance Std.Dev.
## population (Intercept) 0.9284 0.9635
## Residual 10.7651 3.2810
## Number of obs: 1049, groups: population, 6
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 15.24401 0.61645 5.35687 24.729 9.95e-07
## originA36 -3.02709 0.36313 1040.17284 -8.336 2.41e-16
## originP36 -1.15280 0.86670 5.23283 -1.330 0.239
## year2016 2.02874 0.35858 1039.10745 5.658 1.98e-08
## originA36:year2016 -2.35309 0.51359 1039.22521 -4.582 5.17e-06
## originP36:year2016 0.03744 0.49103 1039.44397 0.076 0.939
##
## Correlation of Fixed Effects:
## (Intr) orgA36 orgP36 yr2016 oA36:2
## originA36 -0.316
## originP36 -0.711 0.225
## year2016 -0.318 0.541 0.226
## orgA36:2016 0.222 -0.703 -0.158 -0.698
## orgP36:2016 0.232 -0.395 -0.310 -0.730 0.510
The variance of the random effect is 1.318. Population random effect constitutes:
0.9274/(0.9274+10.7651)
## [1] 0.0793158
about 7.93% of remaining variance after the fixed effects are accounted for. This tells me that the random effect is not that important to be included in the model.
anova(model1.4)
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## origin 2865.28 1432.64 2 6.64 133.082 4.383e-06
## year 406.95 406.95 1 1039.34 37.803 1.114e-09
## origin:year 310.72 155.36 2 1039.40 14.432 6.575e-07
Since the interaction is statistically significant, I will only
report the origin:year stats. There is very strong evidence
that the interaction of origin and year (ie.
the interaction between temperature and pre-adaptation conditions, and
the length of time allowed for adaptation), affects the lifespan of the
beetles (\(F_{2,1039} = 14.4, \ P <
0.0001\)). Fig. 1B confirms the results with direction: beetle
lifespans are significantly higher at low temperature and even higher
when there are more generations for beetles to adapt.
predict1 <- ggpredict(model1.4, terms = c("origin", "year"), type = "random")
predictPlot <- plot(predict1)
predictPlot + labs(title = "Fig. 1C: Estimated means and intervals for origin, year, and population")
Since the random effect of population is not
significantly influential on the model, I chose to remove it for Fig.
1C. Now, A30 and P36 are significantly
visually different from A36, but A30 and
P36 are not significantly different from each other.
Additionally, 2016 lifespans are moderately significantly higher than
2015 lifespans except for A36, where it seems that exposing
the beetles to higher temperature line for more generations decreased
their lifespan instead. These conclusions match up with Fig. 1B.
Load mixed models data dataset and check the data.
# Read the "Newts" sheet
soil <- read_excel(path = '/Users/joylynngoh/Files/Uni/Y4/ES3307/Assignment 3/Assignment 3 data/Mixed models data.xlsx')
# Quick check of data structure
head(soil)
## # A tibble: 6 × 3
## TREATMNT PLANT DENSITY
## <dbl> <dbl> <dbl>
## 1 1 1 87
## 2 1 1 106.
## 3 1 1 63.7
## 4 1 1 60.1
## 5 1 1 66.1
## 6 1 1 37.9
str(soil)
## tibble [60 × 3] (S3: tbl_df/tbl/data.frame)
## $ TREATMNT: num [1:60] 1 1 1 1 1 1 1 1 1 1 ...
## $ PLANT : num [1:60] 1 1 1 1 1 1 1 1 1 1 ...
## $ DENSITY : num [1:60] 87 106.1 63.7 60.1 66.1 ...
Factorise TREATMNT and PLANT.
soil <- soil %>%
mutate(across(c(TREATMNT, PLANT), as.factor))
str(soil)
## tibble [60 × 3] (S3: tbl_df/tbl/data.frame)
## $ TREATMNT: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ PLANT : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ DENSITY : num [1:60] 87 106.1 63.7 60.1 66.1 ...
PLANT should be numbered uniquely from 1 to 6 but this
will be dealt with later (initially I recoded the variable before
starting on 2A but moved the recoding below to question 2C).
ggplot(soil, aes(x = TREATMNT, y = DENSITY, fill = PLANT)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Fig. 2A: Raw data density by treatment")
From Fig. 2A, it seems that treatment 2 produces higher average bacterial density than treatment 1, and plant 3 has the highest average density and plant 1 the lowest average density. However, it is also hard to say whether the means are significantly different, given that variation within plant measurements is also high. There are also several outliers.
model2.1 <- lm(DENSITY~TREATMNT, data = soil)
anova(model2.1) # to compare F-values and p-values
## Analysis of Variance Table
##
## Response: DENSITY
## Df Sum Sq Mean Sq F value Pr(>F)
## TREATMNT 1 9543 9543.2 12.504 0.0008059
## Residuals 58 44266 763.2
display(model2.1) # to compare fitted coefficients
## lm(formula = DENSITY ~ TREATMNT, data = soil)
## coef.est coef.se
## (Intercept) 85.41 5.04
## TREATMNT2 25.22 7.13
## ---
## n = 60, k = 2
## residual sd = 27.63, R-Squared = 0.18
Assuming that this is a reasonable analysis (and that it’s reasonable
to ignore PLANT as a blocking factor), there is very strong
evidence that TREATMNT positively affects
DENSITY (\(F_{1,58} = 12.5, \ P =
0.0008\)), where soil type 2 produces higher bacterial density
than soil type 1, according to the display function.
autoplot(lm(soil$DENSITY ~ soil$TREATMNT),
which = c(1:3, 5), ncol = 2, label.size = 3, label.repel = TRUE) +
theme_minimal()
Testing the model fit, it seems that there is no heteroscedasticity, and
the residuals seem to be quite linear and normal. No influential
outliers either.
When PLANT is numbered 1 to 3 in each treatment, R
treats PLANT as crossed with TREATMNT
when it is actually nested within TREATMNT.
Renumber plants 1-3 under treatment 2 as 4-6. I do not need to replot
the data since the only difference would be the numbers in the x-axis
for PLANT.
soil <- soil %>%
mutate(PLANT_RENUM = case_when(
TREATMNT == 1 ~ as.numeric(as.character(PLANT)),
TREATMNT == 2 ~ as.numeric(as.character(PLANT)) + 3
))
model2.2 <- lmer(DENSITY~TREATMNT + (1|PLANT_RENUM), data = soil)
anova(model2.2)
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## TREATMNT 1410.6 1410.6 1 4 2.5831 0.1833
display(model2.2)
## lmer(formula = DENSITY ~ TREATMNT + (1 | PLANT_RENUM), data = soil)
## coef.est coef.se
## (Intercept) 85.41 11.10
## TREATMNT2 25.22 15.69
##
## Error terms:
## Groups Name Std.Dev.
## PLANT_RENUM (Intercept) 17.74
## Residual 23.37
## ---
## number of obs: 60, groups: PLANT_RENUM, 6
## AIC = 552.6, DIC = 570.3
## deviance = 557.5
From the analysis, I observe that there is weak evidence that the soil treatments produce different bacterial densities (\(F_{1,4} = 2.58, \ P = 0.183\)).
What has changed? First, the denominator degrees of freedom
has dramatically deflated with proper nesting of PLANT in
the model. Secondly, the F-value has decreased and the p-value shows
that there is weak evidence that treatment 1 results in higher bacterial
density than in treatment 2. These results how seem to reflect Fig. 2B
much more accurately.
The difference is that when PLANT was not included in
the model (and was improperly crossed with TREATMNT), I am
led to conclude that there is strong evidence to show that treatment
affects density (in 2B), but when PLANT was included as a
random blocking factor, I conclude that treatment does not statistically
significantly affect density.
First, TREATMNT is not a continuous variable, which
means it does not vary continuously with DENSITY. With only
2 levels, the calculated slope will not be accurate.
Secondly, there are not enough replicates at the plant (random effect) level to fit a random slope AND intercept. There are only 6 plants, and each belong to only 1 treatment. To include a random slope, each plant must be observed under both treatments, and far more replication on the same plant is needed to estimate within-plant treatment slope.
From the analysis in 2C, I observe that there is weak evidence that
treatment 1 results in higher bacterial density than in treatment 2
(\(F_{1,4} = 2.58, \ P = 0.183\)).
Next, I report the importance of random effects by calculating
percentage variation from the summary() table.
summary(model2.2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: DENSITY ~ TREATMNT + (1 | PLANT_RENUM)
## Data: soil
##
## REML criterion at convergence: 544.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.57742 -0.50114 0.03571 0.68858 1.76106
##
## Random effects:
## Groups Name Variance Std.Dev.
## PLANT_RENUM (Intercept) 314.8 17.74
## Residual 546.1 23.37
## Number of obs: 60, groups: PLANT_RENUM, 6
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 85.41 11.10 4.00 7.697 0.00153
## TREATMNT2 25.22 15.69 4.00 1.607 0.18329
##
## Correlation of Fixed Effects:
## (Intr)
## TREATMNT2 -0.707
From the summary table, PLANT contributes to:
314.8/(314.8 + 546.1)
## [1] 0.3656638
36.6% of total variance. This is quite a significant proportion of
variance that the random effect accounts for, and would lead me to think
that PLANT should be included as a fixed effect, and the
study should consider other factors that might be affecting bacterial
density.
emm2 <- as.data.frame(emmeans(model2.2, ~ TREATMNT)) # this averages over the random effects, ie. PLANT.
ggplot(emm2, aes(x = TREATMNT, y = emmean)) +
geom_point(size = 2) +
geom_errorbar(aes(ymin = lower.CL, ymax = upper.CL), width = 0.1) +
labs(x = "Soil treatment", y = "Estimated mean density",
title = "Fig. 2B: Model estimated means (95% CI) by soil treatment") +
theme_minimal()
From Fig. 2D, the 95% confidence intervals show that there is weak visual evidence that soil type 2 produces higher mean bacterial density than soil type 1. This confirms the results and conclusions from above.
The random effects is Block. Blocks are randomised to
all possible field conditions so that the results can be generalizable
outside the study (in case certain configurations of nitrogen
concentrations and oat varieties might somehow affect adjacent plots’
yields). The fixed effects are nitro and
Variety.
Fixed effects are the focal predictor variables of the experiment whose effects the study is interested in, while random effects are effects that the study is not interested in, but that may potentially influence (confound) the results and thus need to be accounted for in the model.
This study design is both crossed and nested. First, all levels of
nitro are crossed with all levels of Variety,
but each of the 4 nitro levels is also nested within each
of the 3 Variety levels. Second, each of the
Variety levels is crossed in each of the 6
Block (all the oat varieties appear in all of the 6
blocks).
Crossed designs are where all levels of a factor experience all levels of another factor(s). Nested designs are where all levels of one factor only experience one level of another factor.
The full model is
Yield ~ nitro * Variety + (1|Block/Variety). Another
alternative is to exclude Variety in the random effect
structure: Yield ~ nitro * Variety + (1|Block).
Variety: each level of oat variety is replicated 6 times in total over the 6 blocks.
Nitrogen fertilizer: there are 18 replicates of each nitrogen concentration over the 6 blocks.
mean_treated_greenhouses = 10.442 # the larger the effect size, the higher power will be. increasing effect size, keeping everything else constant, by 0.3 increases power to nearly 1.
mean_control_greenhouses = 10.142
greenhouse_variation = 0.153
plant_variation = 0.5 # halving plant variation, keeping all other factors constant, improves power by 0.03.
number_of_treated_greenhouses = 5
number_of_control_greenhouses = 5
number_of_plants = 100
I choose to assume that realistically, I cannot control the means or variations of the greenhouses and plants. I also want to keep the number of treated and control greenhouses equal so the analysis can be orthogonal, and realistically, it is probably not feasible to use more than 5 greenhouses per treatment type (ie. the only variable I choose to change is the number of plants). I choose to aim for the widely accepted threshold of 80% power, which will guarantee that we expect to miss only 20% of true effects; however, I understand that in reality, a power of 70% is acceptable and more realistic for most ecological field studies. Since the goal is to maximise power with the fewest expensive factors (greenhouses), the above combination of 5 treated and 5 control greenhouses with 100 plants in each gives me a power of 73% (result is shown below) which is reasonable.
# Code copied over from tutorial
results = vector()
correct = 0
runs = 10000
set.seed(42)
variation = sqrt(greenhouse_variation^2 + ((plant_variation^2)/number_of_plants))
for(i in 1:runs){
treated <- rnorm(number_of_treated_greenhouses, mean =
mean_treated_greenhouses, sd = variation)
control <- rnorm(number_of_control_greenhouses, mean =
mean_control_greenhouses, sd = variation)
model4.1 <- t.test(treated, control, var.equal = TRUE)
p <- model4.1$p.value
if (p<0.05) correct = correct + 1
results[i] <- p
}
(power = correct/runs)
## [1] 0.7259
Since power is reasonably above the minimum 70% threshold, I will proceed with the analysis.
set.seed(42)
variation1 = sqrt(greenhouse_variation^2 + ((plant_variation^2)/number_of_plants))
treated1 <- rnorm(number_of_treated_greenhouses, mean =
mean_treated_greenhouses, sd = variation1)
control1 <- rnorm(number_of_control_greenhouses, mean =
mean_control_greenhouses, sd = variation1)
t.test(treated1, control1)
##
## Welch Two Sample t-test
##
## data: treated1 and control1
## t = 2.9741, df = 7.0025, p-value = 0.02068
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.05449848 0.47725793
## sample estimates:
## mean of x mean of y
## 10.51303 10.24716
Using the t-test used in the looped simulation, I observe that there is moderately strong evidence that applying Chalcid wasps to the fertiliser increases tomato yield (\(t_{7} = 2.97, \ P = 0.0207\)).
# Combine data into one dataframe
results4 <- data.frame(Treated = treated1, Control = control1)
# Calculate group means and 95% CI
results4 <- pivot_longer(
results4,
cols = c(Treated, Control),
names_to = "Treatment",
values_to = "Value"
)
MEM4 <- results4 %>%
group_by(Treatment) %>%
summarise(
Mean = mean(Value),
SE = sd(Value)/sqrt(n()),
lower_CI = Mean - qt(0.975, df = n()-1) * SE,
upper_CI = Mean + qt(0.975, df = n()-1) * SE
)
# Plot
ggplot(MEM4, aes(x = Treatment, y = Mean, color = Treatment)) +
geom_point(size = 4) +
geom_errorbar(aes(ymin = lower_CI, ymax = upper_CI), width = 0.1, linewidth = 1) +
labs(
title = "Fig. 4A: Model-Estimated Means ± 95% CI",
x = "Treatment",
y = "Yield (kg)"
) +
theme_minimal() +
theme(legend.position = "none")