library(tidyverse)
library(palmerpenguins)
library(ggfortify)
library(dplyr)
lifespan<-read.csv("C:/Users/paul/OneDrive/Documents/ES3307/lifespan.csv",header=TRUE)
head(lifespan)
## year Male.ID IR.treatment population origin date lifespan line
## 1 2015 117 20Gy C1 A30 28-Dec 11 1
## 2 2015 99 20Gy C1 A30 28-Dec 12 1
## 3 2015 104 Ctrl C1 A30 28-Dec 14 1
## 4 2015 109 Ctrl C1 A30 28-Dec 14 1
## 5 2015 108 Ctrl C1 A30 28-Dec 14 1
## 6 2015 100 Ctrl C1 A30 28-Dec 14 1
str(lifespan)
## 'data.frame': 1049 obs. of 8 variables:
## $ year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ Male.ID : chr "117" "99" "104" "109" ...
## $ IR.treatment: chr "20Gy" "20Gy" "Ctrl" "Ctrl" ...
## $ population : chr "C1" "C1" "C1" "C1" ...
## $ origin : chr "A30" "A30" "A30" "A30" ...
## $ date : chr "28-Dec" "28-Dec" "28-Dec" "28-Dec" ...
## $ lifespan : int 11 12 14 14 14 14 14 14 14 16 ...
## $ line : int 1 1 1 1 1 1 1 1 1 1 ...
summary(lifespan)
## year Male.ID IR.treatment population
## Min. :2015 Length:1049 Length:1049 Length:1049
## 1st Qu.:2015 Class :character Class :character Class :character
## Median :2016 Mode :character Mode :character Mode :character
## Mean :2016
## 3rd Qu.:2016
## Max. :2016
## origin date lifespan line
## Length:1049 Length:1049 Min. : 5.0 Min. :1.000
## Class :character Class :character 1st Qu.:12.0 1st Qu.:3.000
## Mode :character Mode :character Median :15.0 Median :5.000
## Mean :14.6 Mean :5.214
## 3rd Qu.:17.0 3rd Qu.:7.000
## Max. :29.0 Max. :9.000
Origin (3 levels: A30, A36, P36)
├─ Population 1 (line) ──> many Individuals
├─ Population 2 (line) ──> many Individuals
└─ Population 3 (line) ──> many Individuals
Each Origin has 3 Populations → total 9 populations. Each Population was sampled in 2015 and/or 2016 (individuals in each year). Fixed factors: Origin, Year Random factors: Population (line) as a grouping effect.
Population is nested within origin: each population belongs to exactly one origin (A30, A36 or P36) Year is crossed with origin (and with population): the factor year (2015 or 2016) applies across all origins and populations. Individuals are nested within population: each lifespan observation is from a single individual which belongs to one population and one year.
lifespan <- lifespan %>%
mutate(
origin = factor(origin, levels = c("A30", "A36", "P36")),
population = factor(population),
year = factor(year)
)
ggplot(lifespan, aes(x = origin, y = lifespan, color = year)) +
geom_jitter(aes(shape = year), width = 0.2, alpha = 0.6, size = 2) +
stat_summary(fun = mean, geom = "crossbar", width = 0.4,
color = "black", fatten = 2, alpha = 0.8) +
facet_wrap(~ population, ncol = 3) +
labs(
x = "Origin (Treatment)",
y = "Lifespan (days)",
color = "Year",
shape = "Year",
title = "Variation in beetle lifespan by origin, population, and year"
) +
theme_minimal(base_size = 13) +
theme(
strip.background = element_rect(fill = "grey90"),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank()
)
Figure 1 is a plot of the raw data to show how the lifespan of beetles
varies within origin, population and year
lifespan_summary <- lifespan %>%
group_by(origin, year) %>%
summarise(
mean_life = mean(lifespan, na.rm = TRUE),
se = sd(lifespan, na.rm = TRUE) / sqrt(n()),
.groups = "drop"
)
ggplot(lifespan_summary, aes(x = origin, y = mean_life,
group = year, color = year, shape = year)) +
geom_line(size = 1) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = mean_life - se, ymax = mean_life + se),
width = 0.15, linewidth = 0.6) +
labs(
x = "Origin",
y = "Mean lifespan (days)",
color = "Year",
shape = "Year",
title = "Interaction plot: Origin × Year effects on beetle lifespan"
) +
theme_minimal(base_size = 13) +
theme(
legend.position = "top",
panel.grid.minor = element_blank()
)
Figure 2 is an interaction plot between origin and year.The lines of
both years 2015 and 2016 cross at A36 origin suggesting an
interaction.
library(lme4)
library(lmerTest)
m1 <- lmer(lifespan ~ origin*year + (1 | population), data = lifespan)
summary(m1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: lifespan ~ origin * year + (1 | population)
## Data: lifespan
##
## REML criterion at convergence: 5480.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0126 -0.5977 -0.0044 0.6512 3.9868
##
## Random effects:
## Groups Name Variance Std.Dev.
## population (Intercept) 0.9284 0.9635
## Residual 10.7651 3.2810
## Number of obs: 1049, groups: population, 6
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 15.24401 0.61645 5.35688 24.729 9.95e-07 ***
## originA36 -3.02709 0.36313 1040.17284 -8.336 2.41e-16 ***
## originP36 -1.15280 0.86670 5.23284 -1.330 0.239
## year2016 2.02874 0.35858 1039.10745 5.658 1.98e-08 ***
## originA36:year2016 -2.35309 0.51359 1039.22521 -4.582 5.17e-06 ***
## originP36:year2016 0.03744 0.49103 1039.44397 0.076 0.939
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) orgA36 orgP36 yr2016 oA36:2
## originA36 -0.316
## originP36 -0.711 0.225
## year2016 -0.318 0.541 0.226
## orgA36:2016 0.222 -0.703 -0.158 -0.698
## orgP36:2016 0.232 -0.395 -0.310 -0.730 0.510
model1 <- lmer(lifespan ~ origin * year + (1 | population),
data = lifespan, REML = FALSE)
model2 <- lmer(lifespan ~ origin + year + (1 | population),
data = lifespan, REML = FALSE)
anova(model1,model2,refit = FALSE)
## Data: lifespan
## Models:
## model2: lifespan ~ origin + year + (1 | population)
## model1: lifespan ~ origin * year + (1 | population)
## npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
## model2 6 5520.2 5549.9 -2754.1 5508.2
## model1 8 5495.7 5535.4 -2739.8 5479.7 28.479 2 6.543e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The interaction term is significant in the model as P<0.001 so there is a significant difference between the two models with and without the interaction. The effect of origin does depends on year so the interaction should be kept in the model.
model3 <- lmer(lifespan ~ origin * year + (1 | population),
data = lifespan, REML = TRUE)
summary(model3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: lifespan ~ origin * year + (1 | population)
## Data: lifespan
##
## REML criterion at convergence: 5480.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0126 -0.5977 -0.0044 0.6512 3.9868
##
## Random effects:
## Groups Name Variance Std.Dev.
## population (Intercept) 0.9284 0.9635
## Residual 10.7651 3.2810
## Number of obs: 1049, groups: population, 6
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 15.24401 0.61645 5.35688 24.729 9.95e-07 ***
## originA36 -3.02709 0.36313 1040.17284 -8.336 2.41e-16 ***
## originP36 -1.15280 0.86670 5.23284 -1.330 0.239
## year2016 2.02874 0.35858 1039.10745 5.658 1.98e-08 ***
## originA36:year2016 -2.35309 0.51359 1039.22521 -4.582 5.17e-06 ***
## originP36:year2016 0.03744 0.49103 1039.44397 0.076 0.939
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) orgA36 orgP36 yr2016 oA36:2
## originA36 -0.316
## originP36 -0.711 0.225
## year2016 -0.318 0.541 0.226
## orgA36:2016 0.222 -0.703 -0.158 -0.698
## orgP36:2016 0.232 -0.395 -0.310 -0.730 0.510
The random effect population only explains 7.9% of variance after variance is explained by our fixed effects as 0.9284/(0.9284+10.7651) = 0.079 x 100 = 7.9% Therefore, individuals from different populations are not much different in average lifespan and the random intercept may not be critical. However it should still be kept in the model as the design is grouped by population. More of the variance is therefore explained by the interactions between our fixed effects.
anova(model3)
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## origin 2865.28 1432.64 2 6.64 133.082 4.383e-06 ***
## year 406.95 406.95 1 1039.34 37.803 1.114e-09 ***
## origin:year 310.72 155.36 2 1039.40 14.432 6.575e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is a very strong statistically significant difference between beetle lifespan and the three origins (A30,A36 and P36) (F2=133.1, p<0.001), suggesting that preadaptation and temperture both affect longevity. There is a statistcially significant difference between lifespan across the two years 2015 and 2016, likely reflecting the preadapted populations in 2026 experiencing 15 more generations at higher temperatures (F1=37.8, p<0.001).The interaction between origin and year is also statistcially significant, suggesting that the effect of origin on lifespan depends on the year, therefore, you cannot interpret origin or year alone and they must be considered together (F2 = 14.4, p<0.001).
library(emmeans)
library(ggplot2)
emm <- emmeans(model3, ~ origin * year)
emm_df <- as.data.frame(emm)
ggplot(emm_df, aes(x = origin, y = emmean, color = year, group = year)) +
geom_point(size = 3) +
geom_line(size = 1) +
geom_errorbar(aes(ymin = lower.CL, ymax = upper.CL), width = 0.15, size = 0.7) +
labs(
x = "Origin (Treatment)",
y = "Predicted lifespan (days)",
color = "Year",
title = "Model estimated means of lifespan by origin and year"
) +
theme_minimal(base_size = 13) +
theme(
legend.position = "top",
panel.grid.minor = element_blank()
)
Figure 3 is a plot of the model estimated means and intervals of
lifespan by origin and year against predicted lifespan.
density<-read.csv("C:/Users/paul/OneDrive/Documents/ES3307/density(Sheet1).csv",header=TRUE)
head(density)
## TREATMNT PLANT DENSITY
## 1 1 1 87.0
## 2 1 1 106.1
## 3 1 1 63.7
## 4 1 1 60.1
## 5 1 1 66.1
## 6 1 1 37.9
str(density)
## 'data.frame': 60 obs. of 3 variables:
## $ TREATMNT: int 1 1 1 1 1 1 1 1 1 1 ...
## $ PLANT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DENSITY : num 87 106.1 63.7 60.1 66.1 ...
summary(density)
## TREATMNT PLANT DENSITY
## Min. :1.0 Min. :1 Min. : 12.30
## 1st Qu.:1.0 1st Qu.:1 1st Qu.: 79.78
## Median :1.5 Median :2 Median :100.00
## Mean :1.5 Mean :2 Mean : 98.03
## 3rd Qu.:2.0 3rd Qu.:3 3rd Qu.:119.05
## Max. :2.0 Max. :3 Max. :156.10
density <- density %>%
mutate(
TREATMNT = factor(TREATMNT, labels = c("Soil 1", "Soil 2")),
PLANT = factor(PLANT)
)
ggplot(density, aes(x = TREATMNT, y = DENSITY, fill = PLANT)) +
geom_boxplot(alpha = 0.3, outlier.shape = NA) +
geom_jitter(aes(color = PLANT), width = 0.2, size = 2) +
labs(x = "Soil Treatment", y = "Bacterial density", title = "How bacterial density varies with treatment and plant") +
theme_classic()
Figure 4 is a plot to show how bacterial density varies with soil
treatment (1 or 2) and plant (1, 2 or 3 for each treatment) including
boxlots to show the variance around the mean for each one.
dm1<-lm(DENSITY~TREATMNT,data=density)
summary(dm1)
##
## Call:
## lm(formula = DENSITY ~ TREATMNT, data = density)
##
## Residuals:
## Min 1Q Median 3Q Max
## -73.113 -19.913 -0.413 19.862 57.487
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85.413 5.044 16.934 < 2e-16 ***
## TREATMNTSoil 2 25.223 7.133 3.536 0.000806 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.63 on 58 degrees of freedom
## Multiple R-squared: 0.1774, Adjusted R-squared: 0.1632
## F-statistic: 12.5 on 1 and 58 DF, p-value: 0.0008059
Soil treatment has a statistically significant effect on bacterial density (F1,58=12.504, p<0.001).This interpreation assumes independence of all observations, when in reality measurements are nested within plants so there may be plant-level variation, there potentially inflating the significance leading to a possible type 1 error
density <- density %>%
mutate(
PLANT_ID = paste0("T", TREATMNT, "_P", PLANT),
PLANT_ID = factor(PLANT_ID)
)
dm2<-lmer(DENSITY~TREATMNT+(1|PLANT_ID),data=density)
summary(dm2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: DENSITY ~ TREATMNT + (1 | PLANT_ID)
## Data: density
##
## REML criterion at convergence: 544.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.57742 -0.50114 0.03571 0.68858 1.76106
##
## Random effects:
## Groups Name Variance Std.Dev.
## PLANT_ID (Intercept) 314.8 17.74
## Residual 546.1 23.37
## Number of obs: 60, groups: PLANT_ID, 6
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 85.41 11.10 4.00 7.697 0.00153 **
## TREATMNTSoil 2 25.22 15.69 4.00 1.607 0.18329
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## TREATMNTSl2 -0.707
This model accounts for Plant as a random effect which explains 36.6% of the variance in the model after variance had been explained by the fixed effects. As a result, the p-value is increased and there is no longer a statistically significant difference between soil treatment 2 and bacteria density when plant is included as a random effect due to it being nested in the experimental design.
To fit random slopes the x variable fixed effect needs to be a continuous variable and our here is a categorical A random slope model allows the effect of treatment to vary across plants. However, in our experimental design there is only one treatment per plant and each plant is nested in either soil 1 or soil 2. Random slopes require repeated measures of the predictor within each level of the random effect so each plant would need observations in both soil 1 and 2 to estimate how the slope of the treatments varies by plant. There is no within-plant replication across treatments so the model cannot distinguish the slope variation from the fixed effect itself.
There is a significant difference between soil 1 and bacterial density (t4=7.697, p=0.002), however there is no statistical significance between soil treatment 2 and bacterial density (t4=1.607, p=0.183)once plant-to-plant variability has been accounted for. Whereas, in the previous model where the random effects were no accounted for there was a significant difference in both soil treatments and bacterial densities. Therefore, using random effects in this model eliminates the type 1 error.
emm <- emmeans(dm2, ~ TREATMNT)
emm_df <- as.data.frame(emm)
ggplot() +
geom_jitter(data = density, aes(x = TREATMNT, y = DENSITY), width = 0.2, alpha = 0.5) +
geom_point(data = emm_df, aes(x = TREATMNT, y = emmean), color = "red", size = 3) +
geom_errorbar(data = emm_df, aes(x = TREATMNT, ymin = lower.CL, ymax = upper.CL), width = 0.2, color = "red", size = 0.7) +
labs(x = "Soil Treatment", y = "Bacterial density (colonies per plate)",
title = "Model-estimated means and raw data") +
theme_minimal(base_size = 13)
Figure 5 is a plot showing the model estimated means and intervals for
both soil treatments for bacterial density using the model from 2C which
accounts for plant as a random effect. the plot alos includes points
from the raw data demonstrtaing the fit of the model.
Fixed effects = Variety and Nitrogen fertiliser (these are what we are comparing). Fixed effects are our explanatory variables which we are interested in making conclusions about. Random effects are usually grouping factors for which we are trying to control, however we are not interested in their impact on the response variable, but we know they might be influencing the patterns we see. Random effects = block because the six blocks are a random sample of possible field locations, we are not interested in the difference of their impact, just understand that Blocks will account for some of the models variation and influence the results.
Nested designs are where levels of one factor exist only within levels of another factor whereas crossed esigns are where all levels of one factor occur with all levels of another factor. Nitrogen is nested within variety plots (each variety transect contains four subplots with different nitrogen levels) All six blocks contain all varieties suggesting that varieties are crossed with block So the experimental design is crossed and nested
lmer(yield ~ variety * Nitro + (1|Block/Variety))
(interaction present between variety and nitrogen as variety x nitrogen is a fixed interaction as we are interested in whether varieties respond differently to nitrogen)
Variety = (6 blocks x 1 plot per block per variety) = 6 replicates per variety Nitrogen fertilizer = (6 blocks x 3 varieties x 1 subplot per level) = 6 replicates per nitrogen level per variety
mean_treated_greenhouses = 15
mean_control_greenhouses = 10
greenhouse_variation = 0.153
plant_variation = 0.5
number_of_treated_greenhouses = 6
number_of_control_greenhouses = 6
number_of_plants = 5
Power (based on 10,000 simulations) = 1
A large mean difference between treated greenhouses and control greenhouses of 5 (15-10) means a fewer number of treated and control greenhouses are required. A total of 12 greenhouses is a realistic number in terms of space,cost and logistics but it also provides a sufficient amount of replication. There is a very small between-greenhouse variation (SD=0.153) assuming excellent control of external environmnetal conditions (temperature, light, soil, water) so highly controlled facilities woukd be needed. Sampling multiple plants, 5 per greehouse, reduces the uncertaintly of each greenhouse mean, but is also resource conscious. Harvesting 60 plants is managable for a small team and having 12 greenhouses is demanding but feasible for well-funded labs or universities. Choosing a design that achieves high power (like this one) with modest replication avoids waste so that we are not running an unnecesarily large experiment than needed.
mean_treated_greenhouses = 15
mean_control_greenhouses = 10
greenhouse_variation = 0.153
plant_variation = 0.5
number_of_treated_greenhouses = 6
number_of_control_greenhouses = 6
number_of_plants = 5
variation = sqrt(greenhouse_variation^2 + ((plant_variation^2)/number_of_plants))
variation
## [1] 0.270941
treated <- rnorm(number_of_treated_greenhouses, mean = mean_treated_greenhouses, sd = variation)
control <- rnorm(number_of_control_greenhouses, mean = mean_control_greenhouses, sd = variation)
data <- data.frame(
Treatment = factor(rep(c("Control", "Treated"), each = 6)),
Yield = c(control, treated)
)
model <- lm(Yield ~ Treatment, data = data)
anova(model)
## Analysis of Variance Table
##
## Response: Yield
## Df Sum Sq Mean Sq F value Pr(>F)
## Treatment 1 74.589 74.589 697.15 1.4e-10 ***
## Residuals 10 1.070 0.107
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There was a significant difference between treated greenhouses and controlled greenhouses (F1,10 =1085.2, p<0.001). The very low p value and very high f value reiterate the power of the experimental design and the very strong significant difference between the two tomato yield means.
emm <- emmeans(model, ~ Treatment)
emm
## Treatment emmean SE df lower.CL upper.CL
## Control 10 0.134 10 9.75 10.3
## Treated 15 0.134 10 14.74 15.3
##
## Confidence level used: 0.95
emm_df <- as.data.frame(emm)
ggplot(emm_df, aes(x = Treatment, y = emmean, colour = Treatment)) +
geom_point(size = 4) +
geom_errorbar(aes(ymin = lower.CL, ymax = upper.CL), width = 0.2, linewidth = 1) +
labs(
title = "Model Estimated Tomato Yields by Treatment",
x = "Treatment",
y = "Estimated Mean Yield (±95% CI)"
) +
theme_minimal(base_size = 14) +
theme(legend.position = "none")