#installing packages
options(repos = c(CRAN = "https://cloud.r-project.org"))
install.packages("lme4")
## package 'lme4' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LIEWY\AppData\Local\Temp\RtmpKYxjTR\downloaded_packages
install.packages("emmeans")
## package 'emmeans' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LIEWY\AppData\Local\Temp\RtmpKYxjTR\downloaded_packages
install.packages("ggeffects")
## package 'ggeffects' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LIEWY\AppData\Local\Temp\RtmpKYxjTR\downloaded_packages
#loading packages
library(tidyverse)
library(ggplot2)
library(Biostatistics)
library(readxl)
library(Matrix)
library(lme4)
library(emmeans)
library(ggeffects)
#importing data
bruchid <- read.csv("C:/Users/LIEWY/Downloads/lifespan.csv")
mmd <- read_excel("C:/Users/LIEWY/Downloads/Mixed models data.xlsx", sheet = "Sheet1")
#exploring data structure
str(bruchid)
## 'data.frame': 1049 obs. of 8 variables:
## $ year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ Male.ID : chr "117" "99" "104" "109" ...
## $ IR.treatment: chr "20Gy" "20Gy" "Ctrl" "Ctrl" ...
## $ population : chr "C1" "C1" "C1" "C1" ...
## $ origin : chr "A30" "A30" "A30" "A30" ...
## $ date : chr "28-Dec" "28-Dec" "28-Dec" "28-Dec" ...
## $ lifespan : int 11 12 14 14 14 14 14 14 14 16 ...
## $ line : int 1 1 1 1 1 1 1 1 1 1 ...
#conversion of characters into factors
bruchid$year <- as.factor(bruchid$year)
bruchid$IR.treatment <- as.factor(bruchid$IR.treatment)
bruchid$population <- as.factor(bruchid$population)
bruchid$origin <- as.factor(bruchid$origin)
bruchid$date <- as.factor(bruchid$date)
str(bruchid)
## 'data.frame': 1049 obs. of 8 variables:
## $ year : Factor w/ 2 levels "2015","2016": 1 1 1 1 1 1 1 1 1 1 ...
## $ Male.ID : chr "117" "99" "104" "109" ...
## $ IR.treatment: Factor w/ 2 levels "20Gy","Ctrl": 1 1 2 2 2 2 2 1 1 1 ...
## $ population : Factor w/ 6 levels "C1","C2","C3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ origin : Factor w/ 3 levels "A30","A36","P36": 1 1 1 1 1 1 1 1 1 1 ...
## $ date : Factor w/ 15 levels "1-Jan","1-Mar",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ lifespan : int 11 12 14 14 14 14 14 14 14 16 ...
## $ line : int 1 1 1 1 1 1 1 1 1 1 ...
Visualizing the data
#creating a boxplot
bruchid %>%
ggplot(aes(x = origin, y = lifespan, fill = population))+
geom_boxplot(width = 0.5, position = position_dodge(0.6))+
facet_grid(~year)+
labs( x = "Origin", y = "Lifespan(days)")+
theme_minimal()
To plot an interaction plot, the interaction considered must be significant. The significance of the interaction between origin and year is tested using an ANOVA including origin*year as an interaction.
#fitting linear model with interaction
bruchidlm1 <- lm(lifespan ~ origin*year, data = bruchid)
anova(bruchidlm1)
## Analysis of Variance Table
##
## Response: lifespan
## Df Sum Sq Mean Sq F value Pr(>F)
## origin 2 3571.0 1785.49 155.612 < 2.2e-16 ***
## year 1 454.6 454.64 39.624 4.528e-10 ***
## origin:year 2 280.8 140.40 12.236 5.587e-06 ***
## Residuals 1043 11967.4 11.47
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#checking fit of linear model
par(mfrow = c(2, 2))
plot(bruchidlm1)
From the ANOVA table, there was a statistically significant interaction between origin and year (Origin X Year:F2,1043 = 12.24, P < 0.001). The validity of these results are supported by the good fit and compliance to assumptions of the linear plot used for the data, which is seen from the diagnostic plots:
Residuals vs Fitted: The spread of residuals around the horizontal line is even, indicating a linear relationship.
Q-Q Residuals: The residuals closely the diagonal line, indicating a normal distribution.
Scale-Location: The residuals are evenly spread along the range of predictors, indicating that the data fulfills the assumption of homoscedasticity (equal variance).
Residuals vs Leverage: No residual falls outside the Cook’s distance line, indicating that there are no influential outliers.
#plotting the interaction plot
interaction.plot(x.factor = bruchid$year,
trace.factor = bruchid$origin,
response = bruchid$lifespan, #y-variable
fun = median,
ylab = "Lifespan (days)",
xlab = "Year",
col = c("pink", "blue", "purple"),
lty = 1, #specifies that desired line type is solid
lwd = 2, #width of the lines
trace.label = "Origin")
#fitting a random intercept model
bruchidlm2 <- lmer(lifespan ~ origin * year + (1|year), data = bruchid)
anova(bruchidlm2)
## Analysis of Variance Table
## npar Sum Sq Mean Sq F value
## origin 2 3326.6 1663.28 144.960
## year 1 18.4 18.44 1.607
## origin:year 2 280.8 140.40 12.236
#model without interaction term
bruchidlm3 <- lmer(lifespan ~ origin + year + (1| year/population), bruchid, REML = F)
#model with interaction term
bruchidlm4 <- lmer(lifespan ~ origin * year + (1| year/population), bruchid, REML = F)
#comparing anovas
anova(bruchidlm3, bruchidlm4, refit = FALSE)
## Data: bruchid
## Models:
## bruchidlm3: lifespan ~ origin + year + (1 | year/population)
## bruchidlm4: lifespan ~ origin * year + (1 | year/population)
## npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
## bruchidlm3 7 5526.8 5561.5 -2756.4 5512.8
## bruchidlm4 9 5508.0 5552.6 -2745.0 5490.0 22.805 2 1.117e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
When the 2 linear models are compared, the interaction term (origin:year) significantly improves the model fit (P <0.001). This tells us that the effect of origin depends on year and that year should be included as an interaction term in the model.
#creating a linear model where year is an interaction term and population is a random effect
bruchidlm5 <- lmer(lifespan ~ origin * year + (1| population), bruchid, REML = T)
summary(bruchidlm5)
## Linear mixed model fit by REML ['lmerMod']
## Formula: lifespan ~ origin * year + (1 | population)
## Data: bruchid
##
## REML criterion at convergence: 5480.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0126 -0.5977 -0.0044 0.6512 3.9868
##
## Random effects:
## Groups Name Variance Std.Dev.
## population (Intercept) 0.9284 0.9635
## Residual 10.7651 3.2810
## Number of obs: 1049, groups: population, 6
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 15.24401 0.61645 24.729
## originA36 -3.02709 0.36313 -8.336
## originP36 -1.15280 0.86670 -1.330
## year2016 2.02874 0.35858 5.658
## originA36:year2016 -2.35309 0.51359 -4.582
## originP36:year2016 0.03744 0.49103 0.076
##
## Correlation of Fixed Effects:
## (Intr) orgA36 orgP36 yr2016 oA36:2
## originA36 -0.316
## originP36 -0.711 0.225
## year2016 -0.318 0.541 0.226
## orgA36:2016 0.222 -0.703 -0.158 -0.698
## orgP36:2016 0.232 -0.395 -0.310 -0.730 0.510
The inclusion of population as a random intercept improved model fit and accounted for approximately 8% of the variance in lifespan. This indicates modest but meaningful differences in mean lifespan among populations.
anova(bruchidlm5)
## Analysis of Variance Table
## npar Sum Sq Mean Sq F value
## origin 2 3006.04 1503.02 139.620
## year 1 450.61 450.61 41.859
## origin:year 2 310.72 155.36 14.432
A linear mixed-effects model with population as a random intercept revealed significant effects of origin (F~2, 6.64~ = 133.08, p < 0.001), year (F~1, 1039~ = 37.80, p < 0.001), and their interaction (F~2, 1039~ = 14.43, p < 0.001) on beetle lifespan. The significant interaction indicates that differences in lifespan among origins depend on the year.
#Final model
finallm <- lmer(lifespan ~ origin * year + (1 | population),
data = bruchid, REML = TRUE)
#Predicted means and 95% CI
predmean <- ggpredict(finallm, terms = c("origin", "year"))
#Plotting model
ggplot(predmean, aes(x = x, y = predicted, color = group, group = group)) +
geom_point(size = 4) +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.15, size = 0.9) +
labs(x = "Origin",
y = "Predicted Lifespan",
color = "Year",
title = "Predicted Lifespan by Origin and Year")+
theme_minimal()
Checking the data structure
#checking data structure
str(mmd)
## tibble [60 × 3] (S3: tbl_df/tbl/data.frame)
## $ TREATMNT: num [1:60] 1 1 1 1 1 1 1 1 1 1 ...
## $ PLANT : num [1:60] 1 1 1 1 1 1 1 1 1 1 ...
## $ DENSITY : num [1:60] 87 106.1 63.7 60.1 66.1 ...
#converting PLANT and TREATMNT to categorical variables
mmd$TREATMNT <- as.factor(mmd$TREATMNT)
mmd$PLANT <- as.factor(mmd$PLANT)
#checking after conversion
str(mmd)
## tibble [60 × 3] (S3: tbl_df/tbl/data.frame)
## $ TREATMNT: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ PLANT : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ DENSITY : num [1:60] 87 106.1 63.7 60.1 66.1 ...
Plotting the data
#plotting a boxplot
mmd %>%
ggplot(aes(x = TREATMNT, y = DENSITY, fill = PLANT))+
geom_boxplot(width = 0.5,
position = position_dodge(width = 0.8),
alpha = 0.5)+
geom_point(size = 0.5)+
geom_jitter(
aes(color = PLANT),
position = position_dodge(width = 0.8))+
labs(x = "Soil type",
y = "Colony Density")+
theme_minimal()
#fitting a linear model
mmdlm1 <- lm(DENSITY~TREATMNT, data = mmd)
anova(mmdlm1)
## Analysis of Variance Table
##
## Response: DENSITY
## Df Sum Sq Mean Sq F value Pr(>F)
## TREATMNT 1 9543 9543.2 12.504 0.0008059 ***
## Residuals 58 44266 763.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We assume that this model is reasonable, where the model fits the data well and the assumptions of linearity and homoscedasticity are met. The results of this ANOVA analysis strongly support the observation from the boxplot that the leaves of sugar beets grown in Soil Type 2 have a significantly higher bacterial density than those grown in Soil Type 1 (F1,58 = 12.5, P < 0.001).
When PLANT to be considered a random factor (blocking factor), each of the six plants tested is considered an individual block.
#recoding each plant to be an individual block
mmd$PLANTNEW <- as.factor((rep(c(1:6), c(10, 10, 10, 10, 10, 10))))
str(mmd)
## tibble [60 × 4] (S3: tbl_df/tbl/data.frame)
## $ TREATMNT: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ PLANT : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ DENSITY : num [1:60] 87 106.1 63.7 60.1 66.1 ...
## $ PLANTNEW: Factor w/ 6 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
#running a linear model
install.packages("lmerTest")
## package 'lmerTest' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LIEWY\AppData\Local\Temp\RtmpKYxjTR\downloaded_packages
library(lmerTest)
mmdlm2 <- lmer(DENSITY ~ TREATMNT + (1| PLANTNEW), mmd)
anova(mmdlm2)
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## TREATMNT 1410.6 1410.6 1 4 2.5831 0.1833
summary(mmdlm2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: DENSITY ~ TREATMNT + (1 | PLANTNEW)
## Data: mmd
##
## REML criterion at convergence: 544.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.57742 -0.50114 0.03571 0.68858 1.76106
##
## Random effects:
## Groups Name Variance Std.Dev.
## PLANTNEW (Intercept) 314.8 17.74
## Residual 546.1 23.37
## Number of obs: 60, groups: PLANTNEW, 6
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 85.41 11.10 4.00 7.697 0.00153 **
## TREATMNT2 25.22 15.69 4.00 1.607 0.18329
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## TREATMNT2 -0.707
The results of the linear model with PLANT is included as a random factor provide little evidence that usage of either soil type increases the density of bacteria on sugar beet leaves s (F1,4 = 2.5831, p = 0.1833). This is in contrast to the conclusion of the intitial model that shows very strong evidence that soil type 2 increases the density of bacteria.
Random slopes can only be fitted if X is a continuous variable. In this case, X is a categorical variable with 2 levels - Soil types 1 and 2, and hence random slopes cannot be used to model this data.
When Plant is included as a random factor, the ANOVA results show that there is little evidence that there is a difference in the density of bacteria on sugar beet leaves when they are grown in different soil types (F = 2.5831, p = 0.1833).
mmd_emmean <- emmeans(mmdlm2, ~ TREATMNT)
me_df <- as.data.frame(mmd_emmean)
ggplot(me_df) +
geom_point(aes(x = TREATMNT, y = emmean),
position = position_dodge(width = 0.5),
size = 4) +
geom_errorbar(aes(x = TREATMNT, ymin=lower.CL, ymax = upper.CL), width = 0.5, size = 1, position = position_dodge(width = 0.5)) +
labs(x = "Soil Treatment", y = "Bacterial density")+
theme_minimal()
The fixed effects are levels of nitrogen fertilizer used and the oat varieties.
The random effects are the study blocks.
Fixed effects are explanatory variables whose effects we are interested in testing. Random effects are grouping factors whose effects we are mostly not interested in, that can be used to control for non-independence within a nested/hierarchical structure.
This study’s design is fully crossed, since every nitrogen level is associated with every variety, and every variety is associated with each block.
In crossed designs, each level of predictor is associated with every level of the higher-level predictor.
In nested designs, each level of predictors is uniquely associated with only one level of the higher-level predictor.
lmer(yield ~ Variety * nitro + (1| Block/Variety))
Variety: 6 Nitrogen fertilizer: 18
mean_treated_greenhouses = 10.442 kg
mean_control_greenhouses = 10.142 kg
greenhouse_variation = 0.10 kg
plant_variation = 0.10 kg
number_of_treated_greenhouses = 5
number_of_control_greenhouses = 5
number_of_plants = 50
Power (based on 10,000 simulations) ~1.0
The experimental design uses 5 treated and 5 control greenhouses, each with 10 plants measured, due to limitations in resources. Averaging 50 plants per greenhouse reduces within-greenhouse variation, providing a reliable estimate of greenhouse yield. This design accounts for the targeted increase of 0.3 kg. With relatively low variability between greenhouses, this setup provides high power to detect the expected effect while remaining feasible to implement.
#setting up simulation
mean_treated_greenhouses = 10.442
mean_control_greenhouses = 10.142
greenhouse_variation = 0.10
plant_variation = 0.10
number_of_treated_greenhouses = 5
number_of_control_greenhouses = 5
number_of_plants = 50
variation = sqrt(greenhouse_variation^2 + ((plant_variation^2)/number_of_plants))
treated <- rnorm((number_of_treated_greenhouses * number_of_plants), mean = mean_treated_greenhouses, sd = variation)
control <- rnorm((number_of_control_greenhouses* number_of_plants), mean = mean_control_greenhouses, sd = variation)
#creating data
yield <- c(treated, control)
tomatoes <- as.data.frame(yield)
colnames(tomatoes) <- "yield"
tomatoes$wasps <- as.factor(rep(c("Treated", "Control"), each = 50))
tomatoes$greenhouse <- as.factor(rep(1:10, each = 10))
#visualizing data
str(tomatoes)
## 'data.frame': 500 obs. of 3 variables:
## $ yield : num 10.5 10.6 10.4 10.4 10.4 ...
## $ wasps : Factor w/ 2 levels "Control","Treated": 2 2 2 2 2 2 2 2 2 2 ...
## $ greenhouse: Factor w/ 10 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
ggplot(tomatoes, aes(wasps, yield, fill = greenhouse)) +
geom_boxplot()
#fitting a model
tomatolm1 <- lmer(yield ~ wasps + (1| greenhouse), tomatoes)
#checking diagnostic plot
plot(tomatolm1)
#running linear model
anova(tomatolm1)
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## wasps 0.49997 0.49997 1 498 16.175 6.671e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
When the simulation data is plotted, normality with no obvious outliers are observed and the diagnostic plot also shows homoscedasticity. A linear model with random effects is hence fitted.
The ANOVA results provide strong evidence that greenhouses with wasps have significantly increased tomato yields as compared to that of the control greenhouses (F1,498 = 20.4, P < 0.001).
t1_emm <- emmeans(tomatolm1, ~ wasps)
t1_emm <- as.data.frame(t1_emm)
ggplot(t1_emm) +
geom_point(aes(x = wasps, y = emmean),
position = position_dodge(width = 0.5),
size = 3) +
geom_errorbar(aes(x = wasps, ymin=lower.CL, ymax = upper.CL),
position = position_dodge(width = 0.5)) +
labs(x = "Treatment", y = "Tomato Yield")+
theme_minimal()