This material corresponds to Week 4 of the labs and demos for Bio 4259F (Undergraduate) and Bio 9915A (Graduate) at Western University.

Load libraries

Note: it’s a good idea to load libraries in a separate code chunk, so you don’t have to mess with the setup chunk again.

library(ggplot2)      
library(dplyr)
library(ggfortify)
library(arm)

Why simulate data?

  • explore the behaviour of a statistical model in order to advance your understanding (e.g., vary the slope and intercept of a regression)
  • evaluate the applicability of a statistical model for a particular study design, before undertaking the experiment
  • create exemplars for teaching purposes

Get data

Make sure to get data in a new code chunk, otherwise the pathname will not work.

test1 <- read.csv("data/plant.growth.rate.csv",header=T)
glimpse(test1)
## Rows: 50
## Columns: 2
## $ soil.moisture.content <dbl> 0.4696876, 0.5413106, 1.6979915, 0.8255799, 0.85…
## $ plant.growth.rate     <dbl> 21.31695, 27.03072, 38.98937, 30.19529, 37.06547…
head(test1)
##   soil.moisture.content plant.growth.rate
## 1             0.4696876          21.31695
## 2             0.5413106          27.03072
## 3             1.6979915          38.98937
## 4             0.8255799          30.19529
## 5             0.8568173          37.06547
## 6             1.6082405          43.24982

Simulatting data

Simulate data for a simple linear regression. This is based on the example in Beckerman et al. (2017), which models plant growth rate as a function of soil moisture content.

# Set coefficients (see page 116)
alpha = 2.348
beta1 = 20.750

# set sample size
n = 200

# Set residual standard error (see page 116)
# this is the positive square root of the MSE
rse = 8.019

# get max and min values for soil.moisture
min=min(test1$soil.moisture.content)
max=max(test1$soil.moisture.content)

# randomize values of soil moisture across known range
data <- data.frame(soil.moisture=runif(n,min=min,max=max))
# generate response values
data1 <- data %>%
  mutate(e = rnorm(n(),mean=0,sd=rse),
         growth.rate =alpha+beta1*soil.moisture+e) 
  # used MSE as an estimate of error variance
# view data
#head(data1)

Visualize:

# visualize
ggplot(data=data1,aes(x=soil.moisture,y=growth.rate))+
  geom_point()+
  geom_smooth(method=lm)
## `geom_smooth()` using formula = 'y ~ x'

Pretty the same? What does the slope represent in this regression line?

Model fitting and residuals

resid_model: residuals from regression line. resid_y_only: deviation from the mean (as if only intercept).

fit <- lm(growth.rate ~ soil.moisture, data = data1)

# Residuos del modelo (SSE = sum e^2)
data1$resid_model <- resid(fit)

# "Residuos de solo Y": centrar Y respecto a su media (equivalen a resid(lm(y ~ 1)))
ybar <- mean(data1$growth.rate)
data1$resid_y_only <- data1$growth.rate - ybar   # esto es lo que pides

# Cálculo manual de R^2
SSE <- sum(data1$resid_model^2)         # Error Sum of Squares
SSE
## [1] 13191.41

We can calculate R² manually:

There is first visually deviation from the mean Y

y_mean   <- mean(data1$growth.rate)
y_median <- median(data1$growth.rate)

ggplot(data1, aes(x = "", y = growth.rate)) +
  # Boxplot principal
  geom_boxplot(width = 0.25, outlier.alpha = 0.5, fill = "grey90") +
  # Puntos individuales
  geom_jitter(width = 0.05, alpha = 0.5, color = "steelblue") +
  # Segmentos desde cada punto hacia la media
  geom_segment(aes(x = 0.9, xend = 1.1, 
                 y = growth.rate, yend = y_median),
             alpha = 0.3, color = "darkgreen") +
geom_hline(yintercept = y_median, linetype = "dotted", color = "darkgreen") +
  geom_hline(yintercept = y_median, linetype = "dotted", color = "black") +
  # Anotaciones
  annotate("text", x = 1.2, y = y_mean, 
           label = sprintf("mean = %.2f", y_mean),
           hjust = 0, vjust = -0.3, color = "red") +
  annotate("text", x = 1.2, y = y_median, 
           label = sprintf("median = %.2f", y_median),
           hjust = 0, vjust = 1.3) +
  labs(title = "Boxplot de Y con residuos hacia la media",
       y = "Growth rate", x = NULL) +
  theme_classic() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

SST <- sum(data1$resid_y_only^2)        # Total Sum of Squares
SST
## [1] 35325.57

The total “Error” or Y residuals

There is visually Error or residuals from regression line

data1$fitted <- predict(fit, newdata = data1)

ggplot(data1, aes(x = soil.moisture, y = growth.rate)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = lm, se = FALSE) +
  # segmentos desde cada punto hasta su valor ajustado (misma x)
  geom_segment(aes(xend = soil.moisture, yend = fitted),
               alpha = 0.4) +
  labs(title = "Puntos y distancias (residuos) a la recta de regresion",
       x = "Soil moisture", y = "Growth rate") +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

SSE <- sum(data1$resid_model^2)         # Error Sum of Squares
SST <- sum(data1$resid_y_only^2)        # Total Sum of Squares
R2_manual <- 1 - SSE / SST
SSE
## [1] 13191.41
SST
## [1] 35325.57
R2_manual
## [1] 0.6265761

What does 𝑅2 tell us in the context of regression? Try to use summary(fit)

Two populations

We now simulate two groups (A & B) with different intercepts and error structures.

# Coefficients for Population A
alpha_A <- 2.348
beta1_A <- 20.750
rse_A   <- 8.019

# Coefficients for Population B
alpha_B <- 6.5
beta1_B <- 20.750
rse_B   <- 5.0

# Total sample size
n <- 200

# Soil moisture range
min_val <- min(test1$soil.moisture.content)
max_val <- max(test1$soil.moisture.content)

# Assign half the cases to each population
data <- data.frame(
  soil.moisture = runif(n, min = min_val, max = max_val),
  population = rep(c("A", "B"), each = n / 2)
)
# Generate residuals and growth rate per population
data2 <- data %>%
  rowwise() %>%                     # evaluate each row individually
  mutate(
    e = rnorm(1, mean = 0,
              sd = if_else(population == "A", rse_A, rse_B)),
    growth.rate = if_else(
      population == "A",
      alpha_A + beta1_A * soil.moisture + e,
      alpha_B + beta1_B * soil.moisture + e
    )
  ) %>%
  ungroup()
head(data2)
## # A tibble: 6 × 4
##   soil.moisture population      e growth.rate
##           <dbl> <chr>       <dbl>       <dbl>
## 1         1.75  A            2.12        40.7
## 2         1.58  A           -5.81        29.3
## 3         1.37  A           -7.41        23.3
## 4         0.514 A           10.0         23.0
## 5         1.51  A           -5.17        28.5
## 6         1.31  A          -14.1         15.3
#table(data2$population)

Y Residual as we take both populations as one

fit2 <- lm(growth.rate ~ soil.moisture, data = data2)

# Residuos del modelo (SSE = sum e^2)
data2$resid_model <- resid(fit2)

# "Residuos de solo Y": centrar Y respecto a su media (equivalen a resid(lm(y ~ 1)))
ybar <- mean(data2$growth.rate)
data2$resid_y_only <- data2$growth.rate - ybar  # esto es lo que pides

# Cálculo manual de R^2
SSE <- sum(data2$resid_model^2)         # Error Sum of Squares
SSE
## [1] 8429.782

Total manually R2 calculation

SSE <- sum(data2$resid_model^2)   # Error Sum of Squares
SST <- sum(data2$resid_y_only^2)
SSE
## [1] 8429.782
SST
## [1] 28946.17
R2_manual <- 1 - SSE / SST
R2_manual
## [1] 0.7087773

Visualization:

data2$fitted <- predict(fit2, newdata = data2)

ggplot(data2, aes(x = soil.moisture, y = growth.rate)) +
  geom_point(alpha = 0.7, aes(color = population)) +   # color by population
  geom_smooth(method = lm, se = FALSE) +               # one regression line (for all data)
  geom_segment(aes(xend = soil.moisture, yend = fitted), alpha = 0.4) +
  labs(
    title = "Points and distances (residuals) to the regression line",
    x = "Soil moisture",
    y = "Growth rate"
  ) +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

How do the regression lines differ between populations A and B?

Comparing models

Go directly to summary and now we focus on the populations. First model without population:

fit2 <- lm(growth.rate ~ soil.moisture, data = data2)
summary(fit2)$r.squared
## [1] 0.7087773

Model with population (different intercepts)

fit3 <- lm(growth.rate ~ soil.moisture + population, data = data2)
summary(fit3)
## 
## Call:
## lm(formula = growth.rate ~ soil.moisture + population, data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.1463  -3.5222   0.0876   3.6215  17.4463 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.4672     1.1034   3.142  0.00193 ** 
## soil.moisture  19.6458     0.8761  22.425  < 2e-16 ***
## populationB     3.8099     0.8882   4.289 2.81e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.256 on 197 degrees of freedom
## Multiple R-squared:  0.7337, Adjusted R-squared:  0.7309 
## F-statistic: 271.3 on 2 and 197 DF,  p-value: < 2.2e-16
R2_with_pop <- summary(fit3)$r.squared
R2_with_pop
## [1] 0.7336524

Now with interaction

Model with population (different intercepts)

fit4 <- lm(growth.rate ~ soil.moisture*population, data = data2)
summary(fit4)
## 
## Call:
## lm(formula = growth.rate ~ soil.moisture * population, data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.1534  -3.5329   0.0889   3.6294  17.4316 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                3.49393    1.47303   2.372   0.0187 *  
## soil.moisture             19.62000    1.28473  15.272   <2e-16 ***
## populationB                3.75766    2.09866   1.791   0.0749 .  
## soil.moisture:populationB  0.04838    1.76036   0.027   0.9781    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.272 on 196 degrees of freedom
## Multiple R-squared:  0.7337, Adjusted R-squared:  0.7296 
## F-statistic:   180 on 3 and 196 DF,  p-value: < 2.2e-16
R2_with_pop <- summary(fit4)$r.squared
R2_with_pop
## [1] 0.7336534

Model with interaction (different intercepts + slopes) How does 𝑅2 change as we include population and then the interaction term? What is the Interncept in our case?

Discussion

Adding population helps explain more variance, because it accounts for group-level differences.

Adding soil.moisture * population allows each population to have its own slope as well.

In your own words: what is the intercept in a linear regression model? It is a linear regression or an ANOVA? (Hint: it represents the expected value of Y when X = 0.)