This material corresponds to Week 4 of the labs and demos for Bio 4259F (Undergraduate) and Bio 9915A (Graduate) at Western University.
Note: it’s a good idea to load libraries in a separate code chunk, so you don’t have to mess with the setup chunk again.
library(ggplot2)
library(dplyr)
library(ggfortify)
library(arm)
Make sure to get data in a new code chunk, otherwise the pathname will not work.
test1 <- read.csv("data/plant.growth.rate.csv",header=T)
glimpse(test1)
## Rows: 50
## Columns: 2
## $ soil.moisture.content <dbl> 0.4696876, 0.5413106, 1.6979915, 0.8255799, 0.85…
## $ plant.growth.rate <dbl> 21.31695, 27.03072, 38.98937, 30.19529, 37.06547…
head(test1)
## soil.moisture.content plant.growth.rate
## 1 0.4696876 21.31695
## 2 0.5413106 27.03072
## 3 1.6979915 38.98937
## 4 0.8255799 30.19529
## 5 0.8568173 37.06547
## 6 1.6082405 43.24982
Simulate data for a simple linear regression. This is based on the example in Beckerman et al. (2017), which models plant growth rate as a function of soil moisture content.
# Set coefficients (see page 116)
alpha = 2.348
beta1 = 20.750
# set sample size
n = 200
# Set residual standard error (see page 116)
# this is the positive square root of the MSE
rse = 8.019
# get max and min values for soil.moisture
min=min(test1$soil.moisture.content)
max=max(test1$soil.moisture.content)
# randomize values of soil moisture across known range
data <- data.frame(soil.moisture=runif(n,min=min,max=max))
# generate response values
data1 <- data %>%
mutate(e = rnorm(n(),mean=0,sd=rse),
growth.rate =alpha+beta1*soil.moisture+e)
# used MSE as an estimate of error variance
# view data
#head(data1)
Visualize:
# visualize
ggplot(data=data1,aes(x=soil.moisture,y=growth.rate))+
geom_point()+
geom_smooth(method=lm)
## `geom_smooth()` using formula = 'y ~ x'
Pretty the same? What does the slope represent in this regression
line?
resid_model: residuals from regression line. resid_y_only: deviation from the mean (as if only intercept).
fit <- lm(growth.rate ~ soil.moisture, data = data1)
# Residuos del modelo (SSE = sum e^2)
data1$resid_model <- resid(fit)
# "Residuos de solo Y": centrar Y respecto a su media (equivalen a resid(lm(y ~ 1)))
ybar <- mean(data1$growth.rate)
data1$resid_y_only <- data1$growth.rate - ybar # esto es lo que pides
# Cálculo manual de R^2
SSE <- sum(data1$resid_model^2) # Error Sum of Squares
SSE
## [1] 13191.41
There is first visually deviation from the mean Y
y_mean <- mean(data1$growth.rate)
y_median <- median(data1$growth.rate)
ggplot(data1, aes(x = "", y = growth.rate)) +
# Boxplot principal
geom_boxplot(width = 0.25, outlier.alpha = 0.5, fill = "grey90") +
# Puntos individuales
geom_jitter(width = 0.05, alpha = 0.5, color = "steelblue") +
# Segmentos desde cada punto hacia la media
geom_segment(aes(x = 0.9, xend = 1.1,
y = growth.rate, yend = y_median),
alpha = 0.3, color = "darkgreen") +
geom_hline(yintercept = y_median, linetype = "dotted", color = "darkgreen") +
geom_hline(yintercept = y_median, linetype = "dotted", color = "black") +
# Anotaciones
annotate("text", x = 1.2, y = y_mean,
label = sprintf("mean = %.2f", y_mean),
hjust = 0, vjust = -0.3, color = "red") +
annotate("text", x = 1.2, y = y_median,
label = sprintf("median = %.2f", y_median),
hjust = 0, vjust = 1.3) +
labs(title = "Boxplot de Y con residuos hacia la media",
y = "Growth rate", x = NULL) +
theme_classic() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())
SST <- sum(data1$resid_y_only^2) # Total Sum of Squares
SST
## [1] 35325.57
The total “Error” or Y residuals
There is visually Error or residuals from regression line
data1$fitted <- predict(fit, newdata = data1)
ggplot(data1, aes(x = soil.moisture, y = growth.rate)) +
geom_point(alpha = 0.7) +
geom_smooth(method = lm, se = FALSE) +
# segmentos desde cada punto hasta su valor ajustado (misma x)
geom_segment(aes(xend = soil.moisture, yend = fitted),
alpha = 0.4) +
labs(title = "Puntos y distancias (residuos) a la recta de regresion",
x = "Soil moisture", y = "Growth rate") +
theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
SSE <- sum(data1$resid_model^2) # Error Sum of Squares
SST <- sum(data1$resid_y_only^2) # Total Sum of Squares
R2_manual <- 1 - SSE / SST
SSE
## [1] 13191.41
SST
## [1] 35325.57
R2_manual
## [1] 0.6265761
What does 𝑅2 tell us in the context of regression? Try to use summary(fit)
We now simulate two groups (A & B) with different intercepts and error structures.
# Coefficients for Population A
alpha_A <- 2.348
beta1_A <- 20.750
rse_A <- 8.019
# Coefficients for Population B
alpha_B <- 6.5
beta1_B <- 20.750
rse_B <- 5.0
# Total sample size
n <- 200
# Soil moisture range
min_val <- min(test1$soil.moisture.content)
max_val <- max(test1$soil.moisture.content)
# Assign half the cases to each population
data <- data.frame(
soil.moisture = runif(n, min = min_val, max = max_val),
population = rep(c("A", "B"), each = n / 2)
)
# Generate residuals and growth rate per population
data2 <- data %>%
rowwise() %>% # evaluate each row individually
mutate(
e = rnorm(1, mean = 0,
sd = if_else(population == "A", rse_A, rse_B)),
growth.rate = if_else(
population == "A",
alpha_A + beta1_A * soil.moisture + e,
alpha_B + beta1_B * soil.moisture + e
)
) %>%
ungroup()
head(data2)
## # A tibble: 6 × 4
## soil.moisture population e growth.rate
## <dbl> <chr> <dbl> <dbl>
## 1 1.75 A 2.12 40.7
## 2 1.58 A -5.81 29.3
## 3 1.37 A -7.41 23.3
## 4 0.514 A 10.0 23.0
## 5 1.51 A -5.17 28.5
## 6 1.31 A -14.1 15.3
#table(data2$population)
Y Residual as we take both populations as one
fit2 <- lm(growth.rate ~ soil.moisture, data = data2)
# Residuos del modelo (SSE = sum e^2)
data2$resid_model <- resid(fit2)
# "Residuos de solo Y": centrar Y respecto a su media (equivalen a resid(lm(y ~ 1)))
ybar <- mean(data2$growth.rate)
data2$resid_y_only <- data2$growth.rate - ybar # esto es lo que pides
# Cálculo manual de R^2
SSE <- sum(data2$resid_model^2) # Error Sum of Squares
SSE
## [1] 8429.782
Total manually R2 calculation
SSE <- sum(data2$resid_model^2) # Error Sum of Squares
SST <- sum(data2$resid_y_only^2)
SSE
## [1] 8429.782
SST
## [1] 28946.17
R2_manual <- 1 - SSE / SST
R2_manual
## [1] 0.7087773
Visualization:
data2$fitted <- predict(fit2, newdata = data2)
ggplot(data2, aes(x = soil.moisture, y = growth.rate)) +
geom_point(alpha = 0.7, aes(color = population)) + # color by population
geom_smooth(method = lm, se = FALSE) + # one regression line (for all data)
geom_segment(aes(xend = soil.moisture, yend = fitted), alpha = 0.4) +
labs(
title = "Points and distances (residuals) to the regression line",
x = "Soil moisture",
y = "Growth rate"
) +
theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
How do the regression lines differ between populations A and B?
Go directly to summary and now we focus on the populations. First model without population:
fit2 <- lm(growth.rate ~ soil.moisture, data = data2)
summary(fit2)$r.squared
## [1] 0.7087773
Model with population (different intercepts)
fit3 <- lm(growth.rate ~ soil.moisture + population, data = data2)
summary(fit3)
##
## Call:
## lm(formula = growth.rate ~ soil.moisture + population, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1463 -3.5222 0.0876 3.6215 17.4463
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.4672 1.1034 3.142 0.00193 **
## soil.moisture 19.6458 0.8761 22.425 < 2e-16 ***
## populationB 3.8099 0.8882 4.289 2.81e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.256 on 197 degrees of freedom
## Multiple R-squared: 0.7337, Adjusted R-squared: 0.7309
## F-statistic: 271.3 on 2 and 197 DF, p-value: < 2.2e-16
R2_with_pop <- summary(fit3)$r.squared
R2_with_pop
## [1] 0.7336524
Now with interaction
Model with population (different intercepts)
fit4 <- lm(growth.rate ~ soil.moisture*population, data = data2)
summary(fit4)
##
## Call:
## lm(formula = growth.rate ~ soil.moisture * population, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1534 -3.5329 0.0889 3.6294 17.4316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.49393 1.47303 2.372 0.0187 *
## soil.moisture 19.62000 1.28473 15.272 <2e-16 ***
## populationB 3.75766 2.09866 1.791 0.0749 .
## soil.moisture:populationB 0.04838 1.76036 0.027 0.9781
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.272 on 196 degrees of freedom
## Multiple R-squared: 0.7337, Adjusted R-squared: 0.7296
## F-statistic: 180 on 3 and 196 DF, p-value: < 2.2e-16
R2_with_pop <- summary(fit4)$r.squared
R2_with_pop
## [1] 0.7336534
Model with interaction (different intercepts + slopes) How does 𝑅2 change as we include population and then the interaction term? What is the Interncept in our case?
Adding population helps explain more variance, because it accounts for group-level differences.
Adding soil.moisture * population allows each population to have its own slope as well.
In your own words: what is the intercept in a linear regression model? It is a linear regression or an ANOVA? (Hint: it represents the expected value of Y when X = 0.)