Simple Linear Regression
Fitting a linear regression
- Linear regression models always fit a straight line to the data
- \(y = \text{intercept} + \text{slope} \times x\)
- Single categorical explanatory variable
- The linear regression coefficients are the means of each
category (see the sketch below)
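As a quick check of that claim, a minimal sketch (using a made-up data frame, not one of the course datasets) showing that the no-intercept coefficients equal the category means:
library(dplyr)
# Made-up example: two categories with known means
toy <- tibble(
  group = rep(c("a", "b"), each = 3),
  y = c(1, 2, 3, 10, 20, 30)
)
toy %>%
  group_by(group) %>%
  summarise(mean_y = mean(y)) # group a: 2, group b: 20
coefficients(lm(y ~ group + 0, data = toy)) # groupa = 2, groupb = 20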
Numeric explanatory variable
# Simple Linear Regression
# Numeric explanatory variable
# Descriptive statistics
swedish_motor_insurance %>%
summarize_all(mean)
swedish_motor_insurance %>%
summarise(correlation = cor(n_claims, total_payment_sek))
# Visualizing pairs of variables
ggplot(swedish_motor_insurance, aes(n_claims, total_payment_sek)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

# Fitting a linear regression
lm(total_payment_sek ~ n_claims, data = swedish_motor_insurance)
##
## Call:
## lm(formula = total_payment_sek ~ n_claims, data = swedish_motor_insurance)
##
## Coefficients:
## (Intercept) n_claims
## 19.994 3.414
# total_payment_sek = 19.994 + 3.414 * n_claims
# Visualizing pairs of variables
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

# Fitting a linear regression
lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_real_estate)
##
## Coefficients:
## (Intercept) n_convenience
## 8.2242 0.7981
Categorical explanatory variables
# Categorical explanatory variables
ggplot(fish, aes(mass_g)) +
geom_histogram(bins = 9) +
facet_wrap(vars(species))

# Descriptive statistics
fish %>%
group_by(species) %>%
summarise(mean_mass_g = mean(mass_g))
lm(mass_g ~ species + 0, data = fish) # "+ 0" removes the intercept, so each coefficient is the mean mass for that species
##
## Call:
## lm(formula = mass_g ~ species + 0, data = fish)
##
## Coefficients:
## speciesBream speciesPerch speciesPike speciesRoach
## 617.8 382.2 718.7 152.0
ggplot(taiwan_real_estate, aes(price_twd_msq)) +
geom_histogram(bins = 10) +
facet_wrap(vars(house_age_years))

taiwan_real_estate %>%
group_by(house_age_years) %>%
summarise(mean_by_group = mean(price_twd_msq))
lm(price_twd_msq ~ house_age_years, data = taiwan_real_estate)
##
## Call:
## lm(formula = price_twd_msq ~ house_age_years, data = taiwan_real_estate)
##
## Coefficients:
## (Intercept) house_age_years.L house_age_years.Q
## 11.3025 -0.8798 1.7462
lm(price_twd_msq ~ house_age_years + 0, data = taiwan_real_estate)
##
## Call:
## lm(formula = price_twd_msq ~ house_age_years + 0, data = taiwan_real_estate)
##
## Coefficients:
## house_age_years0 to 15 house_age_years15 to 30 house_age_years30 to 45
## 12.637 9.877 11.393
Predictions
- Extrapolating - Making predictions outside the
range of observed data
# Predictions
bream <- fish %>%
filter(species == "Bream")
g1 <- ggplot(bream, aes(length_cm, mass_g)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
mdl_mass_vs_length <- lm(mass_g ~ length_cm, data = bream)
explanatory_data <- tibble(length_cm = 20:40)
prediction_data <- explanatory_data %>%
mutate(
mass_g = predict(mdl_mass_vs_length, explanatory_data)
)
# Extrapolating
explanatory_little_bream <- tibble(length_cm = 10)
prediction_little_data <- explanatory_little_bream %>%
mutate(
mass_g = predict(mdl_mass_vs_length, explanatory_little_bream)
)
g2 <- ggplot(bream, aes(length_cm, mass_g)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_point(
data = prediction_data,
color = "green"
) +
geom_point(
data = prediction_little_data,
color = "red"
)
grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

mdl_price_vs_conv <- lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)
# Predicting house prices
explanatory_data <- tibble(n_convenience = 0:10)
prediction_data <- explanatory_data %>%
mutate(
price_twd_msq = predict(mdl_price_vs_conv, explanatory_data)
)
# Visualizing predictions
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_point(data = prediction_data, color = "yellow")
## `geom_smooth()` using formula = 'y ~ x'

Model objects
coefficients() - Returns the model coefficients
fitted() - Predictions on the original dataset
residuals() - Actual response values minus predicted response values
summary() - Shows a more extended printout of the details of the model
Call - The code used to create the model
Residuals - Summary statistics of the residuals
- If the model is a good fit, the residuals should
- Follow a normal distribution
- Have a median close to zero
- Have 1Q and 3Q with about the same absolute value
Coefficients - Details of the coefficients
- The first column contains the values returned by the
coefficients() function
- The last column contains the p-values, which indicate statistical
significance
- Model metrics - Some metrics on the performance of the model
broom package - Contains functions that decompose
models into three data frames
tidy() - Returns the coefficient details in a data frame
augment() - Returns observation-level results
glance() - Returns model-level results
# Model objects
mdl_mass_vs_length <- lm(mass_g ~ length_cm, data = bream)
# coefficients()
coefficients(mdl_mass_vs_length)
## (Intercept) length_cm
## -1035.34757 54.54998
# fitted()
head(fitted(mdl_mass_vs_length))
## 1 2 3 4 5 6
## 230.2120 273.8520 268.3970 399.3169 410.2269 426.5919
# explanatory_data <- bream %>% select(length_cm)
# predict(mdl_mass_vs_length, explanatory_data)
# residuals()
head(residuals(mdl_mass_vs_length))
## 1 2 3 4 5 6
## 11.78801 16.14802 71.60302 -36.31693 19.77307 23.40808
# bream$mass_g - fitted(mdl_mass_vs_length)
# summary()
summary(mdl_mass_vs_length)
##
## Call:
## lm(formula = mass_g ~ length_cm, data = bream)
##
## Residuals:
## Min 1Q Median 3Q Max
## -233.877 -35.377 -4.797 31.578 207.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1035.348 107.973 -9.589 4.58e-11 ***
## length_cm 54.550 3.539 15.415 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74.15 on 33 degrees of freedom
## Multiple R-squared: 0.8781, Adjusted R-squared: 0.8744
## F-statistic: 237.6 on 1 and 33 DF, p-value: < 2.2e-16
# broom
tidy(mdl_mass_vs_length)
augment(mdl_mass_vs_length)
glance(mdl_mass_vs_length)
mdl_price_vs_conv <- lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)
coefficients(mdl_price_vs_conv)
## (Intercept) n_convenience
## 8.2242375 0.7980797
head(fitted(mdl_price_vs_conv))
## 1 2 3 4 5 6
## 16.20503 15.40695 12.21464 12.21464 12.21464 10.61848
head(residuals(mdl_price_vs_conv))
## 1 2 3 4 5 6
## -4.7375611 -2.6384224 2.0970130 4.3663019 0.8262112 -0.9059199
summary(mdl_price_vs_conv)
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_real_estate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.7132 -2.2213 -0.5409 1.8105 26.5299
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.22424 0.28500 28.86 <2e-16 ***
## n_convenience 0.79808 0.05653 14.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.384 on 412 degrees of freedom
## Multiple R-squared: 0.326, Adjusted R-squared: 0.3244
## F-statistic: 199.3 on 1 and 412 DF, p-value: < 2.2e-16
# broom package - contains functions that decompose models into three data frames
# tidy - the coefficient-level elements (the coefficients themselves, as well as p-values for each coefficient)
tidy(mdl_price_vs_conv)
# augment - the observation-level elements (like fitted values and residuals)
augment(mdl_price_vs_conv)
# glance - the model-level elements (mostly performance metrics)
glance(mdl_price_vs_conv)
# Manually predicting house prices
coeffs <- coefficients(mdl_price_vs_conv)
intercept <- coeffs[1]
slope <- coeffs[2]
n_convenience <- c(0:10)
explanatory_data %>%
mutate(
price_twd_msq = slope * n_convenience + intercept
)
# Compare to the results from predict()
predict(mdl_price_vs_conv, explanatory_data)
## 1 2 3 4 5 6 7 8
## 8.224237 9.022317 9.820397 10.618477 11.416556 12.214636 13.012716 13.810795
## 9 10 11
## 14.608875 15.406955 16.205035
Regression to the mean
- The concept
- Response value = fitted value + residual (verified in the sketch after this list)
- “The stuff you explained” + “the stuff you couldn’t explain”
- Residuals exist due to problems in the model and fundamental
randomness
- Extreme cases are often due to randomness
- Regression to the mean means extreme cases don’t
persist over time
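A quick verification of the decomposition, using the bream model fitted above:
# Response values equal fitted values plus residuals, by construction of least squares
all.equal(
  bream$mass_g,
  unname(fitted(mdl_mass_vs_length) + residuals(mdl_mass_vs_length))
) # TRUE (up to floating-point tolerance)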
# Regression to the mean
father_son <- father.son %>%
mutate(
father_height_cm = fheight * 2.54,
son_height_cm = sheight * 2.54
)
ggplot(father_son, aes(father_height_cm, son_height_cm)) +
geom_point() +
geom_abline(color = "green", linewidth = 1) + # defaults to intercept = 0, slope = 1: the line where sons' and fathers' heights are equal
coord_fixed() + # defaults to coord_fixed(ratio = 1): one unit on x equals one unit on y
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

mdl_son_vs_father <- lm(son_height_cm ~ father_height_cm, data = father_son)
really_tall_father <- tibble(father_height_cm = 190)
predict(mdl_son_vs_father, really_tall_father)
## 1
## 183.7497
really_short_father <- tibble(father_height_cm = 150)
predict(mdl_son_vs_father, really_short_father)
## 1
## 163.1859
Transforming variables
- Sometimes, the relationship between the explanatory variable and the
response variable may not be a straight line. To fit a linear regression
model, you need to transform the explanatory variable or the response
variable, or both of them
- When the response variable is transformed, you need to back-transform
the predictions (back transformation)
# Transforming variables
perch <- fish %>% filter(species == "Perch")
g1 <- ggplot(perch, aes(length_cm, mass_g)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
g2 <- ggplot(perch, aes(length_cm ^ 3, mass_g)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# With transformation
mdl_perch <- lm(mass_g ~ I(length_cm ^ 3), data = perch) # I() means "as is": needed so ^ is treated as arithmetic rather than formula syntax
explanatory_data <- tibble(length_cm = seq(10, 40, 5))
prediction_data <- explanatory_data %>%
mutate(mass_g = predict(mdl_perch, explanatory_data))
# The linear model has non-linear predictions, after the transformation is undone
g1 <- ggplot(perch, aes(length_cm, mass_g)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_point(data = prediction_data, color = "red")
g2 <- ggplot(perch, aes(length_cm ^ 3, mass_g)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_point(data = prediction_data, color = "red")
grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

g1 <- ggplot(ad_conversion, aes(spent_usd, n_impressions)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
g2 <- ggplot(ad_conversion, aes(sqrt(spent_usd), sqrt(n_impressions))) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

mdl_ad <- lm(sqrt(n_impressions) ~ sqrt(spent_usd), data = ad_conversion)
explanatory_data <- tibble(spent_usd = seq(0, 600, 100))
prediction_data <- explanatory_data %>%
mutate(
sqrt_n_impressions = predict(mdl_ad, explanatory_data),
n_impressions = sqrt_n_impressions ^ 2
)
# Square roots are a common transformation when your data has a right-skewed distribution
g1 <- ggplot(ad_conversion, aes(spent_usd, n_impressions)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_point(data = prediction_data, color = "red")
g2 <- ggplot(ad_conversion, aes(sqrt(spent_usd), sqrt(n_impressions))) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_point(data = prediction_data, color = "red")
grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Transforming the explanatory variable
mdl_price_vs_dist <- lm(price_twd_msq ~ sqrt(dist_to_mrt_m), data = taiwan_real_estate)
explanatory_data <- tibble(dist_to_mrt_m = seq(0, 80, 10) ^ 2)
prediction_data <- explanatory_data %>%
mutate(price_twd_msq = predict(mdl_price_vs_dist, explanatory_data))
g1 <- ggplot(taiwan_real_estate, aes(dist_to_mrt_m, price_twd_msq)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_point(data = prediction_data, color = "green", size = 3)
g2 <- ggplot(taiwan_real_estate, aes(sqrt(dist_to_mrt_m), price_twd_msq)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_point(data = prediction_data, color = "green", size = 3)
grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Transforming the response variable
mdl_click_vs_impression <- lm(I(n_clicks ^ 0.25) ~ I(n_impressions ^ 0.25), data = ad_conversion)
explanatory_data <- tibble(n_impressions = seq(0, 3e6, 5e5))
prediction_data <- explanatory_data %>%
mutate(
n_clicks_025 = predict(mdl_click_vs_impression, explanatory_data),
n_clicks = n_clicks_025 ^ 4 # Backtransformation
)
g1 <- ggplot(ad_conversion, aes(n_impressions, n_clicks)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_point(data = prediction_data, color = "green")
g2 <- ggplot(ad_conversion, aes(n_impressions^.25, n_clicks^.25)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_point(data = prediction_data, color = "green")
grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Assessing model fit
- Metrics
- Coefficient of determination (r-squared/R-squared)
- The proportion of the variance in the response variable that is
predictable from the explanatory variable
- r-squared (lowercase r): used for simple linear regression; it equals
the correlation squared
- R-squared (uppercase R): used when there is more than one explanatory
variable
- 1: a perfect fit
- 0: the worst possible fit
- What constitutes a good score depends on your dataset. E.g.
- A score of 0.5 on a psychological experiment may be
exceptionally high because humans are inherently hard to predict
- In other contexts, a score of 0.9 may be considered a poor fit
summary(): Multiple R-squared
glance(): r.squared
- For simple linear regression, the coefficient of determination is
the correlation between the explanatory and response variables,
squared
- Residual standard error (RSE)
- A “typical” difference between a prediction and an observed
response
- It has the same unit as the response variable
- Smaller numbers are better, with zero being a perfect fit to the
data
summary(): Residual standard error
glance(): sigma
- Root-mean-square error (RMSE)
- Quantifies how inaccurate the model predictions are
- Divides by the number of observations rather than the degrees of
freedom, so it is worse than RSE for comparisons between models
- Typically use RSE instead (see the sketch after this list)
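The sections below compute each metric separately; as a compact summary (not from the course), a small helper that collects all three for any lm model, assuming broom and the tidyverse are loaded:
# Collect r-squared, RSE, and RMSE for a fitted lm model in one tibble
model_fit_metrics <- function(mdl) {
  tibble(
    r_squared = glance(mdl)$r.squared,          # coefficient of determination
    rse       = glance(mdl)$sigma,              # residual standard error: divides by the degrees of freedom
    rmse      = sqrt(mean(residuals(mdl) ^ 2))  # RMSE: divides by the number of observations instead
  )
}
model_fit_metrics(mdl_mass_vs_length)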
Coefficient of determination
# Coefficient of determination
mdl_bream <- lm(mass_g ~ length_cm, data = bream)
summary(mdl_bream) # Multiple R-squared
##
## Call:
## lm(formula = mass_g ~ length_cm, data = bream)
##
## Residuals:
## Min 1Q Median 3Q Max
## -233.877 -35.377 -4.797 31.578 207.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1035.348 107.973 -9.589 4.58e-11 ***
## length_cm 54.550 3.539 15.415 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74.15 on 33 degrees of freedom
## Multiple R-squared: 0.8781, Adjusted R-squared: 0.8744
## F-statistic: 237.6 on 1 and 33 DF, p-value: < 2.2e-16
mdl_bream %>% glance() # r.squared
mdl_bream %>%
glance() %>%
pull(r.squared)
## [1] 0.8780627
# Manual calculation
bream %>%
summarise(coeff_determination = cor(length_cm, mass_g) ^ 2)
mdl_click_vs_impression_orig <- lm(n_clicks ~ n_impressions, data = ad_conversion)
mdl_click_vs_impression_trans <- lm(I(n_clicks^0.25) ~ I(n_impressions^0.25), data = ad_conversion)
summary(mdl_click_vs_impression_orig)
##
## Call:
## lm(formula = n_clicks ~ n_impressions, data = ad_conversion)
##
## Residuals:
## Min 1Q Median 3Q Max
## -186.099 -5.392 -1.422 2.070 119.876
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.683e+00 7.888e-01 2.133 0.0331 *
## n_impressions 1.718e-04 1.960e-06 87.654 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.91 on 934 degrees of freedom
## Multiple R-squared: 0.8916, Adjusted R-squared: 0.8915
## F-statistic: 7683 on 1 and 934 DF, p-value: < 2.2e-16
summary(mdl_click_vs_impression_trans)
##
## Call:
## lm(formula = I(n_clicks^0.25) ~ I(n_impressions^0.25), data = ad_conversion)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57061 -0.13229 0.00582 0.14494 0.46888
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0717479 0.0172019 4.171 3.32e-05 ***
## I(n_impressions^0.25) 0.1115330 0.0008844 126.108 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1969 on 934 degrees of freedom
## Multiple R-squared: 0.9445, Adjusted R-squared: 0.9445
## F-statistic: 1.59e+04 on 1 and 934 DF, p-value: < 2.2e-16
# The number of impressions (explanatory) explains 89% of the variability in the number of clicks (response)
mdl_click_vs_impression_orig %>%
glance() %>%
pull(r.squared)
## [1] 0.8916135
mdl_click_vs_impression_trans %>%
glance() %>%
pull(r.squared)
## [1] 0.9445273
# Manual calculation
ad_conversion %>%
summarise(
coeff_determination_orig = cor(n_impressions, n_clicks) ^ 2,
coeff_determination_trans = cor(n_impressions^0.25, n_clicks^0.25) ^ 2
)
RSE
# Residual standard error (RSE)
summary(mdl_bream) # Residual standard error
##
## Call:
## lm(formula = mass_g ~ length_cm, data = bream)
##
## Residuals:
## Min 1Q Median 3Q Max
## -233.877 -35.377 -4.797 31.578 207.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1035.348 107.973 -9.589 4.58e-11 ***
## length_cm 54.550 3.539 15.415 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74.15 on 33 degrees of freedom
## Multiple R-squared: 0.8781, Adjusted R-squared: 0.8744
## F-statistic: 237.6 on 1 and 33 DF, p-value: < 2.2e-16
mdl_bream %>% glance() # sigma
mdl_bream %>%
glance() %>%
pull(sigma)
## [1] 74.15224
# Manual calculation
bream %>%
mutate(residuals_sq = residuals(mdl_bream) ^ 2) %>%
summarise(
resid_sum_of_sq = sum(residuals_sq),
deg_freedom = n() - 2, # Degrees of freedom - the number of observations minus the number of model coefficients
rse = sqrt(resid_sum_of_sq / deg_freedom)
) # Indicating the difference between predicted bream masses and observed bream masses is typically about 74g
summary(mdl_click_vs_impression_orig)
##
## Call:
## lm(formula = n_clicks ~ n_impressions, data = ad_conversion)
##
## Residuals:
## Min 1Q Median 3Q Max
## -186.099 -5.392 -1.422 2.070 119.876
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.683e+00 7.888e-01 2.133 0.0331 *
## n_impressions 1.718e-04 1.960e-06 87.654 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.91 on 934 degrees of freedom
## Multiple R-squared: 0.8916, Adjusted R-squared: 0.8915
## F-statistic: 7683 on 1 and 934 DF, p-value: < 2.2e-16
summary(mdl_click_vs_impression_trans)
##
## Call:
## lm(formula = I(n_clicks^0.25) ~ I(n_impressions^0.25), data = ad_conversion)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57061 -0.13229 0.00582 0.14494 0.46888
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0717479 0.0172019 4.171 3.32e-05 ***
## I(n_impressions^0.25) 0.1115330 0.0008844 126.108 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1969 on 934 degrees of freedom
## Multiple R-squared: 0.9445, Adjusted R-squared: 0.9445
## F-statistic: 1.59e+04 on 1 and 934 DF, p-value: < 2.2e-16
mdl_click_vs_impression_orig %>%
glance() %>%
pull(sigma) # The typical difference between the observed and predicted number of clicks is about 20
## [1] 19.90584
mdl_click_vs_impression_trans %>%
glance() %>%
pull(sigma)
## [1] 0.1969064
# Manual calculation
ad_conversion %>%
mutate(residuals_sq = residuals(mdl_click_vs_impression_orig)^2) %>%
summarise(
resid_sum_of_sq = sum(residuals_sq),
deg_freedom = n() - 2,
rse = sqrt(resid_sum_of_sq / deg_freedom)
)
ad_conversion %>%
mutate(residuals_sq = residuals(mdl_click_vs_impression_trans) ^ 2) %>%
summarise(
resid_sum_of_sq = sum(residuals_sq),
deg_freedom = n() - 2,
rse = sqrt(resid_sum_of_sq / deg_freedom)
)
RMSE
# Root-mean-square error (RMSE)
# Calculation
bream %>%
mutate(residuals_sq = residuals(mdl_bream) ^ 2) %>%
summarise(
resid_sum_of_sq = sum(residuals_sq),
n_obs = n(),
rmse = sqrt(resid_sum_of_sq / n_obs)
)
ad_conversion %>%
mutate(residuals_sq = residuals(mdl_click_vs_impression_orig) ^ 2) %>%
summarise(
resid_sum_of_sq = sum(residuals_sq),
n_obs = n(),
rmse = sqrt(resid_sum_of_sq / n_obs)
)
ad_conversion %>%
mutate(residuals_sq = residuals(mdl_click_vs_impression_trans) ^ 2) %>%
summarise(
resid_sum_of_sq = sum(residuals_sq),
n_obs = n(),
rmse = sqrt(resid_sum_of_sq / n_obs)
)
Visualizing model fit
- If the linear regression model is a good fit
- The residuals are normally distributed
- The mean of the residuals is zero
- Residuals vs. fitted values
- LOESS trend line - A smooth curve following the data
- If the residuals meet the assumption that they are normally distributed
with mean zero, the trend line should closely follow the y = 0
line
- Shows whether or not the residuals go positive or negative as the
fitted values change
- Q-Q plot
- Shows whether or not the residuals follow a normal distribution
- x-axis: The points are quantiles from the normal distribution
- y-axis: the standardized residuals, which are the residuals divided
by their standard deviation
- If the points track along the straight line, they are normally
distributed
- Scale-location
- The square root of the standardized residuals versus the fitted
values
- Shows whether the size of the residuals gets bigger or smaller as the fitted values increase
autoplot() (see the note below)
- Values for the which argument
- 1: residuals vs. fitted values
- 2: Q-Q plot
- 3: scale-location
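The autoplot() method for lm objects used below comes from the ggfortify package, so these diagnostic plots assume it is loaded alongside ggplot2:
library(ggplot2)
library(ggfortify) # supplies autoplot() methods for lm/glm model objects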
# Visualizing model fit
mdl_click_vs_impression_orig <- lm(formula = n_clicks ~ n_impressions, data = ad_conversion)
mdl_click_vs_impression_trans <- lm(I(n_clicks^0.25) ~ I(n_impressions^0.25), data = ad_conversion)
autoplot(
mdl_click_vs_impression_orig,
which = 1:3,
nrow = 1,
ncol = 3)

# The residuals track the "y equals 0" line more closely in the transformed model compared to the original model
# The residuals track the "normality" line more closely in the transformed model compared to the original model
# The size of the standardized residuals is more consistent in the transformed model compared to the original model
# Indicating that the transformed model is a better fit for the data
autoplot(
mdl_click_vs_impression_trans,
which = 1:3,
nrow = 1,
ncol = 3)

Outliers
- An outlier is an unusual data point
- Extreme explanatory variable values
- Response values far away from the regression line
# Outliers
roach <- fish %>% filter(species == "Roach")
roach %>%
mutate(
has_extreme_length = length_cm < 15 | length_cm > 26,
has_extreme_mass = mass_g < 1
) %>%
ggplot(aes(length_cm, mass_g)) +
geom_point(aes(color = has_extreme_length, shape = has_extreme_mass), size = 4) +
geom_smooth(method = "lm", se = FALSE) +
theme(text = element_text(size=14))
## `geom_smooth()` using formula = 'y ~ x'

Leverage
- Leverage - A measure of how extreme the
explanatory variable values are
# Leverage
mdl_roach <- lm(mass_g ~ length_cm, data = roach)
hatvalues(mdl_roach)
## 1 2 3 4 5 6 7
## 0.31372898 0.12553776 0.09348669 0.07628287 0.06838661 0.06189726 0.06049475
## 8 9 10 11 12 13 14
## 0.05681481 0.05026390 0.05009244 0.05009244 0.05055408 0.05091020 0.05807223
## 15 16 17 18 19 20
## 0.05807223 0.05930767 0.08839105 0.09948802 0.13338565 0.39474037
augment(mdl_roach)
# Highly leveraged roaches
mdl_roach %>%
augment() %>%
dplyr::select(mass_g, length_cm, leverage = .hat) %>%
arrange(desc(leverage)) %>%
head()
Influence
- Influence - A type of “leave one out” metric.
Measures how much the model would change if you reran it without that
data point
- The influence (Cook’s distance) of
each observation is based on the size of the residuals
and the leverage
- A bigger number denotes more influence for the observation
# Influence
cooks.distance(mdl_roach)
## 1 2 3 4 5 6
## 1.074015e+00 1.042888e-02 1.972286e-05 1.979659e-03 6.609838e-03 3.118518e-01
## 7 8 9 10 11 12
## 8.525058e-04 1.994809e-04 2.565139e-04 2.563127e-04 2.445136e-03 7.949762e-03
## 13 14 15 16 17 18
## 1.372065e-04 4.817271e-03 1.151628e-02 4.519344e-03 6.120857e-02 1.500641e-01
## 19 20
## 2.061450e-02 3.657822e-01
augment(mdl_roach)
# Most influential roaches
mdl_roach %>%
augment() %>%
dplyr::select(mass_g, length_cm, cooks_dist = .cooksd, residual = .resid, leverage = .hat) %>%
arrange(desc(cooks_dist)) %>%
head()
# Removing the most influential roach
roach_not_short <- roach %>%
filter(length_cm != 12.9)
ggplot(roach, aes(length_cm, mass_g)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_smooth(method = "lm", se = FALSE, data = roach_not_short, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# autoplot()
# Less helpful for diagnosis than the previous three, other than seeing the labels of the most influential observations
autoplot(
mdl_roach,
which = 4:6,
nrow = 3,
ncol = 1
) +
theme(text=element_text(size=12))

Simple Logistic Regression
Fitting a logistic regression
- Logistic regression
- Generalized linear model (GLM)
- Linear regression - Makes the assumption that the residuals follow a
Gaussian (normal) distribution
- Logistic regression - Assumes that residuals follow a
binomial distribution
- Used when the response variable is logical (binary)
- The responses follow a logistic (S-shaped) curve
# Logistic Regression
# Using a linear model
mdl_churn_vs_recency_lm <- lm(has_churned ~ time_since_last_purchase, data = churn)
coeffs <- coefficients(mdl_churn_vs_recency_lm)
intercept <- coeffs[1]
slope <- coeffs[2]
# Predictions are probabilities of churn, not amounts of churn
# Problem: The model predicts probabilities greater than one or negative probabilities
g1 <- ggplot(churn, aes(time_since_last_purchase, has_churned)) +
geom_point() +
geom_abline(intercept = intercept, slope = slope) +
xlim(-10, 10) + # Zooming out
ylim(-0.2, 1.2)
# Linear regression using glm()
# Assume residual follows Gaussian distribution: same as above linear regression
glm(has_churned ~ time_since_last_purchase, data = churn, family = gaussian)
##
## Call: glm(formula = has_churned ~ time_since_last_purchase, family = gaussian,
## data = churn)
##
## Coefficients:
## (Intercept) time_since_last_purchase
## 0.49078 0.06378
##
## Degrees of Freedom: 399 Total (i.e. Null); 398 Residual
## Null Deviance: 100
## Residual Deviance: 98.02 AIC: 578.7
# Logistic regression
# Assume residual follows Binomial distribution
mdl_recency_glm <- glm(has_churned ~ time_since_last_purchase, data = churn, family = binomial)
mdl_recency_glm
##
## Call: glm(formula = has_churned ~ time_since_last_purchase, family = binomial,
## data = churn)
##
## Coefficients:
## (Intercept) time_since_last_purchase
## -0.03502 0.26921
##
## Degrees of Freedom: 399 Total (i.e. Null); 398 Residual
## Null Deviance: 554.5
## Residual Deviance: 546.4 AIC: 550.4
# The logistic regression curve never goes below zero or above one
g2 <- ggplot(churn, aes(time_since_last_purchase, has_churned)) +
geom_point() +
geom_abline(intercept = intercept, slope = slope) +
stat_smooth(
method = "glm",
se = FALSE,
method.args = list(family = binomial),
fullrange=TRUE # Span the fit
) +
xlim(-10, 10) +
ylim(-0.2, 1.2)
grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'

# time_since_last_purchase
# Churners typically have a longer time since their last purchase
g1 <- ggplot(churn, aes(time_since_last_purchase)) +
geom_histogram(binwidth = 0.25) +
facet_grid(rows = vars(has_churned))
# time_since_first_purchase
# Churners have a shorter length of relationship
g2 <- ggplot(churn, aes(time_since_first_purchase)) +
geom_histogram(binwidth = 0.25) +
facet_grid(rows = vars(has_churned))
grid.arrange(g1, g2, nrow = 1)

# Visualizing linear and logistic models
ggplot(churn, aes(time_since_first_purchase, has_churned)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
geom_smooth(method = "glm", se = FALSE, color = "green", method.args = list(family = binomial))
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Fit a logistic regression
mdl_churn_vs_relationship <- glm(
has_churned ~ time_since_first_purchase,
data = churn,
family = binomial
)
mdl_churn_vs_relationship
##
## Call: glm(formula = has_churned ~ time_since_first_purchase, family = binomial,
## data = churn)
##
## Coefficients:
## (Intercept) time_since_first_purchase
## -0.01518 -0.35479
##
## Degrees of Freedom: 399 Total (i.e. Null); 398 Residual
## Null Deviance: 554.5
## Residual Deviance: 543.7 AIC: 547.7
Predictions
# Predictions
# The ggplot predictions
plt_churn_vs_recency_base <- ggplot(churn, aes(time_since_last_purchase, has_churned)) +
geom_point() +
geom_smooth(
method = "glm",
se = FALSE,
method.args = list(family = binomial)
)
# Making predictions
mdl_recency <- glm(has_churned ~ time_since_last_purchase, data = churn, family = "binomial")
explanatory_data <- tibble(time_since_last_purchase = seq(-1, 6, 0.25))
prediction_data <- explanatory_data %>%
mutate(
has_churned = predict(mdl_recency, explanatory_data, type = "response"),
# Getting the most likely outcome
most_likely_outcome = round(has_churned) # Not churn if below 0.5, churn if over 0.5
)
plt_churn_vs_recency_base +
geom_point(data = prediction_data, color = "blue") +
# Visualizing most likely outcome
geom_point(aes(y = most_likely_outcome), data = prediction_data, color = "green")
## `geom_smooth()` using formula = 'y ~ x'

Odds ratios
- Odds ratio - The probability that something
happens, divided by the probability that it doesn’t
- \(\text{odds ratio} = \frac{\text{probability}}{1 - \text{probability}}\)
- A probability of 0.25 gives an odds ratio of 0.25 / (1 - 0.25) = 1/3,
i.e. odds of "three to one against"
# Odds ratios
odds_ratio <- data.frame(x = seq(0, 1, .001)) %>%
mutate(y = x/(1-x))
ggplot(odds_ratio, aes(x, y)) +
geom_line(linewidth = 1) +
labs(x = "probability", y = "odds_ratio") +
theme(text=element_text(size=15))

mdl_recency <- glm(has_churned ~ time_since_last_purchase, data = churn, family = "binomial")
explanatory_data <- tibble(time_since_last_purchase = seq(-1, 6, 0.25))
prediction_data <- explanatory_data %>%
mutate(
has_churned = predict(mdl_recency, explanatory_data, type = "response"),
most_likely_response = round(has_churned),
odds_ratio = has_churned / (1 - has_churned) # Calculating odds ratio
)
# Visualizing odds ratio
g1 <- ggplot(prediction_data, aes(time_since_last_purchase, odds_ratio)) +
geom_line() +
geom_hline(yintercept = 1, linetype = "dotted")
# The dotted line where the odds ratio is one indicates where churning is just as likely as not churning
# In the bottom-left, the predictions are below one, so the chance of churning is less than the chance of not churning
# In the top-right, the chance of churning is about five times more than the chance of not churning
g2 <- ggplot(prediction_data, aes(time_since_last_purchase, odds_ratio)) +
geom_line() +
geom_hline(yintercept = 1, linetype = "dotted") +
scale_y_log10()
# Logistic regression odds ratios on a log-scale change linearly with the explanatory variable
# This property of the logarithm of odds ratios means log-odds ratio is another common way of describing logistic regression predictions
grid.arrange(g1, g2, nrow = 1)

# Calculating log odds ratio
prediction_data <- explanatory_data %>%
mutate(
has_churned = predict(mdl_recency, explanatory_data, type = "response"),
most_likely_response = round(has_churned),
odds_ratio = has_churned / (1 - has_churned),
log_odds_ratio = log(odds_ratio),
log_odds_ratio2 = predict(mdl_recency, explanatory_data) # Predict will return the log odds ratio if you don't specify the type argument
)
prediction_data
| Scale | Easy to interpret values? | Easy to interpret changes? | Precise? |
|---|---|---|---|
| Probability | ✔ | ✘ | ✔ |
| Most likely outcome | ✔✔ | ✔ | ✘ |
| Odds ratio | ✔ | ✘ | ✔ |
| Log odds ratio | ✘ | ✔ | ✔ |
mdl_churn_vs_relationship <- glm(has_churned ~ time_since_first_purchase, data = churn, family = binomial)
explanatory_data <- tibble(time_since_first_purchase = seq(-1.5, 4, 0.25))
prediction_data <- explanatory_data %>%
mutate(
# Probability of a positive response
has_churned = predict(mdl_churn_vs_relationship, explanatory_data, type = "response"),
# Most likely outcome
most_likely_outcome = round(has_churned),
# Odds ratio
odds_ratio = has_churned / (1 - has_churned),
# Log odds ratio
log_odds_ratio = log(odds_ratio),
log_odds_ratio2 = predict(mdl_churn_vs_relationship, explanatory_data)
)
g1 <- ggplot(churn, aes(time_since_first_purchase, has_churned)) +
geom_point() +
geom_smooth(method = "glm", se = FALSE, method.args = list(family = binomial)) +
geom_point(data = prediction_data, color = "yellow", size = 2) +
labs(title = "Probability of a positive response")
g2 <- g1 +
geom_point(data = prediction_data, aes(time_since_first_purchase, most_likely_outcome), color = "yellow", size = 2) +
labs(title = "Most likely outcome")
g3 <- ggplot(prediction_data, aes(time_since_first_purchase, odds_ratio)) +
geom_line() +
geom_hline(yintercept = 1, linetype = "dotted") +
labs(title = "Odds ratio")
g4 <- ggplot(prediction_data, aes(time_since_first_purchase, odds_ratio)) +
geom_line() +
geom_hline(yintercept = 1, linetype = "dotted") +
scale_y_log10() +
labs(title = "Log odds ratio")
grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

grid.arrange(g3, g4, nrow = 1)

Quantifying logistic regression fit
- Confusion matrix (Confusion table): Counts of
outcomes
- The basis of all performance metrics for models with a categorical
response
| | actual false | actual true |
|---|---|---|
| predicted false | correct | false negative |
| predicted true | false positive | correct |
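No code accompanies the confusion matrix in these notes, so here is a minimal base-R sketch, assuming the churn data and mdl_churn_vs_relationship fitted earlier (the yardstick package's conf_mat() is a common alternative):
# Confusion matrix: most likely predicted outcome vs. actual response
actual_response <- churn$has_churned
predicted_response <- round(fitted(mdl_churn_vs_relationship)) # 0 = not churned, 1 = churned
outcomes <- table(predicted_response, actual_response)
outcomes
# Performance metrics derived from the counts
accuracy <- sum(diag(outcomes)) / sum(outcomes)          # proportion of correct predictions
sensitivity <- outcomes["1", "1"] / sum(outcomes[, "1"]) # true positive rate
specificity <- outcomes["0", "0"] / sum(outcomes[, "0"]) # true negative rate
c(accuracy = accuracy, sensitivity = sensitivity, specificity = specificity)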