# Load the packages
suppressMessages(suppressWarnings(library(tidyverse)))
suppressMessages(suppressWarnings(library(gridExtra)))
suppressMessages(suppressWarnings(library(fst)))
suppressMessages(suppressWarnings(library(broom)))
suppressMessages(suppressWarnings(library(UsingR)))
suppressMessages(suppressWarnings(library(ggfortify)))
suppressMessages(suppressWarnings(library(yardstick)))

# Read in the data files
swedish_motor_insurance <- read.csv("insurance.csv")
taiwan_real_estate <- read.fst("taiwan_real_estate.fst")
fish <- read.csv("fish.csv")
ad_conversion <- read.fst("ad_conversion.fst")
churn <- read.fst("churn.fst")

Regression

  • Regression - Statistical models that explore the relationship between a response variable and one or more explanatory variables (a minimal sketch of both model types follows this list)
    • Given values of explanatory variables, you can predict the values of the response variable
    • Linear regression - The response variable is numeric
    • Logistic regression - The response variable is logical (TRUE/FALSE)
    • Simple linear/logistic regression - There is only one explanatory variable
  • Explanatory variables (Independent variables) - The variables that explain how the response variable will change
  • Response variable (Dependent variable) - The variable that you want to predict
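Both model types share R's formula interface, response ~ explanatory. As a quick orientation, the two calls below are the same ones developed in detail in the rest of these notes:

# Minimal sketch: the same formula interface covers both model types
lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)                  # numeric response
glm(has_churned ~ time_since_last_purchase, data = churn, family = binomial)  # logical response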

Simple Linear Regression

Fitting a linear regression

  • Linear regression models always fit a straight line to the data
    • \(y = \text{intercept} + \text{slope} \times x\)
    • A single categorical explanatory variable
      • The linear regression coefficients are the means of each category

Numeric explanatory variable

# Simple Linear Regression
# Numeric explanatory variable
# Descriptive statistics
swedish_motor_insurance %>% 
  summarize_all(mean)
swedish_motor_insurance %>% 
  summarise(correlation = cor(n_claims, total_payment_sek))
# Visualizing pairs of variables
ggplot(swedish_motor_insurance, aes(n_claims, total_payment_sek)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

# Fitting a linear regression
lm(total_payment_sek ~ n_claims, data = swedish_motor_insurance)
## 
## Call:
## lm(formula = total_payment_sek ~ n_claims, data = swedish_motor_insurance)
## 
## Coefficients:
## (Intercept)     n_claims  
##      19.994        3.414
# total_payment_sek = 19.994 + 3.414 ∗ n_claims

# Visualizing pairs of variables
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

# Fitting a linear regression
lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_real_estate)
## 
## Coefficients:
##   (Intercept)  n_convenience  
##        8.2242         0.7981

Categorical explanatory variables

# Categorical explanatory variables
ggplot(fish, aes(mass_g)) +
  geom_histogram(bins = 9) +
  facet_wrap(vars(species))

# Descriptive statistics
fish %>% 
  group_by(species) %>% 
  summarise(mean_mass_g = mean(mass_g))
lm(mass_g ~ species + 0, data = fish) # "+ 0" removes the intercept, so each coefficient is the mean mass of its species
## 
## Call:
## lm(formula = mass_g ~ species + 0, data = fish)
## 
## Coefficients:
## speciesBream  speciesPerch   speciesPike  speciesRoach  
##        617.8         382.2         718.7         152.0
ggplot(taiwan_real_estate, aes(price_twd_msq)) +
  geom_histogram(bins = 10) +
  facet_wrap(vars(house_age_years))

taiwan_real_estate %>% 
  group_by(house_age_years) %>% 
  summarise(mean_by_group = mean(price_twd_msq))
lm(price_twd_msq ~ house_age_years, data = taiwan_real_estate) # house_age_years is an ordered factor, so R reports an intercept plus linear (.L) and quadratic (.Q) contrasts
## 
## Call:
## lm(formula = price_twd_msq ~ house_age_years, data = taiwan_real_estate)
## 
## Coefficients:
##       (Intercept)  house_age_years.L  house_age_years.Q  
##           11.3025            -0.8798             1.7462
lm(price_twd_msq ~ house_age_years + 0, data = taiwan_real_estate)
## 
## Call:
## lm(formula = price_twd_msq ~ house_age_years + 0, data = taiwan_real_estate)
## 
## Coefficients:
##  house_age_years0 to 15  house_age_years15 to 30  house_age_years30 to 45  
##                  12.637                    9.877                   11.393

Predictions

  • Extrapolating - Making predictions outside the range of observed data
# Predictions
bream <- fish %>% 
  filter(species == "Bream")

g1 <- ggplot(bream, aes(length_cm, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

mdl_mass_vs_length <- lm(mass_g ~ length_cm, data = bream)

explanatory_data <- tibble(length_cm = 20:40)

prediction_data <- explanatory_data %>% 
  mutate(
    mass_g = predict(mdl_mass_vs_length, explanatory_data)
  )

# Extrapolating
explanatory_little_bream <- tibble(length_cm = 10)

prediction_little_data <- explanatory_little_bream %>% 
  mutate(
    mass_g = predict(mdl_mass_vs_length, explanatory_little_bream)
  )

g2 <- ggplot(bream, aes(length_cm, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(
    data = prediction_data,
    color = "green"
  ) +
  geom_point(
    data = prediction_little_data,
    color = "red"
  )

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

mdl_price_vs_conv <- lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)

# Predicting house prices
explanatory_data <- tibble(n_convenience = 0:10)

prediction_data <- explanatory_data %>% 
  mutate(
    price_twd_msq = predict(mdl_price_vs_conv, explanatory_data)
  )

# Visualizing predictions
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "yellow")
## `geom_smooth()` using formula = 'y ~ x'

Model objects

  • coefficients()
  • fitted() - Predictions on the original dataset
  • residuals() - Actual response values minus predicted response values
  • summary() - Shows a more extended printout of the details of the model
    • Call - The code used to create the model
    • Residuals - Summary statistics of the residuals
      • If the model is a good fit, the residuals should
        • Follow a normal distribution
        • Have a median close to zero
        • Have 1Q and 3Q of roughly equal absolute value
    • Coefficients - Details of the coefficients
      • The first column contains the values returned by the coefficients() function
      • The last column contains the p-values, which indicate statistical significance
    • Model metrics - Some metrics on the performance of the model
  • broom package - Contains functions that decompose models into three data frames
    • tidy() - Returns the coefficient details in a data frame
    • augment() - Returns observation-level results
    • glance() - Returns model-level results
# Model objects
mdl_mass_vs_length <- lm(mass_g ~ length_cm, data = bream)

# coefficients()
coefficients(mdl_mass_vs_length)
## (Intercept)   length_cm 
## -1035.34757    54.54998
# fitted()
head(fitted(mdl_mass_vs_length))
##        1        2        3        4        5        6 
## 230.2120 273.8520 268.3970 399.3169 410.2269 426.5919
# explanatory_data <- bream %>% select(length_cm)
# predict(mdl_mass_vs_length, explanatory_data)

# residuals()
head(residuals(mdl_mass_vs_length))
##         1         2         3         4         5         6 
##  11.78801  16.14802  71.60302 -36.31693  19.77307  23.40808
# bream$mass_g - fitted(mdl_mass_vs_length)

# summary()
summary(mdl_mass_vs_length)
## 
## Call:
## lm(formula = mass_g ~ length_cm, data = bream)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -233.877  -35.377   -4.797   31.578  207.923 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1035.348    107.973  -9.589 4.58e-11 ***
## length_cm      54.550      3.539  15.415  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 74.15 on 33 degrees of freedom
## Multiple R-squared:  0.8781, Adjusted R-squared:  0.8744 
## F-statistic: 237.6 on 1 and 33 DF,  p-value: < 2.2e-16
# broom
tidy(mdl_mass_vs_length)
augment(mdl_mass_vs_length)
glance(mdl_mass_vs_length)
mdl_price_vs_conv <- lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)

coefficients(mdl_price_vs_conv)
##   (Intercept) n_convenience 
##     8.2242375     0.7980797
head(fitted(mdl_price_vs_conv))
##        1        2        3        4        5        6 
## 16.20503 15.40695 12.21464 12.21464 12.21464 10.61848
head(residuals(mdl_price_vs_conv))
##          1          2          3          4          5          6 
## -4.7375611 -2.6384224  2.0970130  4.3663019  0.8262112 -0.9059199
summary(mdl_price_vs_conv)
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_real_estate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7132  -2.2213  -0.5409   1.8105  26.5299 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    8.22424    0.28500   28.86   <2e-16 ***
## n_convenience  0.79808    0.05653   14.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.384 on 412 degrees of freedom
## Multiple R-squared:  0.326,  Adjusted R-squared:  0.3244 
## F-statistic: 199.3 on 1 and 412 DF,  p-value: < 2.2e-16
# broom package - contains functions that decompose models into three data frames
# tidy - the coefficient-level elements (the coefficients themselves, as well as p-values for each coefficient)
tidy(mdl_price_vs_conv)
# augment - the observation-level elements (like fitted values and residuals)
augment(mdl_price_vs_conv)
# glance - the model-level elements (mostly performance metrics)
glance(mdl_price_vs_conv)
# Manually predicting house prices
coeffs <- coefficients(mdl_price_vs_conv)

intercept <- coeffs[1]
slope <- coeffs[2]

explanatory_data %>% 
  mutate(
    price_twd_msq = intercept + slope * n_convenience # n_convenience is the column of explanatory_data
  )
# Compare to the results from predict()
predict(mdl_price_vs_conv, explanatory_data)
##         1         2         3         4         5         6         7         8 
##  8.224237  9.022317  9.820397 10.618477 11.416556 12.214636 13.012716 13.810795 
##         9        10        11 
## 14.608875 15.406955 16.205035

Regression to the mean

  • The concept
    • Response value = fitted value + residual (checked in the short sketch after this list)
    • “The stuff you explained” + “the stuff you couldn’t explain”
    • Residuals exist due to problems in the model and fundamental randomness
    • Extreme cases are often due to randomness
    • Regression to the mean: extreme cases tend not to persist over time
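A quick check of the decomposition above, reusing the bream model fitted earlier (mdl_mass_vs_length):

# fitted() plus residuals() reconstructs the observed response values
all.equal(
  bream$mass_g,
  unname(fitted(mdl_mass_vs_length) + residuals(mdl_mass_vs_length))
) # TRUE: "the stuff you explained" plus "the stuff you couldn't explain" gives back the data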
# Regression to the mean
father_son <- father.son %>% 
  mutate(
    father_height_cm = fheight * 2.54,
    son_height_cm = sheight * 2.54
  )

ggplot(father_son, aes(father_height_cm, son_height_cm)) +
  geom_point() +
  geom_abline(color = "green", linewidth = 1) + # defaults to intercept = 0, slope = 1: the line where sons' and fathers' heights are equal
  coord_fixed() + # defaults to ratio = 1, so one unit on the x-axis equals one unit on the y-axis
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

mdl_son_vs_father <- lm(son_height_cm ~ father_height_cm, data = father_son)

really_tall_father <- tibble(father_height_cm = 190)
predict(mdl_son_vs_father, really_tall_father)
##        1 
## 183.7497
really_short_father <- tibble(father_height_cm = 150)
predict(mdl_son_vs_father, really_short_father)
##        1 
## 163.1859

Transforming variables

  • Sometimes, the relationship between the explanatory variable and the response variable is not a straight line. To fit a linear regression model, you may need to transform the explanatory variable, the response variable, or both
  • When the response variable is transformed, you need to back-transform the predictions (back transformation)
# Transforming variables
perch <- fish %>% filter(species == "Perch")

g1 <- ggplot(perch, aes(length_cm, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

g2 <- ggplot(perch, aes(length_cm ^ 3, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# With transformation
mdl_perch <- lm(mass_g ~ I(length_cm ^ 3), data = perch) # I() means "as is": needed because ^ has a special meaning inside a formula

explanatory_data <- tibble(length_cm = seq(10, 40, 5))

prediction_data <- explanatory_data %>% 
  mutate(mass_g = predict(mdl_perch, explanatory_data))

# The linear model has non-linear predictions, after the transformation is undone
g1 <- ggplot(perch, aes(length_cm, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "red")

g2 <- ggplot(perch, aes(length_cm ^ 3, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "red")

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

g1 <- ggplot(ad_conversion, aes(spent_usd, n_impressions)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

g2 <- ggplot(ad_conversion, aes(sqrt(spent_usd), sqrt(n_impressions))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

mdl_ad <- lm(sqrt(n_impressions) ~ sqrt(spent_usd), data = ad_conversion)

explanatory_data <- tibble(spent_usd = seq(0, 600, 100))

prediction_data <- explanatory_data %>% 
  mutate(
    sqrt_n_impressions = predict(mdl_ad, explanatory_data),
    n_impressions = sqrt_n_impressions ^ 2
  )

# Square roots are a common transformation when your data has a right-skewed distribution
g1 <- ggplot(ad_conversion, aes(spent_usd, n_impressions)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "red")

g2 <- ggplot(ad_conversion, aes(sqrt(spent_usd), sqrt(n_impressions))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "red")

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Transforming the explanatory variable
mdl_price_vs_dist <- lm(price_twd_msq ~ sqrt(dist_to_mrt_m), data = taiwan_real_estate)

explanatory_data <- tibble(dist_to_mrt_m = seq(0, 80, 10) ^ 2)

prediction_data <- explanatory_data %>% 
  mutate(price_twd_msq = predict(mdl_price_vs_dist, explanatory_data))

g1 <- ggplot(taiwan_real_estate, aes(dist_to_mrt_m, price_twd_msq)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "green", size = 3)

g2 <- ggplot(taiwan_real_estate, aes(sqrt(dist_to_mrt_m), price_twd_msq)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "green", size = 3)

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Transforming the response variable
mdl_click_vs_impression <- lm(I(n_clicks ^ 0.25) ~ I(n_impressions ^ 0.25), data = ad_conversion)

explanatory_data <- tibble(n_impressions = seq(0, 3e6, 5e5))

prediction_data <- explanatory_data %>% 
  mutate(
    n_clicks_025 = predict(mdl_click_vs_impression, explanatory_data),
    n_clicks = n_clicks_025 ^ 4 # Backtransformation
  )

g1 <- ggplot(ad_conversion, aes(n_impressions, n_clicks)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "green")

g2 <- ggplot(ad_conversion, aes(n_impressions^.25, n_clicks^.25)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "green")

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Assessing model fit

  • Metrics
    • Coefficient of determination (r-squared/R-squared) - The proportion of the variance in the response variable that is predictable from the explanatory variable
      • Written "r-squared" when there is one explanatory variable and "R-squared" when there are several
      • For simple linear regression, it equals the correlation between the explanatory and response variables, squared
      • 1: a perfect fit
      • 0: the worst possible fit
      • What constitutes a good score depends on the dataset, e.g.
        • A score of 0.5 in a psychological experiment may be exceptionally high, because humans are inherently hard to predict
        • In other contexts, a score of 0.9 may be considered a poor fit
      • summary(): Multiple R-squared
      • glance(): r.squared
    • Residual standard error (RSE)
      • A “typical” difference between a prediction and an observed response
      • It has the same unit as the response variable
      • Smaller numbers are better, with zero being a perfect fit to the data
      • summary(): Residual standard error
      • glance(): sigma
    • Root-mean-square error (RMSE)
      • Quantifies how inaccurate the model predictions are
      • Divides by the number of observations rather than the degrees of freedom, so it is worse than RSE for comparing models
      • Typically use RSE instead (formulas for both metrics follow this list)
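In formula form, with n observations and k model coefficients (k = 2 for simple linear regression), RSE divides the residual sum of squares by the degrees of freedom while RMSE divides by the number of observations, matching the manual calculations further below:

\[\text{RSE} = \sqrt{\frac{\sum_{i}(y_i - \hat{y}_i)^2}{n - k}} \qquad \text{RMSE} = \sqrt{\frac{\sum_{i}(y_i - \hat{y}_i)^2}{n}}\]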

Coefficient of determination

# Coefficient of determination
mdl_bream <- lm(mass_g ~ length_cm, data = bream)
summary(mdl_bream) # Multiple R-squared
## 
## Call:
## lm(formula = mass_g ~ length_cm, data = bream)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -233.877  -35.377   -4.797   31.578  207.923 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1035.348    107.973  -9.589 4.58e-11 ***
## length_cm      54.550      3.539  15.415  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 74.15 on 33 degrees of freedom
## Multiple R-squared:  0.8781, Adjusted R-squared:  0.8744 
## F-statistic: 237.6 on 1 and 33 DF,  p-value: < 2.2e-16
mdl_bream %>% glance() # r.squared
mdl_bream %>% 
  glance() %>% 
  pull(r.squared)
## [1] 0.8780627
# Manual calculation
bream %>% 
  summarise(coeff_determination = cor(length_cm, mass_g) ^ 2)
mdl_click_vs_impression_orig <- lm(n_clicks ~ n_impressions, data = ad_conversion)
mdl_click_vs_impression_trans <- lm(I(n_clicks^0.25) ~ I(n_impressions^0.25), data = ad_conversion)

summary(mdl_click_vs_impression_orig)
## 
## Call:
## lm(formula = n_clicks ~ n_impressions, data = ad_conversion)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -186.099   -5.392   -1.422    2.070  119.876 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.683e+00  7.888e-01   2.133   0.0331 *  
## n_impressions 1.718e-04  1.960e-06  87.654   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.91 on 934 degrees of freedom
## Multiple R-squared:  0.8916, Adjusted R-squared:  0.8915 
## F-statistic:  7683 on 1 and 934 DF,  p-value: < 2.2e-16
summary(mdl_click_vs_impression_trans)
## 
## Call:
## lm(formula = I(n_clicks^0.25) ~ I(n_impressions^0.25), data = ad_conversion)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57061 -0.13229  0.00582  0.14494  0.46888 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.0717479  0.0172019   4.171 3.32e-05 ***
## I(n_impressions^0.25) 0.1115330  0.0008844 126.108  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1969 on 934 degrees of freedom
## Multiple R-squared:  0.9445, Adjusted R-squared:  0.9445 
## F-statistic: 1.59e+04 on 1 and 934 DF,  p-value: < 2.2e-16
# The number of impressions (explanatory) explains 89% of the variability in the number of clicks (response)
mdl_click_vs_impression_orig %>% 
  glance() %>% 
  pull(r.squared)
## [1] 0.8916135
mdl_click_vs_impression_trans %>% 
  glance() %>% 
  pull(r.squared)
## [1] 0.9445273
# Manual calculation
ad_conversion %>% 
  summarise(
    coeff_determination_orig = cor(n_impressions, n_clicks) ^ 2,
    coeff_determination_trans = cor(n_impressions^0.25, n_clicks^0.25) ^ 2
  )

RSE

# Residual standard error (RSE)
summary(mdl_bream) # Residual standard error
## 
## Call:
## lm(formula = mass_g ~ length_cm, data = bream)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -233.877  -35.377   -4.797   31.578  207.923 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1035.348    107.973  -9.589 4.58e-11 ***
## length_cm      54.550      3.539  15.415  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 74.15 on 33 degrees of freedom
## Multiple R-squared:  0.8781, Adjusted R-squared:  0.8744 
## F-statistic: 237.6 on 1 and 33 DF,  p-value: < 2.2e-16
mdl_bream %>% glance() # sigma
mdl_bream %>% 
  glance() %>% 
  pull(sigma)
## [1] 74.15224
# Manual calculation
bream %>% 
  mutate(residuals_sq = residuals(mdl_bream) ^ 2) %>% 
  summarise(
    resid_sum_of_sq = sum(residuals_sq),
    deg_freedom = n() - 2, # Degrees of freedom - the number of observations minus the number of model coefficients
    rse = sqrt(resid_sum_of_sq / deg_freedom)
  ) # Indicating the difference between predicted bream masses and observed bream masses is typically about 74g
summary(mdl_click_vs_impression_orig)
## 
## Call:
## lm(formula = n_clicks ~ n_impressions, data = ad_conversion)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -186.099   -5.392   -1.422    2.070  119.876 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.683e+00  7.888e-01   2.133   0.0331 *  
## n_impressions 1.718e-04  1.960e-06  87.654   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.91 on 934 degrees of freedom
## Multiple R-squared:  0.8916, Adjusted R-squared:  0.8915 
## F-statistic:  7683 on 1 and 934 DF,  p-value: < 2.2e-16
summary(mdl_click_vs_impression_trans)
## 
## Call:
## lm(formula = I(n_clicks^0.25) ~ I(n_impressions^0.25), data = ad_conversion)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57061 -0.13229  0.00582  0.14494  0.46888 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.0717479  0.0172019   4.171 3.32e-05 ***
## I(n_impressions^0.25) 0.1115330  0.0008844 126.108  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1969 on 934 degrees of freedom
## Multiple R-squared:  0.9445, Adjusted R-squared:  0.9445 
## F-statistic: 1.59e+04 on 1 and 934 DF,  p-value: < 2.2e-16
mdl_click_vs_impression_orig %>% 
  glance() %>% 
  pull(sigma) # The typical difference between observed number of clicks and predicted number of clicks is about 20
## [1] 19.90584
mdl_click_vs_impression_trans %>% 
  glance() %>% 
  pull(sigma)
## [1] 0.1969064
# Manual calculation
ad_conversion %>% 
  mutate(residuals_sq = residuals(mdl_click_vs_impression_orig)^2) %>% 
  summarise(
    resid_sum_of_sq = sum(residuals_sq),
    deg_freedom = n() - 2,
    rse = sqrt(resid_sum_of_sq / deg_freedom)
  )
ad_conversion %>% 
  mutate(residuals_sq = residuals(mdl_click_vs_impression_trans) ^ 2) %>% 
  summarise(
    resid_sum_of_sq = sum(residuals_sq),
    deg_freedom = n() - 2,
    rse = sqrt(resid_sum_of_sq / deg_freedom)
)

RMSE

# Root-mean-square error (RMSE)
# Calculation
bream %>% 
  mutate(residuals_sq = residuals(mdl_bream) ^ 2) %>% 
  summarise(
    resid_sum_of_sq = sum(residuals_sq),
    n_obs = n(),
    rmse = sqrt(resid_sum_of_sq / n_obs)
  )
ad_conversion %>% 
  mutate(residuals_sq = residuals(mdl_click_vs_impression_orig) ^ 2) %>% 
  summarise(
    resid_sum_of_sq = sum(residuals_sq),
    n_obs = n(),
    rmse = sqrt(resid_sum_of_sq / n_obs)
)
ad_conversion %>% 
  mutate(residuals_sq = residuals(mdl_click_vs_impression_trans) ^ 2) %>% 
  summarise(
    resid_sum_of_sq = sum(residuals_sq),
    n_obs = n(),
    rmse = sqrt(resid_sum_of_sq / n_obs)
)

Visualizing model fit

  • A linear regression model is a good fit if
    • The residuals are normally distributed
    • The mean of the residuals is zero
  • Residuals vs. fitted values
    • LOESS trend line - A smooth curve following the data
    • If the residuals meet the assumption that they are normally distributed with mean zero, the trend line should closely follow the y = 0 line
    • Shows whether or not the residuals go positive or negative as the fitted values change
  • Q-Q plot
    • Shows whether or not the residuals follow a normal distribution
    • x-axis: The points are quantiles from the normal distribution
    • y-axis: the standardized residuals, which are the residuals divided by their standard deviation
    • If the points track along the straight line, they are normally distributed
  • Scale-location
    • The square root of the standardized residuals versus the fitted values
    • Shows whether the size of the residuals gets bigger or smaller as the fitted values change
  • autoplot() (a manual version of the first plot is sketched after this list)
    • Values for the which argument
      • 1: residuals vs. fitted values
      • 2: Q-Q plot
      • 3: scale-location
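The first of these plots can also be drawn by hand from augment() output, which makes explicit what autoplot() is showing; a minimal sketch using the bream model from earlier:

# Residuals vs. fitted values, built manually from augment()
mdl_bream %>% 
  augment() %>% 
  ggplot(aes(.fitted, .resid)) +
    geom_point() +
    geom_smooth(se = FALSE) +                        # LOESS trend line
    geom_hline(yintercept = 0, linetype = "dashed")  # residuals should hover around zero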
# Visualizing model fit
mdl_click_vs_impression_orig <- lm(formula = n_clicks ~ n_impressions, data = ad_conversion)
mdl_click_vs_impression_trans <- lm(I(n_clicks^0.25) ~ I(n_impressions^0.25), data = ad_conversion)

autoplot(
  mdl_click_vs_impression_orig,
  which = 1:3,
  nrow = 1,
  ncol = 3)

# The residuals track the "y equals 0" line more closely in the transformed model compared to the original model
# The residuals track the "normality" line more closely in the transformed model compared to the original model
# The size of the standardized residuals is more consistent in the transformed model compared to the original model
# Indicating that the transformed model is a better fit for the data
autoplot(
  mdl_click_vs_impression_trans,
  which = 1:3,
  nrow = 1,
  ncol = 3)

Outliers

  • Outliers are unusual data points, having either
    • Extreme explanatory values, or
    • Response values far away from the regression line
# Outliers
roach <- fish %>% filter(species == "Roach")

roach %>% 
  mutate(
    has_extreme_length = length_cm < 15 | length_cm > 26,
    has_extreme_mass = mass_g < 1
  ) %>% 
  ggplot(aes(length_cm, mass_g)) +
    geom_point(aes(color = has_extreme_length, shape = has_extreme_mass), size = 4) +
    geom_smooth(method = "lm", se = FALSE) +
    theme(text = element_text(size=14))
## `geom_smooth()` using formula = 'y ~ x'

Leverage

  • Leverage - A measure of how extreme the explanatory variable values are (see the formula below)
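For simple linear regression, leverage has a closed form that makes this explicit: the leverage of observation i grows with the squared distance of its explanatory value from the mean of the explanatory variable,

\[h_i = \frac{1}{n} + \frac{(x_i - \bar{x})^2}{\sum_{j=1}^{n}(x_j - \bar{x})^2}\]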
# Leverage
mdl_roach <- lm(mass_g ~ length_cm, data = roach)
hatvalues(mdl_roach)
##          1          2          3          4          5          6          7 
## 0.31372898 0.12553776 0.09348669 0.07628287 0.06838661 0.06189726 0.06049475 
##          8          9         10         11         12         13         14 
## 0.05681481 0.05026390 0.05009244 0.05009244 0.05055408 0.05091020 0.05807223 
##         15         16         17         18         19         20 
## 0.05807223 0.05930767 0.08839105 0.09948802 0.13338565 0.39474037
augment(mdl_roach)
# Highly leveraged roaches
mdl_roach %>% 
  augment() %>% 
  dplyr::select(mass_g, length_cm, leverage = .hat) %>% 
  arrange(desc(leverage)) %>% 
  head()

Influence

  • Influence - A type of “leave one out” metric. Measures how much the model would change if you reran it without that data point
    • The influence (Cook’s distance) of each observation is based on the size of the residuals and the leverage
    • A bigger number denotes more influence for the observation
# Influence
cooks.distance(mdl_roach)
##            1            2            3            4            5            6 
## 1.074015e+00 1.042888e-02 1.972286e-05 1.979659e-03 6.609838e-03 3.118518e-01 
##            7            8            9           10           11           12 
## 8.525058e-04 1.994809e-04 2.565139e-04 2.563127e-04 2.445136e-03 7.949762e-03 
##           13           14           15           16           17           18 
## 1.372065e-04 4.817271e-03 1.151628e-02 4.519344e-03 6.120857e-02 1.500641e-01 
##           19           20 
## 2.061450e-02 3.657822e-01
augment(mdl_roach)
# Most influential roaches
mdl_roach %>% 
  augment() %>% 
  dplyr::select(mass_g, length_cm, cooks_dist = .cooksd, residual = .resid, leverage = .hat) %>% 
  arrange(desc(cooks_dist)) %>% 
  head()
# Removing the most influential roach
roach_not_short <- roach %>% 
  filter(length_cm != 12.9)

ggplot(roach, aes(length_cm, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth(method = "lm", se = FALSE, data = roach_not_short, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# autoplot()
# Plots 4 to 6 are less helpful for diagnosis than the previous three, beyond labelling the most influential observations
autoplot(
  mdl_roach,
  which = 4:6,
  nrow = 3,
  ncol = 1
) +
theme(text=element_text(size=12))

Simple Logistic Regression

Fitting a logistic regression

  • Logistic regression
    • Generalized linear model (GLM)
      • Linear regression - Assumes that the residuals follow a Gaussian (normal) distribution
      • Logistic regression - Assumes that the response follows a binomial distribution
    • Used when the response variable is logical (binary)
    • The predictions follow a logistic (S-shaped) curve (see the formula after this list)
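In formula terms, logistic regression fits a straight line on the log-odds scale and squashes it through the logistic function, which is what produces the S-shaped curve (and explains the log odds ratio scale used in the predictions section below):

\[\ln\!\left(\frac{p}{1 - p}\right) = \text{intercept} + \text{slope} \times x \qquad\Longleftrightarrow\qquad p = \frac{1}{1 + e^{-(\text{intercept} + \text{slope} \times x)}}\]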
# Logistic Regression
# Using a linear model
mdl_churn_vs_recency_lm <- lm(has_churned ~ time_since_last_purchase, data = churn)

coeffs <- coefficients(mdl_churn_vs_recency_lm)
intercept <- coeffs[1]
slope <- coeffs[2]

# Predictions are probabilities of churn, not amounts of churn
# Problem: The model predicts probabilities greater than one or negative probabilities
g1 <- ggplot(churn, aes(time_since_last_purchase, has_churned)) +
  geom_point() +
  geom_abline(intercept = intercept, slope = slope) +
  xlim(-10, 10) + # Zooming out
  ylim(-0.2, 1.2)

# Linear regression using glm()
# Assuming Gaussian errors (family = gaussian) gives the same fit as the linear regression above
glm(has_churned ~ time_since_last_purchase, data = churn, family = gaussian)
## 
## Call:  glm(formula = has_churned ~ time_since_last_purchase, family = gaussian, 
##     data = churn)
## 
## Coefficients:
##              (Intercept)  time_since_last_purchase  
##                  0.49078                   0.06378  
## 
## Degrees of Freedom: 399 Total (i.e. Null);  398 Residual
## Null Deviance:       100 
## Residual Deviance: 98.02     AIC: 578.7
# Logistic regression
# family = binomial assumes a binomially distributed response, giving logistic regression
mdl_recency_glm <- glm(has_churned ~ time_since_last_purchase, data = churn, family = binomial)
mdl_recency_glm
## 
## Call:  glm(formula = has_churned ~ time_since_last_purchase, family = binomial, 
##     data = churn)
## 
## Coefficients:
##              (Intercept)  time_since_last_purchase  
##                 -0.03502                   0.26921  
## 
## Degrees of Freedom: 399 Total (i.e. Null);  398 Residual
## Null Deviance:       554.5 
## Residual Deviance: 546.4     AIC: 550.4
# The logistic regression curve never goes below zero or above one
g2 <- ggplot(churn, aes(time_since_last_purchase, has_churned)) +
  geom_point() +
  geom_abline(intercept = intercept, slope = slope) +
  stat_smooth(
    method = "glm",
    se = FALSE,
    method.args = list(family = binomial),
    fullrange=TRUE # Span the fit
  ) +
  xlim(-10, 10) +
  ylim(-0.2, 1.2)

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'

# time_since_last_purchase
# Churners typically have a longer time since their last purchase
g1 <- ggplot(churn, aes(time_since_last_purchase)) +
  geom_histogram(binwidth = 0.25) +
  facet_grid(rows = vars(has_churned))

# time_since_first_purchase
# Churners have a shorter length of relationship
g2 <- ggplot(churn, aes(time_since_first_purchase)) +
  geom_histogram(binwidth = 0.25) +
  facet_grid(rows = vars(has_churned))

grid.arrange(g1, g2, nrow = 1)

# Visualizing linear and logistic models
ggplot(churn, aes(time_since_first_purchase, has_churned)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  geom_smooth(method = "glm", se = FALSE, color = "green", method.args = list(family = binomial))
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Fit a logistic regression
mdl_churn_vs_relationship <- glm(
  has_churned ~ time_since_first_purchase,
  data = churn,
  family = binomial
)
mdl_churn_vs_relationship
## 
## Call:  glm(formula = has_churned ~ time_since_first_purchase, family = binomial, 
##     data = churn)
## 
## Coefficients:
##               (Intercept)  time_since_first_purchase  
##                  -0.01518                   -0.35479  
## 
## Degrees of Freedom: 399 Total (i.e. Null);  398 Residual
## Null Deviance:       554.5 
## Residual Deviance: 543.7     AIC: 547.7

Predictions

# Predictions
# The ggplot predictions
plt_churn_vs_recency_base <- ggplot(churn, aes(time_since_last_purchase, has_churned)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    se = FALSE,
    method.args = list(family = binomial)
  )

# Making predictions
mdl_recency <- glm(has_churned ~ time_since_last_purchase, data = churn, family = "binomial")

explanatory_data <- tibble(time_since_last_purchase = seq(-1, 6, 0.25))

prediction_data <- explanatory_data %>% 
  mutate(
    has_churned = predict(mdl_recency, explanatory_data, type = "response"),
    # Getting the most likely outcome
    most_likely_outcome = round(has_churned) # Not churn if below 0.5, churn if over 0.5
  )

plt_churn_vs_recency_base +
  geom_point(data = prediction_data, color = "blue") +
  # Visualizing most likely outcome
  geom_point(aes(y = most_likely_outcome), data = prediction_data, color = "green")
## `geom_smooth()` using formula = 'y ~ x'

Odds ratios

  • Odds ratio - The probability that something happens, divided by the probability that it doesn’t
    • \(\text{odds_ratio} = \frac{\text{probability}}{1 - \text{probability}}\)
    • A probability of 0.25 corresponds to odds of 1/3, i.e. “three to one against” (worked through just after this list)
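For example, the probability of 0.25 mentioned above works out as follows:

# A probability of 0.25 corresponds to odds of 1/3, i.e. "three to one against"
p <- 0.25
odds <- p / (1 - p) # 0.333...
odds / (1 + odds)   # back-transforming the odds recovers the probability, 0.25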
# Odds ratios
odds_ratio <- data.frame(x = seq(0, 1, .001)) %>% 
  mutate(y = x/(1-x))

ggplot(odds_ratio, aes(x, y)) +
  geom_line(linewidth = 1) +
  labs(x = "probability", y = "odds_ratio") +
  theme(text=element_text(size=15))

mdl_recency <- glm(has_churned ~ time_since_last_purchase, data = churn, family = "binomial")

explanatory_data <- tibble(time_since_last_purchase = seq(-1, 6, 0.25))

prediction_data <- explanatory_data %>% 
  mutate(
    has_churned = predict(mdl_recency, explanatory_data, type = "response"),
    most_likely_response = round(has_churned),
    odds_ratio = has_churned / (1 - has_churned) # Calculating odds ratio
  )

# Visualizing odds ratio
g1 <- ggplot(prediction_data, aes(time_since_last_purchase, odds_ratio)) +
  geom_line() +
  geom_hline(yintercept = 1, linetype = "dotted")
# The dotted line where the odds ratio is one indicates where churning is just as likely as not churning
# In the bottom-left, the predictions are below one, so the chance of churning is less than the chance of not churning
# In the top-right, the chance of churning is about five times more than the chance of not churning

g2 <- ggplot(prediction_data, aes(time_since_last_purchase, odds_ratio)) +
  geom_line() +
  geom_hline(yintercept = 1, linetype = "dotted") +
  scale_y_log10()
# Logistic regression odds ratios on a log-scale change linearly with the explanatory variable
# This property of the logarithm of odds ratios means log-odds ratio is another common way of describing logistic regression predictions

grid.arrange(g1, g2, nrow = 1)

# Calculating log odds ratio
prediction_data <- explanatory_data %>% 
  mutate(
    has_churned = predict(mdl_recency, explanatory_data, type = "response"),
    most_likely_response = round(has_churned),
    odds_ratio = has_churned / (1 - has_churned),
    log_odds_ratio = log(odds_ratio),
    log_odds_ratio2 = predict(mdl_recency, explanatory_data) # Predict will return the log odds ratio if you don't specify the type argument
  )

prediction_data
Comparing the scales for describing logistic regression predictions:

  Scale                  Are values easy to interpret?   Are changes easy to interpret?   Is precise?
  Probability
  Most likely outcome    ✔✔
  Odds ratio
  Log odds ratio
mdl_churn_vs_relationship <- glm(has_churned ~ time_since_first_purchase, data = churn, family = binomial)

explanatory_data <- tibble(time_since_first_purchase = seq(-1.5, 4, 0.25))

prediction_data <- explanatory_data %>% 
  mutate(
    # Probability of a positive response
    has_churned = predict(mdl_churn_vs_relationship, explanatory_data, type = "response"),
    # Most likely outcome
    most_likely_outcome = round(has_churned),
    # Odds ratio
    odds_ratio = has_churned / (1 - has_churned),
    # Log odds ratio
    log_odds_ratio = log(odds_ratio),
    log_odds_ratio2 = predict(mdl_churn_vs_relationship, explanatory_data)
  )

g1 <- ggplot(churn, aes(time_since_first_purchase, has_churned)) +
  geom_point() +
  geom_smooth(method = "glm", se = FALSE, method.args = list(family = binomial)) +
  geom_point(data = prediction_data, color = "yellow", size = 2) +
  labs(title = "Probability of a positive response")

g2 <- g1 +
  geom_point(data = prediction_data, aes(time_since_first_purchase, most_likely_outcome), color = "yellow", size = 2) +
  labs(title = "Most likely outcome")

g3 <- ggplot(prediction_data, aes(time_since_first_purchase, odds_ratio)) +
  geom_line() +
  geom_hline(yintercept = 1, linetype = "dotted") +
  labs(title = "Odds ratio")

g4 <- ggplot(prediction_data, aes(time_since_first_purchase, odds_ratio)) +
  geom_line() +
  geom_hline(yintercept = 1, linetype = "dotted") +
  scale_y_log10() +
  labs(title = "Log odds ratio")

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

grid.arrange(g3, g4, nrow = 1)

Quantifying logistic regression fit

  • Confusion matrix (Confusion table): Counts of outcomes
    • The basis of all performance metrics for models with a categorical response
                    actual false             actual true
  predicted false   correct (TN)             false negative (FN)
  predicted true    false positive (FP)      correct (TP)

Performance metrics

  • Accuracy - The proportion of correct predictions
    • \(\text{accuracy} = \frac{TN+TP}{TN+FN+FP+TP}\)
    • Higher accuracy is better
  • Sensitivity - The proportion of true positives
    • \(\text{sensitivity} = \frac{TP}{FN+TP}\)
    • Higher sensitivity is better
  • Specificity - The proportion of true negatives
    • \(\text{specificity} = \frac{TN}{TN+FP}\)
    • Higher specificity is better
    • There is a tradeoff: improving specificity tends to decrease sensitivity, and vice versa (a sketch computing all three metrics follows this list)
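The formulas above map directly onto the counts in the confusion matrix. A minimal sketch, assuming a 2x2 table laid out like the outcomes table built with table() just below (predicted response in rows, actual response in columns); confusion_metrics is a hypothetical helper, not a yardstick function:

# Hypothetical helper: compute the three metrics from a table of counts
confusion_metrics <- function(outcomes) {
  TN <- outcomes["0", "0"] # predicted negative, actually negative
  FN <- outcomes["0", "1"] # predicted negative, actually positive
  FP <- outcomes["1", "0"] # predicted positive, actually negative
  TP <- outcomes["1", "1"] # predicted positive, actually positive
  c(
    accuracy    = (TN + TP) / (TN + FN + FP + TP),
    sensitivity = TP / (FN + TP),
    specificity = TN / (TN + FP)
  )
}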
# Quantifying logistic regression fit
mdl_recency <- glm(has_churned ~ time_since_last_purchase, data = churn, family = "binomial")

actual_response <- churn$has_churned
predicted_response <- round(fitted(mdl_recency))

outcomes <- table(predicted_response, actual_response)
outcomes
##                   actual_response
## predicted_response   0   1
##                  0 141 111
##                  1  59  89
# Visualizing the confusion matrix: mosaic plot
confusion <- conf_mat(outcomes)
confusion
##                   actual_response
## predicted_response   0   1
##                  0 141 111
##                  1  59  89
autoplot(confusion) +
  theme(text=element_text(size=12))

# Performance metrics
summary(confusion, event_level = "second") # the second factor level ("1", churned) is the positive response
# Accuracy
summary(confusion, event_level = "second") %>% slice(1)
(141 + 89) / (141 + 111 + 59 + 89)
## [1] 0.575
# Sensitivity
summary(confusion, event_level = "second") %>% slice(3)
89 / (111 + 89)
## [1] 0.445
# Specificity
summary(confusion, event_level = "second") %>% slice(4)
141 / (141 + 59)
## [1] 0.705