# Load the packages
suppressMessages(suppressWarnings(library(tidyverse)))
suppressMessages(suppressWarnings(library(gridExtra)))
suppressMessages(suppressWarnings(library(fst)))
suppressMessages(suppressWarnings(library(broom)))
suppressMessages(suppressWarnings(library(UsingR)))
suppressMessages(suppressWarnings(library(ggfortify)))
suppressMessages(suppressWarnings(library(yardstick)))

# Read in the data files
swedish_motor_insurance <- read.csv("insurance.csv")
taiwan_real_estate <- read.fst("taiwan_real_estate.fst")
fish <- read.csv("fish.csv")
ad_conversion <- read.fst("ad_conversion.fst")
churn <- read.fst("churn.fst")

Regression

  • Regression - Statistical models that explore the relationship between a response variable and one or more explanatory variables (a minimal sketch of both model types follows this list)
    • Given values of explanatory variables, you can predict the values of the response variable
    • Linear regression - The response variable is numeric
    • Logistic regression - The response variable is logical (TRUE/FALSE)
    • Simple linear/logistic regression - There is only one explanatory variable
  • Explanatory variables (Independent variables) - The variables that explain how the response variable will change
  • Response variable (Dependent variable) - The variable that you want to predict
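Both model types share R's formula interface, response ~ explanatory. As a quick orientation, the two calls below are the same ones developed in detail in the rest of these notes:

# Minimal sketch: the same formula interface covers both model types
lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)                  # numeric response
glm(has_churned ~ time_since_last_purchase, data = churn, family = binomial)  # logical response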

Simple Linear Regression

Fitting a linear regression

  • Linear regression models always fit a straight line to the data
    • \(y = \text{intercept} + \text{slope} \times x\)
    • A single categorical explanatory variable
      • The linear regression coefficients are the means of each category

Numeric explanatory variable

# Simple Linear Regression
# Numeric explanatory variable
# Descriptive statistics
swedish_motor_insurance %>% 
  summarize_all(mean)
swedish_motor_insurance %>% 
  summarise(correlation = cor(n_claims, total_payment_sek))
# Visualizing pairs of variables
ggplot(swedish_motor_insurance, aes(n_claims, total_payment_sek)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

# Fitting a linear regression
lm(total_payment_sek ~ n_claims, data = swedish_motor_insurance)
## 
## Call:
## lm(formula = total_payment_sek ~ n_claims, data = swedish_motor_insurance)
## 
## Coefficients:
## (Intercept)     n_claims  
##      19.994        3.414
# total_payment_sek = 19.994 + 3.414 ∗ n_claims

# Visualizing pairs of variables
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

# Fitting a linear regression
lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_real_estate)
## 
## Coefficients:
##   (Intercept)  n_convenience  
##        8.2242         0.7981

Categorical explanatory variables

# Categorical explanatory variables
ggplot(fish, aes(mass_g)) +
  geom_histogram(bins = 9) +
  facet_wrap(vars(species))

# Descriptive statistics
fish %>% 
  group_by(species) %>% 
  summarise(mean_mass_g = mean(mass_g))
lm(mass_g ~ species + 0, data = fish) # "+ 0" removes the intercept, so each coefficient is the mean mass of its species
## 
## Call:
## lm(formula = mass_g ~ species + 0, data = fish)
## 
## Coefficients:
## speciesBream  speciesPerch   speciesPike  speciesRoach  
##        617.8         382.2         718.7         152.0
ggplot(taiwan_real_estate, aes(price_twd_msq)) +
  geom_histogram(bins = 10) +
  facet_wrap(vars(house_age_years))

taiwan_real_estate %>% 
  group_by(house_age_years) %>% 
  summarise(mean_by_group = mean(price_twd_msq))
lm(price_twd_msq ~ house_age_years, data = taiwan_real_estate) # house_age_years is an ordered factor, so R reports an intercept plus linear (.L) and quadratic (.Q) contrasts
## 
## Call:
## lm(formula = price_twd_msq ~ house_age_years, data = taiwan_real_estate)
## 
## Coefficients:
##       (Intercept)  house_age_years.L  house_age_years.Q  
##           11.3025            -0.8798             1.7462
lm(price_twd_msq ~ house_age_years + 0, data = taiwan_real_estate)
## 
## Call:
## lm(formula = price_twd_msq ~ house_age_years + 0, data = taiwan_real_estate)
## 
## Coefficients:
##  house_age_years0 to 15  house_age_years15 to 30  house_age_years30 to 45  
##                  12.637                    9.877                   11.393

Predictions

  • Extrapolating - Making predictions outside the range of observed data
# Predictions
bream <- fish %>% 
  filter(species == "Bream")

g1 <- ggplot(bream, aes(length_cm, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

mdl_mass_vs_length <- lm(mass_g ~ length_cm, data = bream)

explanatory_data <- tibble(length_cm = 20:40)

prediction_data <- explanatory_data %>% 
  mutate(
    mass_g = predict(mdl_mass_vs_length, explanatory_data)
  )

# Extrapolating
explanatory_little_bream <- tibble(length_cm = 10)

prediction_little_data <- explanatory_little_bream %>% 
  mutate(
    mass_g = predict(mdl_mass_vs_length, explanatory_little_bream)
  )

g2 <- ggplot(bream, aes(length_cm, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(
    data = prediction_data,
    color = "green"
  ) +
  geom_point(
    data = prediction_little_data,
    color = "red"
  )

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

mdl_price_vs_conv <- lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)

# Predicting house prices
explanatory_data <- tibble(n_convenience = 0:10)

prediction_data <- explanatory_data %>% 
  mutate(
    price_twd_msq = predict(mdl_price_vs_conv, explanatory_data)
  )

# Visualizing predictions
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "yellow")
## `geom_smooth()` using formula = 'y ~ x'

Model objects

  • coefficients()
  • fitted() - Predictions on the original dataset
  • residuals() - Actual response values minus predicted response values
  • summary() - Shows a more extended printout of the details of the model
    • Call - The code used to create the model
    • Residuals - Summary statistics of the residuals
      • If the model is a good fit, the residuals should
        • Follow a normal distribution
        • Have a median close to zero
        • Have 1Q and 3Q of roughly equal absolute value
    • Coefficients - Details of the coefficients
      • The first column contains the values returned by the coefficients() function
      • The last column contains the p-values, which indicate statistical significance
    • Model metrics - Some metrics on the performance of the model
  • broom package - Contains functions that decompose models into three data frames
    • tidy() - Returns the coefficient details in a data frame
    • augment() - Returns observation-level results
    • glance() - Returns model-level results
# Model objects
mdl_mass_vs_length <- lm(mass_g ~ length_cm, data = bream)

# coefficients()
coefficients(mdl_mass_vs_length)
## (Intercept)   length_cm 
## -1035.34757    54.54998
# fitted()
head(fitted(mdl_mass_vs_length))
##        1        2        3        4        5        6 
## 230.2120 273.8520 268.3970 399.3169 410.2269 426.5919
# explanatory_data <- bream %>% select(length_cm)
# predict(mdl_mass_vs_length, explanatory_data)

# residuals()
head(residuals(mdl_mass_vs_length))
##         1         2         3         4         5         6 
##  11.78801  16.14802  71.60302 -36.31693  19.77307  23.40808
# bream$mass_g - fitted(mdl_mass_vs_length)

# summary()
summary(mdl_mass_vs_length)
## 
## Call:
## lm(formula = mass_g ~ length_cm, data = bream)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -233.877  -35.377   -4.797   31.578  207.923 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1035.348    107.973  -9.589 4.58e-11 ***
## length_cm      54.550      3.539  15.415  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 74.15 on 33 degrees of freedom
## Multiple R-squared:  0.8781, Adjusted R-squared:  0.8744 
## F-statistic: 237.6 on 1 and 33 DF,  p-value: < 2.2e-16
# broom
tidy(mdl_mass_vs_length)
augment(mdl_mass_vs_length)
glance(mdl_mass_vs_length)
mdl_price_vs_conv <- lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)

coefficients(mdl_price_vs_conv)
##   (Intercept) n_convenience 
##     8.2242375     0.7980797
head(fitted(mdl_price_vs_conv))
##        1        2        3        4        5        6 
## 16.20503 15.40695 12.21464 12.21464 12.21464 10.61848
head(residuals(mdl_price_vs_conv))
##          1          2          3          4          5          6 
## -4.7375611 -2.6384224  2.0970130  4.3663019  0.8262112 -0.9059199
summary(mdl_price_vs_conv)
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_real_estate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7132  -2.2213  -0.5409   1.8105  26.5299 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    8.22424    0.28500   28.86   <2e-16 ***
## n_convenience  0.79808    0.05653   14.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.384 on 412 degrees of freedom
## Multiple R-squared:  0.326,  Adjusted R-squared:  0.3244 
## F-statistic: 199.3 on 1 and 412 DF,  p-value: < 2.2e-16
# broom package - contains functions that decompose models into three data frames
# tidy - the coefficient-level elements (the coefficients themselves, as well as p-values for each coefficient)
tidy(mdl_price_vs_conv)
# augment - the observation-level elements (like fitted values and residuals)
augment(mdl_price_vs_conv)
# glance - the model-level elements (mostly performance metrics)
glance(mdl_price_vs_conv)
# Manually predicting house prices
coeffs <- coefficients(mdl_price_vs_conv)

intercept <- coeffs[1]
slope <- coeffs[2]

explanatory_data %>% 
  mutate(
    price_twd_msq = intercept + slope * n_convenience # n_convenience is the column of explanatory_data
  )
# Compare to the results from predict()
predict(mdl_price_vs_conv, explanatory_data)
##         1         2         3         4         5         6         7         8 
##  8.224237  9.022317  9.820397 10.618477 11.416556 12.214636 13.012716 13.810795 
##         9        10        11 
## 14.608875 15.406955 16.205035

Regression to the mean

  • The concept
    • Response value = fitted value + residual (checked in the short sketch after this list)
    • “The stuff you explained” + “the stuff you couldn’t explain”
    • Residuals exist due to problems in the model and fundamental randomness
    • Extreme cases are often due to randomness
    • Regression to the mean: extreme cases tend not to persist over time
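A quick check of the decomposition above, reusing the bream model fitted earlier (mdl_mass_vs_length):

# fitted() plus residuals() reconstructs the observed response values
all.equal(
  bream$mass_g,
  unname(fitted(mdl_mass_vs_length) + residuals(mdl_mass_vs_length))
) # TRUE: "the stuff you explained" plus "the stuff you couldn't explain" gives back the data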
# Regression to the mean
father_son <- father.son %>% 
  mutate(
    father_height_cm = fheight * 2.54,
    son_height_cm = sheight * 2.54
  )

ggplot(father_son, aes(father_height_cm, son_height_cm)) +
  geom_point() +
  geom_abline(color = "green", linewidth = 1) + # defaults to intercept = 0, slope = 1: the line where sons' and fathers' heights are equal
  coord_fixed() + # defaults to ratio = 1, so one unit on the x-axis equals one unit on the y-axis
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

mdl_son_vs_father <- lm(son_height_cm ~ father_height_cm, data = father_son)

really_tall_father <- tibble(father_height_cm = 190)
predict(mdl_son_vs_father, really_tall_father)
##        1 
## 183.7497
really_short_father <- tibble(father_height_cm = 150)
predict(mdl_son_vs_father, really_short_father)
##        1 
## 163.1859

Transforming variables

  • Sometimes, the relationship between the explanatory variable and the response variable is not a straight line. To fit a linear regression model, you may need to transform the explanatory variable, the response variable, or both
  • When the response variable is transformed, you need to back-transform the predictions (back transformation)
# Transforming variables
perch <- fish %>% filter(species == "Perch")

g1 <- ggplot(perch, aes(length_cm, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

g2 <- ggplot(perch, aes(length_cm ^ 3, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# With transformation
mdl_perch <- lm(mass_g ~ I(length_cm ^ 3), data = perch) # I() means "as is": needed because ^ has a special meaning inside a formula

explanatory_data <- tibble(length_cm = seq(10, 40, 5))

prediction_data <- explanatory_data %>% 
  mutate(mass_g = predict(mdl_perch, explanatory_data))

# The linear model has non-linear predictions, after the transformation is undone
g1 <- ggplot(perch, aes(length_cm, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "red")

g2 <- ggplot(perch, aes(length_cm ^ 3, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "red")

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

g1 <- ggplot(ad_conversion, aes(spent_usd, n_impressions)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

g2 <- ggplot(ad_conversion, aes(sqrt(spent_usd), sqrt(n_impressions))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

mdl_ad <- lm(sqrt(n_impressions) ~ sqrt(spent_usd), data = ad_conversion)

explanatory_data <- tibble(spent_usd = seq(0, 600, 100))

prediction_data <- explanatory_data %>% 
  mutate(
    sqrt_n_impressions = predict(mdl_ad, explanatory_data),
    n_impressions = sqrt_n_impressions ^ 2
  )

# Square roots are a common transformation when your data has a right-skewed distribution
g1 <- ggplot(ad_conversion, aes(spent_usd, n_impressions)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "red")

g2 <- ggplot(ad_conversion, aes(sqrt(spent_usd), sqrt(n_impressions))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "red")

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Transforming the explanatory variable
mdl_price_vs_dist <- lm(price_twd_msq ~ sqrt(dist_to_mrt_m), data = taiwan_real_estate)

explanatory_data <- tibble(dist_to_mrt_m = seq(0, 80, 10) ^ 2)

prediction_data <- explanatory_data %>% 
  mutate(price_twd_msq = predict(mdl_price_vs_dist, explanatory_data))

g1 <- ggplot(taiwan_real_estate, aes(dist_to_mrt_m, price_twd_msq)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "green", size = 3)

g2 <- ggplot(taiwan_real_estate, aes(sqrt(dist_to_mrt_m), price_twd_msq)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "green", size = 3)

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Transforming the response variable
mdl_click_vs_impression <- lm(I(n_clicks ^ 0.25) ~ I(n_impressions ^ 0.25), data = ad_conversion)

explanatory_data <- tibble(n_impressions = seq(0, 3e6, 5e5))

prediction_data <- explanatory_data %>% 
  mutate(
    n_clicks_025 = predict(mdl_click_vs_impression, explanatory_data),
    n_clicks = n_clicks_025 ^ 4 # Backtransformation
  )

g1 <- ggplot(ad_conversion, aes(n_impressions, n_clicks)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "green")

g2 <- ggplot(ad_conversion, aes(n_impressions^.25, n_clicks^.25)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, color = "green")

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Assessing model fit

  • Metrics
    • Coefficient of determination (r-squared/R-squared) - The proportion of the variance in the response variable that is predictable from the explanatory variable
      • Written "r-squared" when there is one explanatory variable and "R-squared" when there are several
      • For simple linear regression, it equals the correlation between the explanatory and response variables, squared
      • 1: a perfect fit
      • 0: the worst possible fit
      • What constitutes a good score depends on the dataset, e.g.
        • A score of 0.5 in a psychological experiment may be exceptionally high, because humans are inherently hard to predict
        • In other contexts, a score of 0.9 may be considered a poor fit
      • summary(): Multiple R-squared
      • glance(): r.squared
    • Residual standard error (RSE)
      • A “typical” difference between a prediction and an observed response
      • It has the same unit as the response variable
      • Smaller numbers are better, with zero being a perfect fit to the data
      • summary(): Residual standard error
      • glance(): sigma
    • Root-mean-square error (RMSE)
      • Quantifies how inaccurate the model predictions are
      • Divides by the number of observations rather than the degrees of freedom, so it is worse than RSE for comparing models
      • Typically use RSE instead (formulas for both metrics follow this list)
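In formula form, with n observations and k model coefficients (k = 2 for simple linear regression), RSE divides the residual sum of squares by the degrees of freedom while RMSE divides by the number of observations, matching the manual calculations further below:

\[\text{RSE} = \sqrt{\frac{\sum_{i}(y_i - \hat{y}_i)^2}{n - k}} \qquad \text{RMSE} = \sqrt{\frac{\sum_{i}(y_i - \hat{y}_i)^2}{n}}\]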

Coefficient of determination

# Coefficient of determination
mdl_bream <- lm(mass_g ~ length_cm, data = bream)
summary(mdl_bream) # Multiple R-squared
## 
## Call:
## lm(formula = mass_g ~ length_cm, data = bream)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -233.877  -35.377   -4.797   31.578  207.923 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1035.348    107.973  -9.589 4.58e-11 ***
## length_cm      54.550      3.539  15.415  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 74.15 on 33 degrees of freedom
## Multiple R-squared:  0.8781, Adjusted R-squared:  0.8744 
## F-statistic: 237.6 on 1 and 33 DF,  p-value: < 2.2e-16
mdl_bream %>% glance() # r.squared
mdl_bream %>% 
  glance() %>% 
  pull(r.squared)
## [1] 0.8780627
# Manual calculation
bream %>% 
  summarise(coeff_determination = cor(length_cm, mass_g) ^ 2)
mdl_click_vs_impression_orig <- lm(n_clicks ~ n_impressions, data = ad_conversion)
mdl_click_vs_impression_trans <- lm(I(n_clicks^0.25) ~ I(n_impressions^0.25), data = ad_conversion)

summary(mdl_click_vs_impression_orig)
## 
## Call:
## lm(formula = n_clicks ~ n_impressions, data = ad_conversion)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -186.099   -5.392   -1.422    2.070  119.876 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.683e+00  7.888e-01   2.133   0.0331 *  
## n_impressions 1.718e-04  1.960e-06  87.654   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.91 on 934 degrees of freedom
## Multiple R-squared:  0.8916, Adjusted R-squared:  0.8915 
## F-statistic:  7683 on 1 and 934 DF,  p-value: < 2.2e-16
summary(mdl_click_vs_impression_trans)
## 
## Call:
## lm(formula = I(n_clicks^0.25) ~ I(n_impressions^0.25), data = ad_conversion)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57061 -0.13229  0.00582  0.14494  0.46888 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.0717479  0.0172019   4.171 3.32e-05 ***
## I(n_impressions^0.25) 0.1115330  0.0008844 126.108  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1969 on 934 degrees of freedom
## Multiple R-squared:  0.9445, Adjusted R-squared:  0.9445 
## F-statistic: 1.59e+04 on 1 and 934 DF,  p-value: < 2.2e-16
# The number of impressions (explanatory) explains 89% of the variability in the number of clicks (response)
mdl_click_vs_impression_orig %>% 
  glance() %>% 
  pull(r.squared)
## [1] 0.8916135
mdl_click_vs_impression_trans %>% 
  glance() %>% 
  pull(r.squared)
## [1] 0.9445273
# Manual calculation
ad_conversion %>% 
  summarise(
    coeff_determination_orig = cor(n_impressions, n_clicks) ^ 2,
    coeff_determination_trans = cor(n_impressions^0.25, n_clicks^0.25) ^ 2
  )

RSE

# Residual standard error (RSE)
summary(mdl_bream) # Residual standard error
## 
## Call:
## lm(formula = mass_g ~ length_cm, data = bream)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -233.877  -35.377   -4.797   31.578  207.923 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1035.348    107.973  -9.589 4.58e-11 ***
## length_cm      54.550      3.539  15.415  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 74.15 on 33 degrees of freedom
## Multiple R-squared:  0.8781, Adjusted R-squared:  0.8744 
## F-statistic: 237.6 on 1 and 33 DF,  p-value: < 2.2e-16
mdl_bream %>% glance() # sigma
mdl_bream %>% 
  glance() %>% 
  pull(sigma)
## [1] 74.15224
# Manual calculation
bream %>% 
  mutate(residuals_sq = residuals(mdl_bream) ^ 2) %>% 
  summarise(
    resid_sum_of_sq = sum(residuals_sq),
    deg_freedom = n() - 2, # Degrees of freedom - the number of observations minus the number of model coefficients
    rse = sqrt(resid_sum_of_sq / deg_freedom)
  ) # Indicating the difference between predicted bream masses and observed bream masses is typically about 74g
summary(mdl_click_vs_impression_orig)
## 
## Call:
## lm(formula = n_clicks ~ n_impressions, data = ad_conversion)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -186.099   -5.392   -1.422    2.070  119.876 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.683e+00  7.888e-01   2.133   0.0331 *  
## n_impressions 1.718e-04  1.960e-06  87.654   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.91 on 934 degrees of freedom
## Multiple R-squared:  0.8916, Adjusted R-squared:  0.8915 
## F-statistic:  7683 on 1 and 934 DF,  p-value: < 2.2e-16
summary(mdl_click_vs_impression_trans)
## 
## Call:
## lm(formula = I(n_clicks^0.25) ~ I(n_impressions^0.25), data = ad_conversion)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57061 -0.13229  0.00582  0.14494  0.46888 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.0717479  0.0172019   4.171 3.32e-05 ***
## I(n_impressions^0.25) 0.1115330  0.0008844 126.108  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1969 on 934 degrees of freedom
## Multiple R-squared:  0.9445, Adjusted R-squared:  0.9445 
## F-statistic: 1.59e+04 on 1 and 934 DF,  p-value: < 2.2e-16
mdl_click_vs_impression_orig %>% 
  glance() %>% 
  pull(sigma) # The typical difference between observed number of clicks and predicted number of clicks is about 20
## [1] 19.90584
mdl_click_vs_impression_trans %>% 
  glance() %>% 
  pull(sigma)
## [1] 0.1969064
# Manual calculation
ad_conversion %>% 
  mutate(residuals_sq = residuals(mdl_click_vs_impression_orig)^2) %>% 
  summarise(
    resid_sum_of_sq = sum(residuals_sq),
    deg_freedom = n() - 2,
    rse = sqrt(resid_sum_of_sq / deg_freedom)
  )
ad_conversion %>% 
  mutate(residuals_sq = residuals(mdl_click_vs_impression_trans) ^ 2) %>% 
  summarise(
    resid_sum_of_sq = sum(residuals_sq),
    deg_freedom = n() - 2,
    rse = sqrt(resid_sum_of_sq / deg_freedom)
)

RMSE

# Root-mean-square error (RMSE)
# Calculation
bream %>% 
  mutate(residuals_sq = residuals(mdl_bream) ^ 2) %>% 
  summarise(
    resid_sum_of_sq = sum(residuals_sq),
    n_obs = n(),
    rmse = sqrt(resid_sum_of_sq / n_obs)
  )
ad_conversion %>% 
  mutate(residuals_sq = residuals(mdl_click_vs_impression_orig) ^ 2) %>% 
  summarise(
    resid_sum_of_sq = sum(residuals_sq),
    n_obs = n(),
    rmse = sqrt(resid_sum_of_sq / n_obs)
)
ad_conversion %>% 
  mutate(residuals_sq = residuals(mdl_click_vs_impression_trans) ^ 2) %>% 
  summarise(
    resid_sum_of_sq = sum(residuals_sq),
    n_obs = n(),
    rmse = sqrt(resid_sum_of_sq / n_obs)
)

Visualizing model fit

  • A linear regression model is a good fit if
    • The residuals are normally distributed
    • The mean of the residuals is zero
  • Residuals vs. fitted values
    • LOESS trend line - A smooth curve following the data
    • If the residuals meet the assumption that they are normally distributed with mean zero, the trend line should closely follow the y = 0 line
    • Shows whether or not the residuals go positive or negative as the fitted values change
  • Q-Q plot
    • Shows whether or not the residuals follow a normal distribution
    • x-axis: The points are quantiles from the normal distribution
    • y-axis: the standardized residuals, which are the residuals divided by their standard deviation
    • If the points track along the straight line, they are normally distributed
  • Scale-location
    • The square root of the standardized residuals versus the fitted values
    • Shows whether the size of the residuals gets bigger or smaller as the fitted values change
  • autoplot() (a manual version of the first plot is sketched after this list)
    • Values for the which argument
      • 1: residuals vs. fitted values
      • 2: Q-Q plot
      • 3: scale-location
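The first of these plots can also be drawn by hand from augment() output, which makes explicit what autoplot() is showing; a minimal sketch using the bream model from earlier:

# Residuals vs. fitted values, built manually from augment()
mdl_bream %>% 
  augment() %>% 
  ggplot(aes(.fitted, .resid)) +
    geom_point() +
    geom_smooth(se = FALSE) +                        # LOESS trend line
    geom_hline(yintercept = 0, linetype = "dashed")  # residuals should hover around zero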
# Visualizing model fit
mdl_click_vs_impression_orig <- lm(formula = n_clicks ~ n_impressions, data = ad_conversion)
mdl_click_vs_impression_trans <- lm(I(n_clicks^0.25) ~ I(n_impressions^0.25), data = ad_conversion)

autoplot(
  mdl_click_vs_impression_orig,
  which = 1:3,
  nrow = 1,
  ncol = 3)

# The residuals track the "y equals 0" line more closely in the transformed model compared to the original model
# The residuals track the "normality" line more closely in the transformed model compared to the original model
# The size of the standardized residuals is more consistent in the transformed model compared to the original model
# Indicating that the transformed model is a better fit for the data
autoplot(
  mdl_click_vs_impression_trans,
  which = 1:3,
  nrow = 1,
  ncol = 3)

Outliers

  • Outliers are unusual data points, having either
    • Extreme explanatory values, or
    • Response values far away from the regression line
# Outliers
roach <- fish %>% filter(species == "Roach")

roach %>% 
  mutate(
    has_extreme_length = length_cm < 15 | length_cm > 26,
    has_extreme_mass = mass_g < 1
  ) %>% 
  ggplot(aes(length_cm, mass_g)) +
    geom_point(aes(color = has_extreme_length, shape = has_extreme_mass), size = 4) +
    geom_smooth(method = "lm", se = FALSE) +
    theme(text = element_text(size=14))
## `geom_smooth()` using formula = 'y ~ x'

Leverage

  • Leverage - A measure of how extreme the explanatory variable values are (see the formula below)
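For simple linear regression, leverage has a closed form that makes this explicit: the leverage of observation i grows with the squared distance of its explanatory value from the mean of the explanatory variable,

\[h_i = \frac{1}{n} + \frac{(x_i - \bar{x})^2}{\sum_{j=1}^{n}(x_j - \bar{x})^2}\]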
# Leverage
mdl_roach <- lm(mass_g ~ length_cm, data = roach)
hatvalues(mdl_roach)
##          1          2          3          4          5          6          7 
## 0.31372898 0.12553776 0.09348669 0.07628287 0.06838661 0.06189726 0.06049475 
##          8          9         10         11         12         13         14 
## 0.05681481 0.05026390 0.05009244 0.05009244 0.05055408 0.05091020 0.05807223 
##         15         16         17         18         19         20 
## 0.05807223 0.05930767 0.08839105 0.09948802 0.13338565 0.39474037
augment(mdl_roach)
# Highly leveraged roaches
mdl_roach %>% 
  augment() %>% 
  dplyr::select(mass_g, length_cm, leverage = .hat) %>% 
  arrange(desc(leverage)) %>% 
  head()

Influence

  • Influence - A type of “leave one out” metric. Measures how much the model would change if you reran it without that data point
    • The influence (Cook’s distance) of each observation is based on the size of the residuals and the leverage
    • A bigger number denotes more influence for the observation
# Influence
cooks.distance(mdl_roach)
##            1            2            3            4            5            6 
## 1.074015e+00 1.042888e-02 1.972286e-05 1.979659e-03 6.609838e-03 3.118518e-01 
##            7            8            9           10           11           12 
## 8.525058e-04 1.994809e-04 2.565139e-04 2.563127e-04 2.445136e-03 7.949762e-03 
##           13           14           15           16           17           18 
## 1.372065e-04 4.817271e-03 1.151628e-02 4.519344e-03 6.120857e-02 1.500641e-01 
##           19           20 
## 2.061450e-02 3.657822e-01
augment(mdl_roach)
# Most influential roaches
mdl_roach %>% 
  augment() %>% 
  dplyr::select(mass_g, length_cm, cooks_dist = .cooksd, residual = .resid, leverage = .hat) %>% 
  arrange(desc(cooks_dist)) %>% 
  head()
# Removing the most influential roach
roach_not_short <- roach %>% 
  filter(length_cm != 12.9)

ggplot(roach, aes(length_cm, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth(method = "lm", se = FALSE, data = roach_not_short, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# autoplot()
# Plots 4 to 6 are less helpful for diagnosis than the previous three, beyond labelling the most influential observations
autoplot(
  mdl_roach,
  which = 4:6,
  nrow = 3,
  ncol = 1
) +
theme(text=element_text(size=12))

Simple Logistic Regression

Fitting a logistic regression

  • Logistic regression
    • Generalized linear model (GLM)
      • Linear regression - Assumes that the residuals follow a Gaussian (normal) distribution
      • Logistic regression - Assumes that the response follows a binomial distribution
    • Used when the response variable is logical (binary)
    • The predictions follow a logistic (S-shaped) curve (see the formula after this list)
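In formula terms, logistic regression fits a straight line on the log-odds scale and squashes it through the logistic function, which is what produces the S-shaped curve (and explains the log odds ratio scale used in the predictions section below):

\[\ln\!\left(\frac{p}{1 - p}\right) = \text{intercept} + \text{slope} \times x \qquad\Longleftrightarrow\qquad p = \frac{1}{1 + e^{-(\text{intercept} + \text{slope} \times x)}}\]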
# Logistic Regression
# Using a linear model
mdl_churn_vs_recency_lm <- lm(has_churned ~ time_since_last_purchase, data = churn)

coeffs <- coefficients(mdl_churn_vs_recency_lm)
intercept <- coeffs[1]
slope <- coeffs[2]

# Predictions are probabilities of churn, not amounts of churn
# Problem: The model predicts probabilities greater than one or negative probabilities
g1 <- ggplot(churn, aes(time_since_last_purchase, has_churned)) +
  geom_point() +
  geom_abline(intercept = intercept, slope = slope) +
  xlim(-10, 10) + # Zooming out
  ylim(-0.2, 1.2)

# Linear regression using glm()
# Assuming Gaussian errors (family = gaussian) gives the same fit as the linear regression above
glm(has_churned ~ time_since_last_purchase, data = churn, family = gaussian)
## 
## Call:  glm(formula = has_churned ~ time_since_last_purchase, family = gaussian, 
##     data = churn)
## 
## Coefficients:
##              (Intercept)  time_since_last_purchase  
##                  0.49078                   0.06378  
## 
## Degrees of Freedom: 399 Total (i.e. Null);  398 Residual
## Null Deviance:       100 
## Residual Deviance: 98.02     AIC: 578.7
# Logistic regression
# family = binomial assumes a binomially distributed response, giving logistic regression
mdl_recency_glm <- glm(has_churned ~ time_since_last_purchase, data = churn, family = binomial)
mdl_recency_glm
## 
## Call:  glm(formula = has_churned ~ time_since_last_purchase, family = binomial, 
##     data = churn)
## 
## Coefficients:
##              (Intercept)  time_since_last_purchase  
##                 -0.03502                   0.26921  
## 
## Degrees of Freedom: 399 Total (i.e. Null);  398 Residual
## Null Deviance:       554.5 
## Residual Deviance: 546.4     AIC: 550.4
# The logistic regression curve never goes below zero or above one
g2 <- ggplot(churn, aes(time_since_last_purchase, has_churned)) +
  geom_point() +
  geom_abline(intercept = intercept, slope = slope) +
  stat_smooth(
    method = "glm",
    se = FALSE,
    method.args = list(family = binomial),
    fullrange=TRUE # Span the fit
  ) +
  xlim(-10, 10) +
  ylim(-0.2, 1.2)

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'

# time_since_last_purchase
# Churners typically have a longer time since their last purchase
g1 <- ggplot(churn, aes(time_since_last_purchase)) +
  geom_histogram(binwidth = 0.25) +
  facet_grid(rows = vars(has_churned))

# time_since_first_purchase
# Churners have a shorter length of relationship
g2 <- ggplot(churn, aes(time_since_first_purchase)) +
  geom_histogram(binwidth = 0.25) +
  facet_grid(rows = vars(has_churned))

grid.arrange(g1, g2, nrow = 1)

# Visualizing linear and logistic models
ggplot(churn, aes(time_since_first_purchase, has_churned)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  geom_smooth(method = "glm", se = FALSE, color = "green", method.args = list(family = binomial))
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# Fit a logistic regression
mdl_churn_vs_relationship <- glm(
  has_churned ~ time_since_first_purchase,
  data = churn,
  family = binomial
)
mdl_churn_vs_relationship
## 
## Call:  glm(formula = has_churned ~ time_since_first_purchase, family = binomial, 
##     data = churn)
## 
## Coefficients:
##               (Intercept)  time_since_first_purchase  
##                  -0.01518                   -0.35479  
## 
## Degrees of Freedom: 399 Total (i.e. Null);  398 Residual
## Null Deviance:       554.5 
## Residual Deviance: 543.7     AIC: 547.7

Predictions

# Predictions
# The ggplot predictions
plt_churn_vs_recency_base <- ggplot(churn, aes(time_since_last_purchase, has_churned)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    se = FALSE,
    method.args = list(family = binomial)
  )

# Making predictions
mdl_recency <- glm(has_churned ~ time_since_last_purchase, data = churn, family = "binomial")

explanatory_data <- tibble(time_since_last_purchase = seq(-1, 6, 0.25))

prediction_data <- explanatory_data %>% 
  mutate(
    has_churned = predict(mdl_recency, explanatory_data, type = "response"),
    # Getting the most likely outcome
    most_likely_outcome = round(has_churned) # Not churn if below 0.5, churn if over 0.5
  )

plt_churn_vs_recency_base +
  geom_point(data = prediction_data, color = "blue") +
  # Visualizing most likely outcome
  geom_point(aes(y = most_likely_outcome), data = prediction_data, color = "green")
## `geom_smooth()` using formula = 'y ~ x'

Odds ratios

  • Odds ratio - The probability that something happens, divided by the probability that it doesn’t
    • \(\text{odds_ratio} = \frac{\text{probability}}{1 - \text{probability}}\)
    • A probability of 0.25 corresponds to odds of 1/3, i.e. “three to one against” (worked through just after this list)
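For example, the probability of 0.25 mentioned above works out as follows:

# A probability of 0.25 corresponds to odds of 1/3, i.e. "three to one against"
p <- 0.25
odds <- p / (1 - p) # 0.333...
odds / (1 + odds)   # back-transforming the odds recovers the probability, 0.25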
# Odds ratios
odds_ratio <- data.frame(x = seq(0, 1, .001)) %>% 
  mutate(y = x/(1-x))

ggplot(odds_ratio, aes(x, y)) +
  geom_line(linewidth = 1) +
  labs(x = "probability", y = "odds_ratio") +
  theme(text=element_text(size=15))

mdl_recency <- glm(has_churned ~ time_since_last_purchase, data = churn, family = "binomial")

explanatory_data <- tibble(time_since_last_purchase = seq(-1, 6, 0.25))

prediction_data <- explanatory_data %>% 
  mutate(
    has_churned = predict(mdl_recency, explanatory_data, type = "response"),
    most_likely_response = round(has_churned),
    odds_ratio = has_churned / (1 - has_churned) # Calculating odds ratio
  )

# Visualizing odds ratio
g1 <- ggplot(prediction_data, aes(time_since_last_purchase, odds_ratio)) +
  geom_line() +
  geom_hline(yintercept = 1, linetype = "dotted")
# The dotted line where the odds ratio is one indicates where churning is just as likely as not churning
# In the bottom-left, the predictions are below one, so the chance of churning is less than the chance of not churning
# In the top-right, the chance of churning is about five times more than the chance of not churning

g2 <- ggplot(prediction_data, aes(time_since_last_purchase, odds_ratio)) +
  geom_line() +
  geom_hline(yintercept = 1, linetype = "dotted") +
  scale_y_log10()
# Logistic regression odds ratios on a log-scale change linearly with the explanatory variable
# This property of the logarithm of odds ratios means log-odds ratio is another common way of describing logistic regression predictions

grid.arrange(g1, g2, nrow = 1)

# Calculating log odds ratio
prediction_data <- explanatory_data %>% 
  mutate(
    has_churned = predict(mdl_recency, explanatory_data, type = "response"),
    most_likely_response = round(has_churned),
    odds_ratio = has_churned / (1 - has_churned),
    log_odds_ratio = log(odds_ratio),
    log_odds_ratio2 = predict(mdl_recency, explanatory_data) # Predict will return the log odds ratio if you don't specify the type argument
  )

prediction_data
Comparing the scales for describing logistic regression predictions:

  Scale                  Are values easy to interpret?   Are changes easy to interpret?   Is precise?
  Probability
  Most likely outcome    ✔✔
  Odds ratio
  Log odds ratio
mdl_churn_vs_relationship <- glm(has_churned ~ time_since_first_purchase, data = churn, family = binomial)

explanatory_data <- tibble(time_since_first_purchase = seq(-1.5, 4, 0.25))

prediction_data <- explanatory_data %>% 
  mutate(
    # Probability of a positive response
    has_churned = predict(mdl_churn_vs_relationship, explanatory_data, type = "response"),
    # Most likely outcome
    most_likely_outcome = round(has_churned),
    # Odds ratio
    odds_ratio = has_churned / (1 - has_churned),
    # Log odds ratio
    log_odds_ratio = log(odds_ratio),
    log_odds_ratio2 = predict(mdl_churn_vs_relationship, explanatory_data)
  )

g1 <- ggplot(churn, aes(time_since_first_purchase, has_churned)) +
  geom_point() +
  geom_smooth(method = "glm", se = FALSE, method.args = list(family = binomial)) +
  geom_point(data = prediction_data, color = "yellow", size = 2) +
  labs(title = "Probability of a positive response")

g2 <- g1 +
  geom_point(data = prediction_data, aes(time_since_first_purchase, most_likely_outcome), color = "yellow", size = 2) +
  labs(title = "Most likely outcome")

g3 <- ggplot(prediction_data, aes(time_since_first_purchase, odds_ratio)) +
  geom_line() +
  geom_hline(yintercept = 1, linetype = "dotted") +
  labs(title = "Odds ratio")

g4 <- ggplot(prediction_data, aes(time_since_first_purchase, odds_ratio)) +
  geom_line() +
  geom_hline(yintercept = 1, linetype = "dotted") +
  scale_y_log10() +
  labs(title = "Log odds ratio")

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

grid.arrange(g3, g4, nrow = 1)

Quantifying logistic regression fit

  • Confusion matrix (Confusion table): Counts of outcomes
    • The basis of all performance metrics for models with a categorical response
                    actual false             actual true
  predicted false   correct (TN)             false negative (FN)
  predicted true    false positive (FP)      correct (TP)

Performance metrics

  • Accuracy - The proportion of correct predictions
    • \(\text{accuracy} = \frac{TN+TP}{TN+FN+FP+TP}\)
    • Higher accuracy is better
  • Sensitivity - The proportion of true positives
    • \(\text{sensitivity} = \frac{TP}{FN+TP}\)
    • Higher sensitivity is better
  • Specificity - The proportion of true negatives
    • \(\text{specificity} = \frac{TN}{TN+FP}\)
    • Higher specificity is better
    • There is a tradeoff: improving specificity tends to decrease sensitivity, and vice versa (a sketch computing all three metrics follows this list)
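The formulas above map directly onto the counts in the confusion matrix. A minimal sketch, assuming a 2x2 table laid out like the outcomes table built with table() just below (predicted response in rows, actual response in columns); confusion_metrics is a hypothetical helper, not a yardstick function:

# Hypothetical helper: compute the three metrics from a table of counts
confusion_metrics <- function(outcomes) {
  TN <- outcomes["0", "0"] # predicted negative, actually negative
  FN <- outcomes["0", "1"] # predicted negative, actually positive
  FP <- outcomes["1", "0"] # predicted positive, actually negative
  TP <- outcomes["1", "1"] # predicted positive, actually positive
  c(
    accuracy    = (TN + TP) / (TN + FN + FP + TP),
    sensitivity = TP / (FN + TP),
    specificity = TN / (TN + FP)
  )
}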
# Quantifying logistic regression fit
mdl_recency <- glm(has_churned ~ time_since_last_purchase, data = churn, family = "binomial")

actual_response <- churn$has_churned
predicted_response <- round(fitted(mdl_recency))

outcomes <- table(predicted_response, actual_response)
outcomes
##                   actual_response
## predicted_response   0   1
##                  0 141 111
##                  1  59  89
# Visualizing the confusion matrix: mosaic plot
confusion <- conf_mat(outcomes)
confusion
##                   actual_response
## predicted_response   0   1
##                  0 141 111
##                  1  59  89
autoplot(confusion) +
  theme(text=element_text(size=12))

# Performance metrics
summary(confusion, event_level = "second") # the second factor level ("1", churned) is the positive response
# Accuracy
summary(confusion, event_level = "second") %>% slice(1)
(141 + 89) / (141 + 111 + 59 + 89)
## [1] 0.575
# Sensitivity
summary(confusion, event_level = "second") %>% slice(3)
89 / (111 + 89)
## [1] 0.445
# Specificity
summary(confusion, event_level = "second") %>% slice(4)
141 / (141 + 59)
## [1] 0.705