# Load the packages
suppressMessages(suppressWarnings(library(tidyverse)))
suppressMessages(suppressWarnings(library(gridExtra)))
suppressMessages(suppressWarnings(library(fst)))
suppressMessages(suppressWarnings(library(moderndive)))
suppressMessages(suppressWarnings(library(broom)))
suppressMessages(suppressWarnings(library(plot3D)))
suppressMessages(suppressWarnings(library(magrittr)))
suppressMessages(suppressWarnings(library(yardstick)))

# Read in the data files
fish <- read.csv("fish.csv")
taiwan_real_estate <- read.fst("taiwan_real_estate2.fst")
auctions <- read.csv("auctions.csv")
churn <- read.fst("churn.fst")

Multiple regression

  • Simple regression - A regression model with a single explanatory variable
  • Multiple regression - A regression model with more than one explanatory variable (formula patterns are sketched after this list)
    • More explanatory variables can give more insight and better predictions
    • Linear regression
      • Parallel slopes linear regression - One numeric explanatory variable and one categorical explanatory variable
    • Logistic regression

Parallel slopes linear regression

Fitting parallel slopes

  • Parallel slopes linear regression - One numeric explanatory variable and one categorical explanatory variable
# Parallel slopes linear regression
# One numeric explanatory variable
# One intercept coefficient and one slope coefficient
mdl_mass_vs_length <- lm(mass_g ~ length_cm, data = fish)
mdl_mass_vs_length
## 
## Call:
## lm(formula = mass_g ~ length_cm, data = fish)
## 
## Coefficients:
## (Intercept)    length_cm  
##      -536.2         34.9
# One categorical explanatory variable
# One intercept coefficient for each category
mdl_mass_vs_species <- lm(mass_g ~ species + 0, data = fish)
mdl_mass_vs_species
## 
## Call:
## lm(formula = mass_g ~ species + 0, data = fish)
## 
## Coefficients:
## speciesBream  speciesPerch   speciesPike  speciesRoach  
##        617.8         382.2         718.7         152.0
# One numeric explanatory variable and one categorical explanatory variable
# One slope coefficient, and one intercept coefficient for each category
mdl_mass_vs_both <- lm(mass_g ~ length_cm + species + 0, data = fish)
mdl_mass_vs_both
## 
## Call:
## lm(formula = mass_g ~ length_cm + species + 0, data = fish)
## 
## Coefficients:
##    length_cm  speciesBream  speciesPerch   speciesPike  speciesRoach  
##        42.57       -672.24       -713.29      -1089.46       -726.78
# Visualization
# One numeric explanatory variable
g1 <- ggplot(fish, aes(length_cm, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# One categorical explanatory variable
g2 <- ggplot(fish, aes(species, mass_g)) +
  geom_boxplot() +
  stat_summary(fun = mean, shape = 15)

# One numeric explanatory variable and one categorical explanatory variable
g3 <- ggplot(fish, aes(length_cm, mass_g, color = species)) +
  geom_point() +
  geom_parallel_slopes(se = FALSE)

grid.arrange(g1, g2, g3, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing missing values (`geom_segment()`).

mdl_price_vs_conv <- lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)
mdl_price_vs_conv
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_real_estate)
## 
## Coefficients:
##   (Intercept)  n_convenience  
##        8.2242         0.7981
mdl_price_vs_age <- lm(price_twd_msq ~ house_age_years + 0, data = taiwan_real_estate)
mdl_price_vs_age
## 
## Call:
## lm(formula = price_twd_msq ~ house_age_years + 0, data = taiwan_real_estate)
## 
## Coefficients:
##  house_age_years0 to 15  house_age_years15 to 30  house_age_years30 to 45  
##                  12.637                    9.877                   11.393
mdl_price_vs_both <- lm(price_twd_msq ~ n_convenience + house_age_years + 0, data = taiwan_real_estate)
mdl_price_vs_both
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience + house_age_years + 
##     0, data = taiwan_real_estate)
## 
## Coefficients:
##           n_convenience   house_age_years0 to 15  house_age_years15 to 30  
##                  0.7915                   9.4133                   7.0852  
## house_age_years30 to 45  
##                  7.5110
g1 <- ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

g2 <- ggplot(taiwan_real_estate, aes(house_age_years, price_twd_msq)) +
  geom_boxplot() +
  stat_summary(fun = mean, shape = 15)

g3 <- ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq, color = house_age_years)) +
  geom_point() +
  geom_parallel_slopes(se = FALSE)

grid.arrange(g1, g2, g3, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

Predicting parallel slopes

# Predicting parallel slopes
# Choosing values for explanatory variables
explanatory_data <- expand_grid( # `expand_grid` returns a data frame of all combinations of its inputs
  length_cm = seq(5, 60, 5),
  species = unique(fish$species)
)
glimpse(explanatory_data)
## Rows: 48
## Columns: 2
## $ length_cm <dbl> 5, 5, 5, 5, 10, 10, 10, 10, 15, 15, 15, 15, 20, 20, 20, 20, …
## $ species   <chr> "Bream", "Roach", "Perch", "Pike", "Bream", "Roach", "Perch"…
# Add a column of predictions to the data frame
prediction_data <- explanatory_data %>% 
  mutate(mass_g = predict(mdl_mass_vs_both, explanatory_data))

# Visualizing the predictions
ggplot(fish, aes(length_cm, mass_g, color = species)) +
  geom_point() +
  geom_parallel_slopes(se = FALSE) +
  geom_point(data = prediction_data, size = 3, shape = 15)

# Manually calculating predictions
coeffs <- coefficients(mdl_mass_vs_both)

slope <- coeffs[1]
intercept_bream <- coeffs[2]
intercept_perch <- coeffs[3]
intercept_pike <- coeffs[4]
intercept_roach <- coeffs[5]

explanatory_data %>% 
  mutate(
    intercept = case_when(
      species == "Bream" ~ intercept_bream,
      species == "Perch" ~ intercept_perch,
      species == "Pike" ~ intercept_pike,
      species == "Roach" ~ intercept_roach
    ),
    mass_g = intercept + slope * length_cm
  )
predict(mdl_mass_vs_both, explanatory_data)
##          1          2          3          4          5          6          7 
## -459.39910 -513.93503 -500.45009 -876.61328 -246.55633 -301.09226 -287.60732 
##          8          9         10         11         12         13         14 
## -663.77051  -33.71355  -88.24949  -74.76455 -450.92774  179.12922  124.59328 
##         15         16         17         18         19         20         21 
##  138.07822 -238.08497  391.97199  337.43605  350.92099  -25.24220  604.81476 
##         22         23         24         25         26         27         28 
##  550.27882  563.76376  187.60057  817.65753  763.12159  776.60653  400.44334 
##         29         30         31         32         33         34         35 
## 1030.50030  975.96436  989.44930  613.28611 1243.34307 1188.80713 1202.29207 
##         36         37         38         39         40         41         42 
##  826.12888 1456.18584 1401.64990 1415.13484 1038.97165 1669.02861 1614.49268 
##         43         44         45         46         47         48 
## 1627.97761 1251.81442 1881.87138 1827.33545 1840.82038 1464.65719
explanatory_data <- expand_grid(
  n_convenience = 0:10,
  house_age_years = unique(taiwan_real_estate$house_age_years)
)
glimpse(explanatory_data)
## Rows: 33
## Columns: 2
## $ n_convenience   <int> 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, …
## $ house_age_years <fct> 30 to 45, 15 to 30, 0 to 15, 30 to 45, 15 to 30, 0 to …
prediction_data <- explanatory_data %>% 
  mutate(price_twd_msq = predict(mdl_price_vs_both, explanatory_data))

taiwan_real_estate %>% 
  ggplot(aes(n_convenience, price_twd_msq, color = house_age_years)) +
  geom_point() +
  geom_parallel_slopes(se = FALSE) +
  geom_point(data = prediction_data, size = 5, shape = 15)

# Manually calculating predictions
coeffs <- coefficients(mdl_price_vs_both)

slope <- coeffs[1]
intercept_0_15 <- coeffs[2]
intercept_15_30 <- coeffs[3]
intercept_30_45 <- coeffs[4]

explanatory_data %>% 
  mutate(
    intercept = case_when(
      house_age_years == "0 to 15" ~ intercept_0_15,
      house_age_years == "15 to 30" ~ intercept_15_30,
      house_age_years == "30 to 45" ~ intercept_30_45
    ),
    price_twd_msq = intercept + slope * n_convenience
  )
predict(mdl_price_vs_both, explanatory_data)
##         1         2         3         4         5         6         7         8 
##  7.510958  7.085169  9.413325  8.302415  7.876627 10.204782  9.093873  8.668084 
##         9        10        11        12        13        14        15        16 
## 10.996239  9.885330  9.459541 11.787696 10.676787 10.250998 12.579153 11.468244 
##        17        18        19        20        21        22        23        24 
## 11.042455 13.370610 12.259701 11.833912 14.162067 13.051158 12.625369 14.953525 
##        25        26        27        28        29        30        31        32 
## 13.842615 13.416826 15.744982 14.634072 14.208284 16.536439 15.425530 14.999741 
##        33 
## 17.327896

Assessing model performance

  • Model performance metrics
    • Coefficient of determination (R-squared) - Measures how well the regression’s prediction line fits the observed values
      • Larger is better
      • [0, 1]
    • Residual standard error (RSE) - The typical size of the residuals
      • Smaller is better
    • Adjusted coefficient of determination
      • \(\bar{R}^2 = 1 - (1 - R^2) \frac{n_{obs} - 1}{n_{obs} - n_{var} - 1}\) (checked in the sketch after this list)
      • Adding more explanatory variables increases \(R^2\)
      • Too many explanatory variables cause overfitting
      • The adjusted coefficient of determination penalizes additional explanatory variables
      • The penalty is noticeable when \(R^2\) is small, or when \(n_{var}\) is a large fraction of \(n_{obs}\)
      • In glance(), it’s the adj.r.squared element
# Assessing model performance
# Coefficient of determination
mdl_mass_vs_species %>% 
  glance() %>% 
  pull(r.squared)
## [1] 0.7163347
mdl_mass_vs_length %>% 
  glance() %>% 
  pull(r.squared)
## [1] 0.822569
mdl_mass_vs_both %>% 
  glance() %>% 
  pull(r.squared)
## [1] 0.9694266
# Adjusted coefficient of determination
mdl_mass_vs_species %>% 
  glance() %>% 
  dplyr::select(r.squared, adj.r.squared)
mdl_mass_vs_length %>% 
  glance() %>% 
  dplyr::select(r.squared, adj.r.squared)
mdl_mass_vs_both %>% 
  glance() %>% 
  dplyr::select(r.squared, adj.r.squared)
# Residual standard error
mdl_mass_vs_species %>% 
  glance() %>% 
  pull(sigma)
## [1] 313.5501
mdl_mass_vs_length %>% 
  glance() %>% 
  pull(sigma)
## [1] 152.1209
mdl_mass_vs_both %>% 
  glance() %>% 
  pull(sigma)
## [1] 103.3556
mdl_price_vs_conv %>% 
  glance() %>% 
  dplyr::select(r.squared, adj.r.squared)
mdl_price_vs_age %>% 
  glance() %>% 
  dplyr::select(r.squared, adj.r.squared)
mdl_price_vs_both %>% 
  glance() %>% 
  dplyr::select(r.squared, adj.r.squared)
mdl_price_vs_conv %>% 
  glance() %>% 
  pull(sigma)
## [1] 3.383888
mdl_price_vs_age %>% 
  glance() %>% 
  pull(sigma)
## [1] 3.950184
mdl_price_vs_both %>% 
  glance() %>% 
  pull(sigma)
## [1] 3.21346

Models for each category

  • Splitting the dataset in a smart way (see the sketch below)
    • base-R: split() + lapply()
    • dplyr: nest_by() + mutate()
# Models for each category
# 4 categories
unique(fish$species)
## [1] "Bream" "Roach" "Perch" "Pike"
# Splitting the dataset
bream <- fish %>% filter(species == "Bream")
perch <- fish %>% filter(species == "Perch")
pike <- fish %>% filter(species == "Pike")
roach <- fish %>% filter(species == "Roach")

# 4 models
mdl_bream <- lm(mass_g ~ length_cm, data = bream)
mdl_bream
## 
## Call:
## lm(formula = mass_g ~ length_cm, data = bream)
## 
## Coefficients:
## (Intercept)    length_cm  
##    -1035.35        54.55
mdl_perch <- lm(mass_g ~ length_cm, data = perch)
mdl_perch
## 
## Call:
## lm(formula = mass_g ~ length_cm, data = perch)
## 
## Coefficients:
## (Intercept)    length_cm  
##     -619.18        38.91
mdl_pike <- lm(mass_g ~ length_cm, data = pike)
mdl_pike
## 
## Call:
## lm(formula = mass_g ~ length_cm, data = pike)
## 
## Coefficients:
## (Intercept)    length_cm  
##    -1540.82        53.19
mdl_roach <- lm(mass_g ~ length_cm, data = roach)
mdl_roach
## 
## Call:
## lm(formula = mass_g ~ length_cm, data = roach)
## 
## Coefficients:
## (Intercept)    length_cm  
##     -329.38        23.32
# Explanatory data
explanatory_data <- tibble(length_cm = seq(5, 60, 5))

# Making predictions
prediction_data_bream <- explanatory_data %>% 
  mutate(
    mass_g = predict(mdl_bream, explanatory_data),
    species = "Bream"
  )
prediction_data_perch <- explanatory_data %>% 
  mutate(
    mass_g = predict(mdl_perch, explanatory_data),
    species = "Perch"
  )
prediction_data_pike <- explanatory_data %>% 
  mutate(
    mass_g = predict(mdl_pike, explanatory_data),
    species = "Pike"
  )
prediction_data_roach <- explanatory_data %>% 
  mutate(
    mass_g = predict(mdl_roach, explanatory_data),
    species = "Roach"
  )

# Visualizing predictions
g1 <- ggplot(fish, aes(length_cm, mass_g)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

g2 <- ggplot(fish, aes(length_cm, mass_g, color = species)) +
  geom_point() +
  geom_parallel_slopes(se = FALSE)

g3 <- ggplot(fish, aes(length_cm, mass_g, color = species)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data_bream, size = 3, shape = 15) +
  geom_point(data = prediction_data_perch, size = 3, shape = 15) +
  geom_point(data = prediction_data_pike, size = 3, shape = 15) +
  geom_point(data = prediction_data_roach, size = 3, shape = 15)

grid.arrange(g1, g2, g3, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# A mixed performance result like this is quite common: the whole-dataset model benefits from the increased power of more rows of data, whereas the individual models benefit from not having to fit differing trends across species
mdl_fish <- lm(mass_g ~ length_cm + species, data = fish)

# Coefficient of determination
mdl_fish %>% glance() %>% pull(adj.r.squared)
## [1] 0.9174431
mdl_bream %>% glance() %>% pull(adj.r.squared)
## [1] 0.8743676
mdl_perch %>% glance() %>% pull(adj.r.squared)
## [1] 0.9169461
mdl_pike %>% glance() %>% pull(adj.r.squared)
## [1] 0.9410872
mdl_roach %>% glance() %>% pull(adj.r.squared)
## [1] 0.8152746
# Residual standard error
mdl_fish %>% glance() %>% pull(sigma)
## [1] 103.3556
mdl_bream %>% glance() %>% pull(sigma)
## [1] 74.15224
mdl_perch %>% glance() %>% pull(sigma)
## [1] 100.1802
mdl_pike %>% glance() %>% pull(sigma)
## [1] 119.9377
mdl_roach %>% glance() %>% pull(sigma)
## [1] 38.1784
# One model per category
taiwan_0_to_15 <- taiwan_real_estate %>% filter(house_age_years == "0 to 15")
taiwan_15_to_30 <- taiwan_real_estate %>% filter(house_age_years == "15 to 30")
taiwan_30_to_45 <- taiwan_real_estate %>% filter(house_age_years == "30 to 45")

mdl_0_to_15 <- lm(price_twd_msq ~ n_convenience, data = taiwan_0_to_15)
mdl_15_to_30 <- lm(price_twd_msq ~ n_convenience, data = taiwan_15_to_30)
mdl_30_to_45 <- lm(price_twd_msq ~ n_convenience, data = taiwan_30_to_45)

mdl_0_to_15
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_0_to_15)
## 
## Coefficients:
##   (Intercept)  n_convenience  
##        9.2417         0.8336
mdl_15_to_30
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_15_to_30)
## 
## Coefficients:
##   (Intercept)  n_convenience  
##        6.8719         0.8519
mdl_30_to_45
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_30_to_45)
## 
## Coefficients:
##   (Intercept)  n_convenience  
##        8.1131         0.6687
# Predicting multiple models
explanatory_data <- tibble(n_convenience = 0:10)

prediction_data_0_to_15 <- explanatory_data %>% 
  mutate(price_twd_msq = predict(mdl_0_to_15, explanatory_data))
prediction_data_15_to_30 <- explanatory_data %>% 
  mutate(price_twd_msq = predict(mdl_15_to_30, explanatory_data))
prediction_data_30_to_45 <- explanatory_data %>% 
  mutate(price_twd_msq = predict(mdl_30_to_45, explanatory_data))

# Visualizing multiple models
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq, color = house_age_years)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data_0_to_15, size = 3, shape = 15, color = "red") +
  geom_point(data = prediction_data_15_to_30, size = 3, shape = 15, color = "green") +
  geom_point(data = prediction_data_30_to_45, size = 3, shape = 15, color = "blue")
## `geom_smooth()` using formula = 'y ~ x'

# Assessing model performance
mdl_all_ages <- lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)

mdl_all_ages %>% glance() %>% pull(r.squared)
## [1] 0.3260466
mdl_0_to_15 %>% glance() %>% pull(r.squared)
## [1] 0.3120536
mdl_15_to_30 %>% glance() %>% pull(r.squared)
## [1] 0.4424605
mdl_30_to_45 %>% glance() %>% pull(r.squared)
## [1] 0.3125713
mdl_all_ages %>% glance() %>% pull(sigma)
## [1] 3.383888
mdl_0_to_15 %>% glance() %>% pull(sigma)
## [1] 3.564127
mdl_15_to_30 %>% glance() %>% pull(sigma)
## [1] 2.585273
mdl_30_to_45 %>% glance() %>% pull(sigma)
## [1] 3.239037

One model with an interaction

  • Specify interactions between explanatory variables - A single model that contains intercepts and slopes for each category
  • Interaction - The effect of one explanatory variable on the expected response changes depending on the value of another explanatory variable (The two explanatory variables interact / Interactions between explanatory variables)
    • E.g. Different fish species have different mass to length ratios. The effect of length on the expected mass is different for different species
    • No interactions - response ~ explanatory1 + explanatory2
    • With interactions (implicit) - response ~ explanatory1 * explanatory2
      • Implicit - You don’t write down which interactions are needed - R figures that out itself
    • With interactions (explicit) - response ~ explanatory1 + explanatory2 + explanatory1:explanatory2

Modeling with interactions

# One model with an interaction
lm(mass_g ~ length_cm * species, data = fish) # The coefficients are tricky to understand
## 
## Call:
## lm(formula = mass_g ~ length_cm * species, data = fish)
## 
## Coefficients:
##            (Intercept)               length_cm            speciesPerch  
##              -1035.348                  54.550                 416.172  
##            speciesPike            speciesRoach  length_cm:speciesPerch  
##               -505.477                 705.971                 -15.639  
##  length_cm:speciesPike  length_cm:speciesRoach  
##                 -1.355                 -31.231
lm(mass_g ~ length_cm * species + 0, data = fish) # The coefficients are tricky to understand
## 
## Call:
## lm(formula = mass_g ~ length_cm * species + 0, data = fish)
## 
## Coefficients:
##              length_cm            speciesBream            speciesPerch  
##                 54.550               -1035.348                -619.175  
##            speciesPike            speciesRoach  length_cm:speciesPerch  
##              -1540.824                -329.376                 -15.639  
##  length_cm:speciesPike  length_cm:speciesRoach  
##                 -1.355                 -31.231
lm(mass_g ~ species + species:length_cm + 0, data = fish) # Zero removes the global intercept
## 
## Call:
## lm(formula = mass_g ~ species + species:length_cm + 0, data = fish)
## 
## Coefficients:
##           speciesBream            speciesPerch             speciesPike  
##               -1035.35                 -619.18                -1540.82  
##           speciesRoach  speciesBream:length_cm  speciesPerch:length_cm  
##                -329.38                   54.55                   38.91  
##  speciesPike:length_cm  speciesRoach:length_cm  
##                  53.19                   23.32
# Specifying an interaction
lm(price_twd_msq ~ n_convenience * house_age_years, data = taiwan_real_estate)
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience * house_age_years, 
##     data = taiwan_real_estate)
## 
## Coefficients:
##                           (Intercept)                          n_convenience  
##                               9.24170                                0.83359  
##               house_age_years15 to 30                house_age_years30 to 45  
##                              -2.36978                               -1.12858  
## n_convenience:house_age_years15 to 30  n_convenience:house_age_years30 to 45  
##                               0.01833                               -0.16489
lm(price_twd_msq ~ n_convenience + house_age_years + n_convenience:house_age_years, data = taiwan_real_estate)
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience + house_age_years + 
##     n_convenience:house_age_years, data = taiwan_real_estate)
## 
## Coefficients:
##                           (Intercept)                          n_convenience  
##                               9.24170                                0.83359  
##               house_age_years15 to 30                house_age_years30 to 45  
##                              -2.36978                               -1.12858  
## n_convenience:house_age_years15 to 30  n_convenience:house_age_years30 to 45  
##                               0.01833                               -0.16489
# Interactions with understandable coeffs
mdl_readable_inter <- lm(price_twd_msq ~ house_age_years + house_age_years:n_convenience + 0, data = taiwan_real_estate)
mdl_readable_inter
## 
## Call:
## lm(formula = price_twd_msq ~ house_age_years + house_age_years:n_convenience + 
##     0, data = taiwan_real_estate)
## 
## Coefficients:
##                house_age_years0 to 15                house_age_years15 to 30  
##                                9.2417                                 6.8719  
##               house_age_years30 to 45   house_age_years0 to 15:n_convenience  
##                                8.1131                                 0.8336  
## house_age_years15 to 30:n_convenience  house_age_years30 to 45:n_convenience  
##                                0.8519                                 0.6687
coefficients(mdl_0_to_15)
##   (Intercept) n_convenience 
##     9.2417022     0.8335867
coefficients(mdl_15_to_30)
##   (Intercept) n_convenience 
##     6.8719186     0.8519172
coefficients(mdl_30_to_45)
##   (Intercept) n_convenience 
##     8.1131235     0.6686981

Making predictions with interactions

# Making predictions with interactions
mdl_mass_vs_both_inter <- lm(mass_g ~ species + species:length_cm + 0, data = fish)

explanatory_data <- expand_grid(
  length_cm = seq(5, 60, 5),
  species = unique(fish$species)
)

prediction_data <- explanatory_data %>% 
  mutate(mass_g = predict(mdl_mass_vs_both_inter, explanatory_data))

# Visualizing the predictions
ggplot(fish, aes(length_cm, mass_g, color = species)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, size = 3, shape = 15)
## `geom_smooth()` using formula = 'y ~ x'

# Manually calculating the predictions
coeffs <- coefficients(mdl_mass_vs_both_inter)

intercept_bream <- coeffs[1]
intercept_perch <- coeffs[2]
intercept_pike <- coeffs[3]
intercept_roach <- coeffs[4]
slope_bream <- coeffs[5]
slope_perch <- coeffs[6]
slope_pike <- coeffs[7]
slope_roach <- coeffs[8]

explanatory_data %>% 
  mutate(
    mass_g = case_when(
      species == "Bream" ~ intercept_bream + slope_bream * length_cm,
      species == "Perch" ~ intercept_perch + slope_perch * length_cm,
      species == "Pike" ~ intercept_pike + slope_pike * length_cm,
      species == "Roach" ~ intercept_roach + slope_roach * length_cm
      )
  )
# Predicting with interactions
mdl_price_vs_both_inter <- lm(price_twd_msq ~ house_age_years + n_convenience:house_age_years + 0, data = taiwan_real_estate)

explanatory_data <- expand.grid(
  n_convenience = 0:10,
  house_age_years = unique(taiwan_real_estate$house_age_years)
)

prediction_data <- explanatory_data %>% 
  mutate(price_twd_msq = predict(mdl_price_vs_both_inter, explanatory_data))

ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq, color = house_age_years)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(data = prediction_data, size = 5, shape = 15)
## `geom_smooth()` using formula = 'y ~ x'

# Manually calculating predictions with interactions
coeffs <- coefficients(mdl_price_vs_both_inter)

intercept_0_15 <- coeffs[1]
intercept_15_30 <- coeffs[2]
intercept_30_45 <- coeffs[3]
slope_0_15 <- coeffs[4]
slope_15_30 <- coeffs[5]
slope_30_45 <- coeffs[6]

prediction_data <- explanatory_data %>% 
  mutate(
    price_twd_msq = case_when(
      house_age_years == "0 to 15" ~ coeffs[1] + coeffs[4] * n_convenience,
      house_age_years == "15 to 30" ~ coeffs[2] + coeffs[5] * n_convenience,
      house_age_years == "30 to 45" ~ coeffs[3] + coeffs[6] * n_convenience
    )
  )

Simpson’s Paradox

  • The difference between models of a whole dataset and individual models for each category
  • Simpson’s Paradox - Occurs when the trend of a model on the whole dataset is very different from the trends shown by models on subsets of the dataset (see the synthetic sketch after this list)
    • trend = slope coefficient
  • Reconciling the difference
    • Good advice - If possible, try to plot the dataset
    • Common advice - You can’t choose the best model in general - it depends on the dataset and the question you are trying to answer
    • More good advice - Articulate a question before you start modeling
    • Usually (but not always) the grouped model contains more insight
    • Are you missing explanatory variables?
    • Context is important
# Simpson's Paradox
glimpse(auctions)
## Rows: 343
## Columns: 3
## $ price        <dbl> 260.00, 256.86, 260.00, 238.02, 231.50, 251.11, 247.50, 2…
## $ openbid      <dbl> 0.01, 0.01, 0.01, 0.01, 1.00, 9.99, 215.00, 155.00, 50.00…
## $ auction_type <chr> "7 day auction", "3 day auction", "5 day auction", "7 day…
# Whole
mdl_price_vs_openbid <- lm(price ~ openbid, data = auctions)
mdl_price_vs_openbid
## 
## Call:
## lm(formula = price ~ openbid, data = auctions)
## 
## Coefficients:
## (Intercept)      openbid  
##  229.245667    -0.002098
g1 <- ggplot(auctions, aes(openbid, price)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# By group
mdl_price_vs_both <- lm(price ~ auction_type + openbid:auction_type + 0, data = auctions)
mdl_price_vs_both
## 
## Call:
## lm(formula = price ~ auction_type + openbid:auction_type + 0, 
##     data = auctions)
## 
## Coefficients:
##         auction_type3 day auction          auction_type5 day auction  
##                        226.369005                         221.599320  
##         auction_type7 day auction  auction_type3 day auction:openbid  
##                        231.602861                          -0.029026  
## auction_type5 day auction:openbid  auction_type7 day auction:openbid  
##                          0.084014                           0.003682
g2 <- ggplot(auctions, aes(openbid, price, color = auction_type)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Multiple Linear Regression

  • Two numeric explanatory variables
    • Visualizing 3 numeric variables
      • 3D scatter plot
      • 2D scatter plot with response as color

Two numeric explanatory variables

# Multiple Linear Regression
# Two numeric explanatory variables
# 3D scatter plot
# The plot is impossible to interpret
scatter3D(fish$length_cm, fish$height_cm, fish$mass_g)

fish %$% # magrittr's dollar pipe
  scatter3D(length_cm, height_cm, mass_g)

# 2D scatter plot, color for response
# "viridis" is a set of color scales whose colors are easier to distinguish
# `scale_color_viridis_c()` is the continuous variant, for numeric data
ggplot(fish, aes(length_cm, height_cm, color = mass_g)) +
  geom_point() +
  scale_color_viridis_c(option = "inferno") # Viridis color scales

# Modeling with 2 numeric explanatory variables
# A global intercept coefficient, and one slope coefficient for each explanatory variable
mdl_mass_vs_both <- lm(mass_g ~ length_cm + height_cm, data = fish)
mdl_mass_vs_both
## 
## Call:
## lm(formula = mass_g ~ length_cm + height_cm, data = fish)
## 
## Coefficients:
## (Intercept)    length_cm    height_cm  
##     -622.15        28.97        26.33
explanatory_data <- expand_grid(
  length_cm = seq(5, 60, 5),
  height_cm = seq(2, 20, 2)
)

prediction_data <- explanatory_data %>% 
  mutate(mass_g = predict(mdl_mass_vs_both, explanatory_data))

# The color grid gives a nice overview of how the response variable changes over the plane of the explanatory variables. The heaviest fish are in the top-right, where they are long and tall
ggplot(fish, aes(length_cm, height_cm, color = mass_g)) +
  geom_point() +
  scale_color_viridis_c(option = "inferno") +
  geom_point(data = prediction_data, shape = 15, size = 3)

# Including an interaction
mdl_mass_vs_both_inter <- lm(mass_g ~ length_cm * height_cm, data = fish)
mdl_mass_vs_both_inter
## 
## Call:
## lm(formula = mass_g ~ length_cm * height_cm, data = fish)
## 
## Coefficients:
##         (Intercept)            length_cm            height_cm  
##            159.1075               0.3014             -78.1252  
## length_cm:height_cm  
##              3.5454
explanatory_data <- expand_grid(
  length_cm = seq(5, 60, 5),
  height_cm = seq(2, 20, 2)
)

prediction_data <- explanatory_data %>% 
  mutate(mass_g = predict(mdl_mass_vs_both_inter, explanatory_data))

ggplot(fish, aes(length_cm, height_cm, color = mass_g)) +
  geom_point() +
  scale_color_viridis_c(option = "inferno") +
  geom_point(data = prediction_data, shape = 15, size = 3)

# 3D visualizations
taiwan_real_estate %$% 
  scatter3D(n_convenience, sqrt(dist_to_mrt_m), price_twd_msq)

ggplot(taiwan_real_estate, aes(n_convenience, sqrt(dist_to_mrt_m), color = price_twd_msq)) + 
  geom_point() +
  scale_color_viridis_c(option = "plasma")

# Modeling 2 numeric explanatory variables
mdl_price_vs_conv_dist <- lm(price_twd_msq ~ n_convenience + sqrt(dist_to_mrt_m), data = taiwan_real_estate)
mdl_price_vs_conv_dist
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience + sqrt(dist_to_mrt_m), 
##     data = taiwan_real_estate)
## 
## Coefficients:
##         (Intercept)        n_convenience  sqrt(dist_to_mrt_m)  
##             15.1038               0.2142              -0.1573
explanatory_data <- expand_grid(
  n_convenience = 0:10,
  dist_to_mrt_m = seq(0, 80, 10) ^ 2
)

prediction_data <- explanatory_data %>% 
  mutate(price_twd_msq = predict(mdl_price_vs_conv_dist, explanatory_data))

ggplot(taiwan_real_estate, aes(n_convenience, sqrt(dist_to_mrt_m), color = price_twd_msq)) +
  geom_point() +
  scale_color_viridis_c(option = "plasma") +
  geom_point(data = prediction_data, size = 3, color = "yellow")

# Including an interaction
mdl_price_vs_conv_dist <- lm(price_twd_msq ~ n_convenience * sqrt(dist_to_mrt_m), data = taiwan_real_estate)
mdl_price_vs_conv_dist
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience * sqrt(dist_to_mrt_m), 
##     data = taiwan_real_estate)
## 
## Coefficients:
##                       (Intercept)                      n_convenience  
##                          14.73733                            0.42425  
##               sqrt(dist_to_mrt_m)  n_convenience:sqrt(dist_to_mrt_m)  
##                          -0.14121                           -0.01124
explanatory_data <- expand_grid(
  n_convenience = 0:10,
  dist_to_mrt_m = seq(0, 80, 10) ^ 2
)

prediction_data <- explanatory_data %>% 
  mutate(price_twd_msq = predict(mdl_price_vs_conv_dist, explanatory_data))

ggplot(taiwan_real_estate, aes(n_convenience, sqrt(dist_to_mrt_m), color = price_twd_msq)) +
  geom_point() +
  scale_color_viridis_c(option = "plasma") +
  geom_point(data = prediction_data, size = 3, color = "yellow")

More than 2 explanatory variables

  • More than 2 explanatory variables
    • In general, while it is tricky to include more than three numeric variables in a scatter plot, you can include as many categorical variables as you like using faceting. However, more facets can make it harder to see an overall picture. Plotting rapidly becomes difficult as you increase the number of variables to display
    • In addition to using x and y scales for two numeric variables, you can use color for a third numeric variable, and you can use faceting for categorical variables. And that’s about your limit before the plots become too difficult to interpret
    • There are some specialist plot types like correlation heatmaps and parallel coordinates plots that will handle more variables, but they give you much less information about each variable, and they aren’t great for visualizing model predictions
# More than 2 explanatory variables
ggplot(fish, aes(length_cm, height_cm, color = mass_g)) +
  geom_point() +
  scale_color_viridis_c(option = "inferno")

# Faceting by species
ggplot(fish, aes(length_cm, height_cm, color = mass_g)) +
  geom_point() +
  scale_color_viridis_c(option = "inferno") +
  facet_wrap(vars(species))

# Modeling
# Different levels of interaction
## No interactions
lm(mass_g ~ length_cm + height_cm + species + 0, data = fish)
## 
## Call:
## lm(formula = mass_g ~ length_cm + height_cm + species + 0, data = fish)
## 
## Coefficients:
##    length_cm     height_cm  speciesBream  speciesPerch   speciesPike  
##        36.91         18.16       -776.65       -710.57       -989.40  
## speciesRoach  
##      -731.64
## 2-way interactions between pairs of variables
lm(mass_g ~ length_cm + height_cm + species + length_cm:height_cm + length_cm:species + height_cm:species + 0, data = fish)
## 
## Call:
## lm(formula = mass_g ~ length_cm + height_cm + species + length_cm:height_cm + 
##     length_cm:species + height_cm:species + 0, data = fish)
## 
## Coefficients:
##              length_cm               height_cm            speciesBream  
##                -37.046                 -36.911                 633.199  
##           speciesPerch             speciesPike            speciesRoach  
##                137.801                -311.846                 185.260  
##    length_cm:height_cm  length_cm:speciesPerch   length_cm:speciesPike  
##                  3.574                  23.124                  57.529  
## length_cm:speciesRoach  height_cm:speciesPerch   height_cm:speciesPike  
##                 29.676                  10.743                -100.358  
## height_cm:speciesRoach  
##                -21.209
# Same as above
# The power operator `^` has a special meaning inside a formula; it doesn't square the explanatory variable values
lm(mass_g ~ (length_cm + height_cm + species)^2 + 0, data = fish)
## 
## Call:
## lm(formula = mass_g ~ (length_cm + height_cm + species)^2 + 0, 
##     data = fish)
## 
## Coefficients:
##              length_cm               height_cm            speciesBream  
##                -37.046                 -36.911                 633.199  
##           speciesPerch             speciesPike            speciesRoach  
##                137.801                -311.846                 185.260  
##    length_cm:height_cm  length_cm:speciesPerch   length_cm:speciesPike  
##                  3.574                  23.124                  57.529  
## length_cm:speciesRoach  height_cm:speciesPerch   height_cm:speciesPike  
##                 29.676                  10.743                -100.358  
## height_cm:speciesRoach  
##                -21.209
# lm(mass_g ~ I(length_cm^2) + height_cm + species + 0, data = fish) # Use the I() function to actually square an explanatory variable

## 3-way interaction between all three variables
lm(mass_g ~ length_cm + height_cm + species + length_cm:height_cm + length_cm:species + height_cm:species + length_cm:height_cm:species + 0, data = fish)
## 
## Call:
## lm(formula = mass_g ~ length_cm + height_cm + species + length_cm:height_cm + 
##     length_cm:species + height_cm:species + length_cm:height_cm:species + 
##     0, data = fish)
## 
## Coefficients:
##                        length_cm                         height_cm  
##                           6.2418                           56.6590  
##                     speciesBream                      speciesPerch  
##                        -721.3943                          123.4318  
##                      speciesPike                      speciesRoach  
##                         802.2666                          115.7905  
##              length_cm:height_cm            length_cm:speciesPerch  
##                           0.6211                          -19.7563  
##            length_cm:speciesPike            length_cm:speciesRoach  
##                         -10.1747                          -10.4436  
##           height_cm:speciesPerch             height_cm:speciesPike  
##                         -80.1937                         -346.2949  
##           height_cm:speciesRoach  length_cm:height_cm:speciesPerch  
##                        -103.7407                            2.8790  
##  length_cm:height_cm:speciesPike  length_cm:height_cm:speciesRoach  
##                           6.1729                            2.4620
# Same as above
lm(mass_g ~ length_cm * height_cm * species + 0, data = fish)
## 
## Call:
## lm(formula = mass_g ~ length_cm * height_cm * species + 0, data = fish)
## 
## Coefficients:
##                        length_cm                         height_cm  
##                           6.2418                           56.6590  
##                     speciesBream                      speciesPerch  
##                        -721.3943                          123.4318  
##                      speciesPike                      speciesRoach  
##                         802.2666                          115.7905  
##              length_cm:height_cm            length_cm:speciesPerch  
##                           0.6211                          -19.7563  
##            length_cm:speciesPike            length_cm:speciesRoach  
##                         -10.1747                          -10.4436  
##           height_cm:speciesPerch             height_cm:speciesPike  
##                         -80.1937                         -346.2949  
##           height_cm:speciesRoach  length_cm:height_cm:speciesPerch  
##                        -103.7407                            2.8790  
##  length_cm:height_cm:speciesPike  length_cm:height_cm:speciesRoach  
##                           6.1729                            2.4620
# Prediction
mdl_mass_vs_all <- lm(mass_g ~ length_cm * height_cm * species + 0, data = fish)

explanatory_data <- expand_grid(
  length_cm = seq(5, 60, 6),
  height_cm = seq(2, 20, 2),
  species = unique(fish$species)
)

prediction_data <- explanatory_data %>% 
  mutate(mass_g = predict(mdl_mass_vs_all, explanatory_data))

# Visualizing predictions
ggplot(fish, aes(length_cm, height_cm, color = mass_g)) +
  geom_point() +
  scale_color_viridis_c(option = "inferno") +
  facet_wrap(vars(species)) +
  geom_point(data = prediction_data, size = 3, shape = 15)

# Visualizing many variables
ggplot(taiwan_real_estate, aes(sqrt(dist_to_mrt_m), n_convenience, color = price_twd_msq)) +
  geom_point() +
  scale_color_viridis_c(option = "plasma") +
  facet_wrap(vars(house_age_years))

# Modeling
## No interactions
mdl_price_vs_all_no_inter <- lm(price_twd_msq ~ n_convenience + sqrt(dist_to_mrt_m) + house_age_years + 0, data = taiwan_real_estate)
mdl_price_vs_all_no_inter
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience + sqrt(dist_to_mrt_m) + 
##     house_age_years + 0, data = taiwan_real_estate)
## 
## Coefficients:
##           n_convenience      sqrt(dist_to_mrt_m)   house_age_years0 to 15  
##                  0.2577                  -0.1481                  15.4745  
## house_age_years15 to 30  house_age_years30 to 45  
##                 14.1301                  13.7655
## 3-way interaction between all three variables
mdl_price_vs_all_3_way_inter <- lm(price_twd_msq ~ n_convenience * sqrt(dist_to_mrt_m) * house_age_years + 0, data = taiwan_real_estate)
mdl_price_vs_all_3_way_inter
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience * sqrt(dist_to_mrt_m) * 
##     house_age_years + 0, data = taiwan_real_estate)
## 
## Coefficients:
##                                             n_convenience  
##                                                  0.374982  
##                                       sqrt(dist_to_mrt_m)  
##                                                 -0.162944  
##                                    house_age_years0 to 15  
##                                                 16.046849  
##                                   house_age_years15 to 30  
##                                                 13.760066  
##                                   house_age_years30 to 45  
##                                                 12.088773  
##                         n_convenience:sqrt(dist_to_mrt_m)  
##                                                 -0.008393  
##                     n_convenience:house_age_years15 to 30  
##                                                  0.078370  
##                     n_convenience:house_age_years30 to 45  
##                                                  0.066720  
##               sqrt(dist_to_mrt_m):house_age_years15 to 30  
##                                                  0.036618  
##               sqrt(dist_to_mrt_m):house_age_years30 to 45  
##                                                  0.061281  
## n_convenience:sqrt(dist_to_mrt_m):house_age_years15 to 30  
##                                                 -0.003821  
## n_convenience:sqrt(dist_to_mrt_m):house_age_years30 to 45  
##                                                  0.004401
## 2-way interactions between pairs of variables
mdl_price_vs_all_2_way_inter <- lm(price_twd_msq ~ (n_convenience + sqrt(dist_to_mrt_m) + house_age_years)^2 + 0, data = taiwan_real_estate)
mdl_price_vs_all_2_way_inter
## 
## Call:
## lm(formula = price_twd_msq ~ (n_convenience + sqrt(dist_to_mrt_m) + 
##     house_age_years)^2 + 0, data = taiwan_real_estate)
## 
## Coefficients:
##                               n_convenience  
##                                    0.384914  
##                         sqrt(dist_to_mrt_m)  
##                                   -0.162025  
##                      house_age_years0 to 15  
##                                   16.026633  
##                     house_age_years15 to 30  
##                                   13.880791  
##                     house_age_years30 to 45  
##                                   11.926904  
##           n_convenience:sqrt(dist_to_mrt_m)  
##                                   -0.008956  
##       n_convenience:house_age_years15 to 30  
##                                   -0.006894  
##       n_convenience:house_age_years30 to 45  
##                                    0.143416  
## sqrt(dist_to_mrt_m):house_age_years15 to 30  
##                                    0.031600  
## sqrt(dist_to_mrt_m):house_age_years30 to 45  
##                                    0.068198
# Prediction
explanatory_data <- expand_grid(
  dist_to_mrt_m = seq(0, 80, 10) ^ 2,
  n_convenience = 0:10,
  house_age_years = unique(taiwan_real_estate$house_age_years)
)

prediction_data <- explanatory_data %>% 
  mutate(price_twd_msq = predict(mdl_price_vs_all_3_way_inter, explanatory_data))

# Visualizing predictions
ggplot(taiwan_real_estate, aes(sqrt(dist_to_mrt_m), n_convenience, color = price_twd_msq)) +
  geom_point() +
  scale_color_viridis_c(option = "plasma") +
  facet_wrap(vars(house_age_years)) +
  geom_point(data = prediction_data, size = 3, shape = 15)

How linear regression works

  • For the best fit, you want a metric that measures the size of all the residuals, and you want that metric to be as small as possible
    • A metric for the best fit
      • The simplest idea (which doesn’t work)
        • Take the sum of all the residuals
        • Some residuals are negative, so positive and negative errors cancel out (see the tiny illustration after this list)
      • The next simplest idea (which does work)
        • Take the square of each residual, and add up those squares
        • This is called the sum of squares
        • Numerical optimization determines which intercept and slope coefficients result in the smallest sum of squares
# How linear regression works
# Numerical optimization
calc_quadratic <- function(x) {
  x ^ 2 - x + 10
}
optim(par = 3, fn = calc_quadratic) # The first argument is an initial guess at the answer
## Warning in optim(par = 3, fn = calc_quadratic): one-dimensional optimization by Nelder-Mead is unreliable:
## use "Brent" or optimize() directly
## $par
## [1] 0.4998047
## 
## $value
## [1] 9.75
## 
## $counts
## function gradient 
##       30       NA 
## 
## $convergence
## [1] 0
## 
## $message
## NULL
# Refinements
calc_quadratic <- function(coeffs) { # optim() passes all the parameters to fn as a single vector argument
  x <- coeffs[1]
  x ^ 2 - x + 10
}
optim(par = c(x = 3), fn = calc_quadratic)
## Warning in optim(par = c(x = 3), fn = calc_quadratic): one-dimensional optimization by Nelder-Mead is unreliable:
## use "Brent" or optimize() directly
## $par
##         x 
## 0.4998047 
## 
## $value
## [1] 9.75
## 
## $counts
## function gradient 
##       30       NA 
## 
## $convergence
## [1] 0
## 
## $message
## NULL
x_actual <- taiwan_real_estate$n_convenience
y_actual <- taiwan_real_estate$price_twd_msq

# Function to calculate the sum of squares metric
calc_sum_of_squares <- function(coeffs) {
  intercept <- coeffs[1]
  slope <- coeffs[2]
  y_pred <- intercept + slope * x_actual
  y_diff <- y_actual - y_pred
  sum(y_diff ^ 2)
}

# Find where this function had its minimum value
optim(
  par = c(intercept = 0, slope = 0),
  fn = calc_sum_of_squares
)
## $par
## intercept     slope 
## 8.2260232 0.7977565 
## 
## $value
## [1] 4717.687
## 
## $counts
## function gradient 
##       87       NA 
## 
## $convergence
## [1] 0
## 
## $message
## NULL
lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)
## 
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_real_estate)
## 
## Coefficients:
##   (Intercept)  n_convenience  
##        8.2242         0.7981

Multiple logistic regression

Multiple logistic regression

# Modeling (template code: `response`, `explanatory`/`explanatory1`/`explanatory2`, `dataset`, `mdl`, and `some_values` below are placeholders, not real objects)
glm(response ~ explanatory, data = dataset, family = binomial)
glm(response ~ explanatory1 + explanatory2, data = dataset, family = binomial)
glm(response ~ explanatory1 * explanatory2, data = dataset, family = binomial)

# Prediction flow
explanatory_data <- expand_grid(
  explanatory1 = some_values,
  explanatory2 = some_values
)

prediction_data <- explanatory_data %>% 
  mutate(response = predict(mdl, explanatory_data, type = "response"))

# Confusion matrix
actual_response <- dataset$response
predicted_response <- round(fitted(mdl))

outcomes <- table(predicted_response, actual_response)

confusion <- conf_mat(outcomes)
autoplot(confusion)
summary(confusion, event_level = "second")

# Visualization
## Use faceting for categorical variables
## For 2 numeric explanatory variables, use color for response
## Give responses below 0.5 one color; responses above 0.5 another color
scale_color_gradient2(midpoint = 0.5)
# Multiple logistic regression
# Visualizing multiple explanatory variables
ggplot(churn, aes(time_since_first_purchase, time_since_last_purchase, color = has_churned)) +
  geom_point(alpha = 0.5) +
  scale_color_gradient2(midpoint = 0.5) +
  theme_bw()

# Logistic regression with 2 explanatory variables
mdl_churn_vs_both_inter <- glm(has_churned ~ time_since_first_purchase * time_since_last_purchase, data = churn, family = binomial)
mdl_churn_vs_both_inter
## 
## Call:  glm(formula = has_churned ~ time_since_first_purchase * time_since_last_purchase, 
##     family = binomial, data = churn)
## 
## Coefficients:
##                                        (Intercept)  
##                                            -0.1505  
##                          time_since_first_purchase  
##                                            -0.6376  
##                           time_since_last_purchase  
##                                             0.4233  
## time_since_first_purchase:time_since_last_purchase  
##                                             0.1123  
## 
## Degrees of Freedom: 399 Total (i.e. Null);  396 Residual
## Null Deviance:       554.5 
## Residual Deviance: 519.8     AIC: 527.8
# Logistic regression prediction
explanatory_data <- expand_grid(
  time_since_first_purchase = seq(-2, 4, 0.1),
  time_since_last_purchase = seq(-1, 6, 0.1)
)

prediction_data <- explanatory_data %>% 
  mutate(has_churned = predict(mdl_churn_vs_both_inter, explanatory_data, type = "response"))

ggplot(churn, aes(time_since_first_purchase, time_since_last_purchase, color = has_churned)) +
  geom_point(alpha = 0.5) +
  scale_color_gradient2(midpoint = 0.5) +
  theme_bw() +
  geom_point(data = prediction_data, size = 3, shape = 15)

actual_response <- churn$has_churned
predicted_response <- round(fitted(mdl_churn_vs_both_inter))

outcomes <- table(predicted_response, actual_response)
confusion <- conf_mat(outcomes)
confusion
##                   actual_response
## predicted_response   0   1
##                  0 102  53
##                  1  98 147
autoplot(confusion)

summary(confusion, event_level = "second")

The logistic distribution

  • Distribution function names
curve     prefix  normal   logistic  mnemonic
PDF       d       dnorm()  dlogis()  “d” for differentiate - you differentiate the CDF to get the PDF
CDF       p       pnorm()  plogis()  “p” is a backwards “q”, so it’s the inverse of the inverse CDF
Inv. CDF  q       qnorm()  qlogis()  “q” for quantile
# The logistic distribution
# Gaussian/Normal distribution
## Gaussian probability density function (PDF)
gaussian_distn <- tibble(
  x = seq(-4, 4, 0.05),
  gauss_pdf_x = dnorm(x)
)

pdf <- ggplot(gaussian_distn, aes(x, gauss_pdf_x)) +
  geom_line()

# Gaussian cumulative distribution function (CDF)
## Transforms from x-values to probabilities
## A feature of the CDF curve for all distributions - the y-axis is near zero on the far left of the plot, and near one on the far right of the plot
gaussian_distn <- tibble(
  x = seq(-4, 4, 0.05),
  gauss_pdf_x = dnorm(x),
  gauss_cdf_x = pnorm(x)
)

cdf <- ggplot(gaussian_distn, aes(x, gauss_cdf_x)) +
  geom_line()

# Gaussian inverse CDF
## Get back from probabilities to x-values
gaussian_distn_inv <- tibble(
  p = seq(0.001, 0.999, 0.001),
  gauss_inv_cdf_p = qnorm(p)
)

inv_cdf <- ggplot(gaussian_distn_inv, aes(p, gauss_inv_cdf_p)) +
  geom_line()

grid.arrange(pdf, cdf, inv_cdf, nrow = 1)

  • glm()’s family argument
    • Linear regression
      • lm(response ~ explanatory, data = dataset) equals to
      • glm(response ~ explanatory, data = dataset, family = gaussian)
    • Logistic regression
      • glm(response ~ explanatory, data = dataset, family = binomial)
    • str(gaussian())
      • linkfun - Provides a transformation of the response variables
      • linkinv - Undoes that transformation, gaussian()$linkinv
  • Logistic distribution
    • A probability distribution of a single continuous variable
    • Affected by location and scale parameters
      • As location increases, the logistic CDF curve moves rightwards. As scale increases, the steepness of the slope decreases
      • Visualization - see the CDF comparison sketch in the code below
    • Logistic distribution CDF (logistic function)
      • \(\text{cdf}(x) = \frac{1}{1 + \exp(-x)}\)
      • The plot has an S-shape, known as a sigmoid curve
      • It takes an input that can be any number from minus infinity to infinity, and returns a value between zero and one
      • Each x input value is transformed to a unique value - That means that the transformation can be reversed
    • Logistic distribution inverse CDF (logit function / inverse logistic function)
      • \(\text{inverse\_cdf}(p) = \log(\frac{p}{1-p})\)
      • The logit function takes values between zero and one, and returns values between minus infinity and infinity
# `glm()`'s family argument
str(gaussian())
## List of 12
##  $ family    : chr "gaussian"
##  $ link      : chr "identity"
##  $ linkfun   :function (mu)  
##  $ linkinv   :function (eta)  
##  $ variance  :function (mu)  
##  $ dev.resids:function (y, mu, wt)  
##  $ aic       :function (y, n, mu, wt, dev)  
##  $ mu.eta    :function (eta)  
##  $ initialize:  expression({  n <- rep.int(1, nobs)  if (is.null(etastart) && is.null(start) && is.null(mustart) && ((family$link| __truncated__
##  $ validmu   :function (mu)  
##  $ valideta  :function (eta)  
##  $ dispersion: num NA
##  - attr(*, "class")= chr "family"
# For the gaussian case, each transformation is just the identity function: it returns the same value that you put into it, since no special transformation is needed
gaussian()$linkfun
## function (mu) 
## mu
## <environment: namespace:stats>
gaussian()$linkinv
## function (eta) 
## eta
## <environment: namespace:stats>
# Logistic PDF
logistic_distn <- tibble(
  x = seq(-6, 6, 0.05),
  logistic_pdf_x = dlogis(x)
)

ggplot(logistic_distn, aes(x, logistic_pdf_x)) +
  geom_line()

# Cumulative distribution function
logistic_distn_cdf <- tibble(
  x = seq(-10, 10, 0.1),
  logistic_x = plogis(x),
  logistic_x_man = 1 / (1 + exp(-x))
)

all.equal(logistic_distn_cdf$logistic_x, logistic_distn_cdf$logistic_x_man)
## [1] TRUE
# Shape depending on location and scale
# As `location` increases, the logistic CDF curve moves rightwards
# As `scale` increases, the steepness of the slope decreases
ggplot(logistic_distn_cdf, aes(x, logistic_x)) +
  geom_line()

# Inverse cumulative distribution function
logistic_distn_inv_cdf <- tibble(
  p = seq(0.001, 0.999, 0.001),
  logit_p = qlogis(p),
  logit_p_man = log(p / (1 - p))
) 

all.equal(logistic_distn_inv_cdf$logit_p, logistic_distn_inv_cdf$logit_p_man)
## [1] TRUE
ggplot(logistic_distn_inv_cdf, aes(p, logit_p)) +
  geom_line()

# binomial family argument
str(binomial())
## List of 13
##  $ family    : chr "binomial"
##  $ link      : chr "logit"
##  $ linkfun   :function (mu)  
##  $ linkinv   :function (eta)  
##  $ variance  :function (mu)  
##  $ dev.resids:function (y, mu, wt)  
##  $ aic       :function (y, n, mu, wt, dev)  
##  $ mu.eta    :function (eta)  
##  $ initialize: language {     if (NCOL(y) == 1) { ...
##  $ validmu   :function (mu)  
##  $ valideta  :function (eta)  
##  $ simulate  :function (object, nsim)  
##  $ dispersion: num 1
##  - attr(*, "class")= chr "family"
x <- seq(0.01, 0.99, 0.01)
p <- seq(0.01, 0.99, 0.01)

all.equal(
  binomial()$linkinv(x), # Corresponds to the CDF
  plogis(x)
)
## [1] TRUE
all.equal(
  binomial()$linkfun(p), # Corresponds to the inverse CDF
  qlogis(p)
)
## [1] TRUE

How logistic regression works

  • The principle is the same as for linear regression - Choose a metric that measures how far the predicted responses are from the actual responses, and optimize that metric
  • Likelihood - Find the maximum value
    • sum(y_pred * y_actual + (1 - y_pred) * (1 - y_actual))
    • y_actual is always 0 or 1
    • y_pred is between 0 and 1
    • Computing the likelihood involves multiplying many very small numbers, leading to numerical error
    • Log-likelihood is easier to compute
  • Log-likelihood
    • log(y_pred) * y_actual + log(1 - y_pred) * (1 - y_actual)
  • Negative log-likelihood
    • Maximizing log-likelihood is the same as minimizing negative log-likelihood
    • Minimize -sum(log_likelihoods)
# How logistic regression works
# Logistic regression algorithm
x_actual <- churn$time_since_last_purchase
y_actual <- churn$has_churned

intercept <- 1 # -0.05
slope <- 0.5 # 0.275

# Manual calculation
# The logistic regression prediction formula combines a linear combination of predictor variables with the logistic function to model the probability of a binary outcome
y_pred <- plogis(intercept + slope * x_actual)

# Both likelihood and log-likelihood increase to a maximum value
likelihoods <- sum(y_pred * y_actual + (1 - y_pred) * (1 - y_actual)) # Likelihood
likelihoods
## [1] 205.0343
log_likelihoods <- log(y_pred) * y_actual + log(1 - y_pred) * (1 - y_actual) # Log-likelihood
sum(log_likelihoods)
## [1] -326.2599
# Simple version of glm()
calc_neg_log_likelihood <- function(coeffs) {
  intercept <- coeffs[1]
  slope <- coeffs[2]
  y_pred <- plogis(intercept + slope * x_actual)
  log_likelihoods <- log(y_pred) * y_actual + log(1 - y_pred) * (1 - y_actual)
  -sum(log_likelihoods)
}

# Optimize the metric
optim(
  # Initially guess 0 intercept and 1 slope
  par = c(intercept = 0, slope = 1),
  # Use calc_neg_log_likelihood as the optimization fn 
  fn = calc_neg_log_likelihood
)
## $par
##   intercept       slope 
## -0.03478255  0.26890041 
## 
## $value
## [1] 273.2002
## 
## $counts
## function gradient 
##       51       NA 
## 
## $convergence
## [1] 0
## 
## $message
## NULL
# Compare the coefficients to those calculated by glm()
glm(has_churned ~ time_since_last_purchase, data = churn, family = binomial)
## 
## Call:  glm(formula = has_churned ~ time_since_last_purchase, family = binomial, 
##     data = churn)
## 
## Coefficients:
##              (Intercept)  time_since_last_purchase  
##                 -0.03502                   0.26921  
## 
## Degrees of Freedom: 399 Total (i.e. Null);  398 Residual
## Null Deviance:       554.5 
## Residual Deviance: 546.4     AIC: 550.4