Parallel slopes linear regression
Fitting parallel slopes
- Parallel slopes linear regression - One numeric explanatory variable and one categorical explanatory variable
# Parallel slopes linear regression
# One numeric explanatory variable
# One intercept coefficient and one slope coefficient
mdl_mass_vs_length <- lm(mass_g ~ length_cm, data = fish)
mdl_mass_vs_length
##
## Call:
## lm(formula = mass_g ~ length_cm, data = fish)
##
## Coefficients:
## (Intercept) length_cm
## -536.2 34.9
# One categorical explanatory variable
# One intercept coefficient for each category
mdl_mass_vs_species <- lm(mass_g ~ species + 0, data = fish)
mdl_mass_vs_species
##
## Call:
## lm(formula = mass_g ~ species + 0, data = fish)
##
## Coefficients:
## speciesBream speciesPerch speciesPike speciesRoach
## 617.8 382.2 718.7 152.0
# One numeric explanatory variable and one categorical explanatory variable
# One slope coefficient, and one intercept coefficient for each category
mdl_mass_vs_both <- lm(mass_g ~ length_cm + species + 0, data = fish)
mdl_mass_vs_both
##
## Call:
## lm(formula = mass_g ~ length_cm + species + 0, data = fish)
##
## Coefficients:
## length_cm speciesBream speciesPerch speciesPike speciesRoach
## 42.57 -672.24 -713.29 -1089.46 -726.78
# Visualization
# One numeric explanatory variable
g1 <- ggplot(fish, aes(length_cm, mass_g)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# One categorical explanatory variable
g2 <- ggplot(fish, aes(species, mass_g)) +
geom_boxplot() +
stat_summary(fun = mean, shape = 15)
# One numeric explanatory variable and one categorical explanatory variable
g3 <- ggplot(fish, aes(length_cm, mass_g, color = species)) +
geom_point() +
geom_parallel_slopes(se = FALSE)
grid.arrange(g1, g2, g3, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing missing values (`geom_segment()`).

mdl_price_vs_conv <- lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)
mdl_price_vs_conv
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_real_estate)
##
## Coefficients:
## (Intercept) n_convenience
## 8.2242 0.7981
mdl_price_vs_age <- lm(price_twd_msq ~ house_age_years + 0, data = taiwan_real_estate)
mdl_price_vs_age
##
## Call:
## lm(formula = price_twd_msq ~ house_age_years + 0, data = taiwan_real_estate)
##
## Coefficients:
## house_age_years0 to 15 house_age_years15 to 30 house_age_years30 to 45
## 12.637 9.877 11.393
mdl_price_vs_both <- lm(price_twd_msq ~ n_convenience + house_age_years + 0, data = taiwan_real_estate)
mdl_price_vs_both
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience + house_age_years +
## 0, data = taiwan_real_estate)
##
## Coefficients:
## n_convenience house_age_years0 to 15 house_age_years15 to 30
## 0.7915 9.4133 7.0852
## house_age_years30 to 45
## 7.5110
g1 <- ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
g2 <- ggplot(taiwan_real_estate, aes(house_age_years, price_twd_msq)) +
geom_boxplot() +
stat_summary(fun = mean, shape = 15)
g3 <- ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq, color = house_age_years)) +
geom_point() +
geom_parallel_slopes(se = FALSE)
grid.arrange(g1, g2, g3, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

Predicting parallel slopes
# Predicting parallel slopes
# Choosing values for explanatory variables
explanatory_data <- expand_grid( # `expand_grid` returns a data frame of all combinations of its inputs
length_cm = seq(5, 60, 5),
species = unique(fish$species)
)
glimpse(explanatory_data)
## Rows: 48
## Columns: 2
## $ length_cm <dbl> 5, 5, 5, 5, 10, 10, 10, 10, 15, 15, 15, 15, 20, 20, 20, 20, …
## $ species <chr> "Bream", "Roach", "Perch", "Pike", "Bream", "Roach", "Perch"…
# Add a column of predictions to the data frame
prediction_data <- explanatory_data %>%
mutate(mass_g = predict(mdl_mass_vs_both, explanatory_data))
# Visualizing the predictions
ggplot(fish, aes(length_cm, mass_g, color = species)) +
geom_point() +
geom_parallel_slopes(se = FALSE) +
geom_point(data = prediction_data, size = 3, shape = 15)

# Manually calculating predictions
coeffs <- coefficients(mdl_mass_vs_both)
slope <- coeffs[1]
intercept_bream <- coeffs[2]
intercept_perch <- coeffs[3]
intercept_pike <- coeffs[4]
intercept_roach <- coeffs[5]
explanatory_data %>%
mutate(
intercept = case_when(
species == "Bream" ~ intercept_bream,
species == "Perch" ~ intercept_perch,
species == "Pike" ~ intercept_pike,
species == "Roach" ~ intercept_roach
),
mass_g = intercept + slope * length_cm
)
predict(mdl_mass_vs_both, explanatory_data)
## 1 2 3 4 5 6 7
## -459.39910 -513.93503 -500.45009 -876.61328 -246.55633 -301.09226 -287.60732
## 8 9 10 11 12 13 14
## -663.77051 -33.71355 -88.24949 -74.76455 -450.92774 179.12922 124.59328
## 15 16 17 18 19 20 21
## 138.07822 -238.08497 391.97199 337.43605 350.92099 -25.24220 604.81476
## 22 23 24 25 26 27 28
## 550.27882 563.76376 187.60057 817.65753 763.12159 776.60653 400.44334
## 29 30 31 32 33 34 35
## 1030.50030 975.96436 989.44930 613.28611 1243.34307 1188.80713 1202.29207
## 36 37 38 39 40 41 42
## 826.12888 1456.18584 1401.64990 1415.13484 1038.97165 1669.02861 1614.49268
## 43 44 45 46 47 48
## 1627.97761 1251.81442 1881.87138 1827.33545 1840.82038 1464.65719
explanatory_data <- expand_grid(
n_convenience = 0:10,
house_age_years = unique(taiwan_real_estate$house_age_years)
)
glimpse(explanatory_data)
## Rows: 33
## Columns: 2
## $ n_convenience <int> 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, …
## $ house_age_years <fct> 30 to 45, 15 to 30, 0 to 15, 30 to 45, 15 to 30, 0 to …
prediction_data <- explanatory_data %>%
mutate(price_twd_msq = predict(mdl_price_vs_both, explanatory_data))
taiwan_real_estate %>%
ggplot(aes(n_convenience, price_twd_msq, color = house_age_years)) +
geom_point() +
geom_parallel_slopes(se = FALSE) +
geom_point(data = prediction_data, size = 5, shape = 15)

# Manually calculating predictions
coeffs <- coefficients(mdl_price_vs_both)
slope <- coeffs[1]
intercept_0_15 <- coeffs[2]
intercept_15_30 <- coeffs[3]
intercept_30_45 <- coeffs[4]
explanatory_data %>%
mutate(
intercept = case_when(
house_age_years == "0 to 15" ~ intercept_0_15,
house_age_years == "15 to 30" ~ intercept_15_30,
house_age_years == "30 to 45" ~ intercept_30_45
),
price_twd_msq = intercept + slope * n_convenience
)
predict(mdl_price_vs_both, explanatory_data)
## 1 2 3 4 5 6 7 8
## 7.510958 7.085169 9.413325 8.302415 7.876627 10.204782 9.093873 8.668084
## 9 10 11 12 13 14 15 16
## 10.996239 9.885330 9.459541 11.787696 10.676787 10.250998 12.579153 11.468244
## 17 18 19 20 21 22 23 24
## 11.042455 13.370610 12.259701 11.833912 14.162067 13.051158 12.625369 14.953525
## 25 26 27 28 29 30 31 32
## 13.842615 13.416826 15.744982 14.634072 14.208284 16.536439 15.425530 14.999741
## 33
## 17.327896
Assessing model performance
- Model performance metrics
  - Coefficient of determination (R-squared) - Measures how well the regression's prediction line fits the observed values
  - Residual standard error (RSE) - The typical size of the residuals
- Adjusted coefficient of determination
  - \(\bar{R}^2 = 1 - (1 - R^2) \frac{n_{obs} - 1}{n_{obs} - n_{var} - 1}\)
  - More explanatory variables increases \(R^2\)
  - Too many explanatory variables causes overfitting
  - The adjusted coefficient of determination penalizes extra explanatory variables
  - The penalty is noticeable when \(R^2\) is small, or when \(n_{var}\) is a large fraction of \(n_{obs}\)
  - In `glance()`, it's the `adj.r.squared` element (a worked example of the formula follows this list)
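A worked example of the adjustment formula with made-up numbers (hypothetical values, base R only):
r_squared <- 0.9 # hypothetical R-squared
n_obs <- 50 # hypothetical number of observations
n_var <- 3 # hypothetical number of explanatory variables
1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_var - 1)
## [1] 0.8934783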
# Assessing model performance
# Coefficient of determination
mdl_mass_vs_species %>%
glance() %>%
pull(r.squared)
## [1] 0.7163347
mdl_mass_vs_length %>%
glance() %>%
pull(r.squared)
## [1] 0.822569
mdl_mass_vs_both %>%
glance() %>%
pull(r.squared)
## [1] 0.9694266
# Adjusted coefficient of determination
mdl_mass_vs_species %>%
glance() %>%
dplyr::select(r.squared, adj.r.squared)
mdl_mass_vs_length %>%
glance() %>%
dplyr::select(r.squared, adj.r.squared)
mdl_mass_vs_both %>%
glance() %>%
dplyr::select(r.squared, adj.r.squared)
# Residual standard error
mdl_mass_vs_species %>%
glance() %>%
pull(sigma)
## [1] 313.5501
mdl_mass_vs_length %>%
glance() %>%
pull(sigma)
## [1] 152.1209
mdl_mass_vs_both %>%
glance() %>%
pull(sigma)
## [1] 103.3556
mdl_price_vs_conv %>%
glance() %>%
dplyr::select(r.squared, adj.r.squared)
mdl_price_vs_age %>%
glance() %>%
dplyr::select(r.squared, adj.r.squared)
mdl_price_vs_both %>%
glance() %>%
dplyr::select(r.squared, adj.r.squared)
mdl_price_vs_conv %>%
glance() %>%
pull(sigma)
## [1] 3.383888
mdl_price_vs_age %>%
glance() %>%
pull(sigma)
## [1] 3.950184
mdl_price_vs_both %>%
glance() %>%
pull(sigma)
## [1] 3.21346
Models for each category
- Splitting the dataset the smart way (see the sketch below)
  - base R: `split()` + `lapply()`
  - dplyr: `nest_by()` + `mutate()`
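A minimal sketch of both routes (assuming the fish data, dplyr >= 1.0, and the pipe are loaded); each fits one mass-vs-length model per species:
# base R: split the rows by species, then fit one model per piece
models_base <- lapply(
  split(fish, fish$species),
  function(df) lm(mass_g ~ length_cm, data = df)
)
# dplyr: nest_by() stores one data frame per species in a list-column
# named `data`, and mutate() fits one model per row
models_dplyr <- fish %>%
  nest_by(species) %>%
  mutate(mdl = list(lm(mass_g ~ length_cm, data = data)))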
# Models for each category
# 4 categories
unique(fish$species)
## [1] "Bream" "Roach" "Perch" "Pike"
# Splitting the dataset
bream <- fish %>% filter(species == "Bream")
perch <- fish %>% filter(species == "Perch")
pike <- fish %>% filter(species == "Pike")
roach <- fish %>% filter(species == "Roach")
# 4 models
mdl_bream <- lm(mass_g ~ length_cm, data = bream)
mdl_bream
##
## Call:
## lm(formula = mass_g ~ length_cm, data = bream)
##
## Coefficients:
## (Intercept) length_cm
## -1035.35 54.55
mdl_perch <- lm(mass_g ~ length_cm, data = perch)
mdl_perch
##
## Call:
## lm(formula = mass_g ~ length_cm, data = perch)
##
## Coefficients:
## (Intercept) length_cm
## -619.18 38.91
mdl_pike <- lm(mass_g ~ length_cm, data = pike)
mdl_pike
##
## Call:
## lm(formula = mass_g ~ length_cm, data = pike)
##
## Coefficients:
## (Intercept) length_cm
## -1540.82 53.19
mdl_roach <- lm(mass_g ~ length_cm, data = roach)
mdl_roach
##
## Call:
## lm(formula = mass_g ~ length_cm, data = roach)
##
## Coefficients:
## (Intercept) length_cm
## -329.38 23.32
# Explanatory data
explanatory_data <- tibble(length_cm = seq(5, 60, 5))
# Making predictions
prediction_data_bream <- explanatory_data %>%
mutate(
mass_g = predict(mdl_bream, explanatory_data),
species = "Bream"
)
prediction_data_perch <- explanatory_data %>%
mutate(
mass_g = predict(mdl_perch, explanatory_data),
species = "Perch"
)
prediction_data_pike <- explanatory_data %>%
mutate(
mass_g = predict(mdl_pike, explanatory_data),
species = "Pike"
)
prediction_data_roach <- explanatory_data %>%
mutate(
mass_g = predict(mdl_roach, explanatory_data),
species = "Roach"
)
# Visualizing predictions
g1 <- ggplot(fish, aes(length_cm, mass_g)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
g2 <- ggplot(fish, aes(length_cm, mass_g, color = species)) +
geom_point() +
geom_parallel_slopes(se = FALSE)
g3 <- ggplot(fish, aes(length_cm, mass_g, color = species)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_point(data = prediction_data_bream, size = 3, shape = 15) +
geom_point(data = prediction_data_perch, size = 3, shape = 15) +
geom_point(data = prediction_data_pike, size = 3, shape = 15) +
geom_point(data = prediction_data_roach, size = 3, shape = 15)
grid.arrange(g1, g2, g3, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# The mixed performance result is quite common: the whole-dataset model benefits from the increased power of more rows of data, whereas the individual models benefit from not having to reconcile differing trends across the categories
mdl_fish <- lm(mass_g ~ length_cm + species, data = fish)
# Coefficient of determination
mdl_fish %>% glance() %>% pull(adj.r.squared)
## [1] 0.9174431
mdl_bream %>% glance() %>% pull(adj.r.squared)
## [1] 0.8743676
mdl_perch %>% glance() %>% pull(adj.r.squared)
## [1] 0.9169461
mdl_pike %>% glance() %>% pull(adj.r.squared)
## [1] 0.9410872
mdl_roach %>% glance() %>% pull(adj.r.squared)
## [1] 0.8152746
# Residual standard error
mdl_fish %>% glance() %>% pull(sigma)
## [1] 103.3556
mdl_bream %>% glance() %>% pull(sigma)
## [1] 74.15224
mdl_perch %>% glance() %>% pull(sigma)
## [1] 100.1802
mdl_pike %>% glance() %>% pull(sigma)
## [1] 119.9377
mdl_roach %>% glance() %>% pull(sigma)
## [1] 38.1784
# One model per category
taiwan_0_to_15 <- taiwan_real_estate %>% filter(house_age_years == "0 to 15")
taiwan_15_to_30 <- taiwan_real_estate %>% filter(house_age_years == "15 to 30")
taiwan_30_to_45 <- taiwan_real_estate %>% filter(house_age_years == "30 to 45")
mdl_0_to_15 <- lm(price_twd_msq ~ n_convenience, data = taiwan_0_to_15)
mdl_15_to_30 <- lm(price_twd_msq ~ n_convenience, data = taiwan_15_to_30)
mdl_30_to_45 <- lm(price_twd_msq ~ n_convenience, data = taiwan_30_to_45)
mdl_0_to_15
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_0_to_15)
##
## Coefficients:
## (Intercept) n_convenience
## 9.2417 0.8336
mdl_15_to_30
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_15_to_30)
##
## Coefficients:
## (Intercept) n_convenience
## 6.8719 0.8519
mdl_30_to_45
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_30_to_45)
##
## Coefficients:
## (Intercept) n_convenience
## 8.1131 0.6687
# Predicting multiple models
explanatory_data <- tibble(n_convenience = 0:10)
prediction_data_0_to_15 <- explanatory_data %>%
mutate(price_twd_msq = predict(mdl_0_to_15, explanatory_data))
prediction_data_15_to_30 <- explanatory_data %>%
mutate(price_twd_msq = predict(mdl_15_to_30, explanatory_data))
prediction_data_30_to_45 <- explanatory_data %>%
mutate(price_twd_msq = predict(mdl_30_to_45, explanatory_data))
# Visualizing multiple models
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq, color = house_age_years)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_point(data = prediction_data_0_to_15, size = 3, shape = 15, color = "red") +
geom_point(data = prediction_data_15_to_30, size = 3, shape = 15, color = "green") +
geom_point(data = prediction_data_30_to_45, size = 3, shape = 15, color = "blue")
## `geom_smooth()` using formula = 'y ~ x'

# Assessing model performance
mdl_all_ages <- lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)
mdl_all_ages %>% glance() %>% pull(r.squared)
## [1] 0.3260466
mdl_0_to_15 %>% glance() %>% pull(r.squared)
## [1] 0.3120536
mdl_15_to_30 %>% glance() %>% pull(r.squared)
## [1] 0.4424605
mdl_30_to_45 %>% glance() %>% pull(r.squared)
## [1] 0.3125713
mdl_all_ages %>% glance() %>% pull(sigma)
## [1] 3.383888
mdl_0_to_15 %>% glance() %>% pull(sigma)
## [1] 3.564127
mdl_15_to_30 %>% glance() %>% pull(sigma)
## [1] 2.585273
mdl_30_to_45 %>% glance() %>% pull(sigma)
## [1] 3.239037
One model with an interaction
- Specify interactions between explanatory variables
- A single model that contains intercepts and slopes for each category
- Interaction - The effect of one explanatory variable on the expected response changes depending on the value of another explanatory variable (the two explanatory variables interact; there are interactions between the explanatory variables)
  - E.g. different fish species have different mass-to-length ratios: the effect of length on the expected mass is different for different species
- No interactions - `response ~ explntry1 + explntry2`
- With interactions (implicit) - `response ~ explntry1 * explntry2`
  - Implicit - you don't write down which interactions are needed; R figures that out itself
- With interactions (explicit) - `response ~ explntry1 + explntry2 + explntry1:explntry2` (a quick check of this expansion follows the list)
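A quick symbolic check, using base R's `terms()` on a data-free formula, that the implicit `*` form expands to the explicit one:
attr(terms(y ~ explntry1 * explntry2), "term.labels")
## [1] "explntry1"           "explntry2"           "explntry1:explntry2"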
Modeling with interactions
# One model with an interaction
lm(mass_g ~ length_cm * species, data = fish) # The coefficients are tricky to understand
##
## Call:
## lm(formula = mass_g ~ length_cm * species, data = fish)
##
## Coefficients:
## (Intercept) length_cm speciesPerch
## -1035.348 54.550 416.172
## speciesPike speciesRoach length_cm:speciesPerch
## -505.477 705.971 -15.639
## length_cm:speciesPike length_cm:speciesRoach
## -1.355 -31.231
lm(mass_g ~ length_cm * species + 0, data = fish) # The coefficients are tricky to understand
##
## Call:
## lm(formula = mass_g ~ length_cm * species + 0, data = fish)
##
## Coefficients:
## length_cm speciesBream speciesPerch
## 54.550 -1035.348 -619.175
## speciesPike speciesRoach length_cm:speciesPerch
## -1540.824 -329.376 -15.639
## length_cm:speciesPike length_cm:speciesRoach
## -1.355 -31.231
lm(mass_g ~ species + species:length_cm + 0, data = fish) # Zero removes the global intercept
##
## Call:
## lm(formula = mass_g ~ species + species:length_cm + 0, data = fish)
##
## Coefficients:
## speciesBream speciesPerch speciesPike
## -1035.35 -619.18 -1540.82
## speciesRoach speciesBream:length_cm speciesPerch:length_cm
## -329.38 54.55 38.91
## speciesPike:length_cm speciesRoach:length_cm
## 53.19 23.32
# Specifying an interaction
lm(price_twd_msq ~ n_convenience * house_age_years, data = taiwan_real_estate)
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience * house_age_years,
## data = taiwan_real_estate)
##
## Coefficients:
## (Intercept) n_convenience
## 9.24170 0.83359
## house_age_years15 to 30 house_age_years30 to 45
## -2.36978 -1.12858
## n_convenience:house_age_years15 to 30 n_convenience:house_age_years30 to 45
## 0.01833 -0.16489
lm(price_twd_msq ~ n_convenience + house_age_years + n_convenience:house_age_years, data = taiwan_real_estate)
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience + house_age_years +
## n_convenience:house_age_years, data = taiwan_real_estate)
##
## Coefficients:
## (Intercept) n_convenience
## 9.24170 0.83359
## house_age_years15 to 30 house_age_years30 to 45
## -2.36978 -1.12858
## n_convenience:house_age_years15 to 30 n_convenience:house_age_years30 to 45
## 0.01833 -0.16489
# Interactions with understandable coeffs
mdl_readable_inter <- lm(price_twd_msq ~ house_age_years + house_age_years:n_convenience + 0, data = taiwan_real_estate)
mdl_readable_inter
##
## Call:
## lm(formula = price_twd_msq ~ house_age_years + house_age_years:n_convenience +
## 0, data = taiwan_real_estate)
##
## Coefficients:
## house_age_years0 to 15 house_age_years15 to 30
## 9.2417 6.8719
## house_age_years30 to 45 house_age_years0 to 15:n_convenience
## 8.1131 0.8336
## house_age_years15 to 30:n_convenience house_age_years30 to 45:n_convenience
## 0.8519 0.6687
coefficients(mdl_0_to_15)
## (Intercept) n_convenience
## 9.2417022 0.8335867
coefficients(mdl_15_to_30)
## (Intercept) n_convenience
## 6.8719186 0.8519172
coefficients(mdl_30_to_45)
## (Intercept) n_convenience
## 8.1131235 0.6686981
Making predictions with interactions
# Making predictions with interactions
mdl_mass_vs_both_inter <- lm(mass_g ~ species + species:length_cm + 0, data = fish)
explanatory_data <- expand_grid(
length_cm = seq(5, 60, 5),
species = unique(fish$species)
)
prediction_data <- explanatory_data %>%
mutate(mass_g = predict(mdl_mass_vs_both_inter, explanatory_data))
# Visualizing the predictions
ggplot(fish, aes(length_cm, mass_g, color = species)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_point(data = prediction_data, size = 3, shape = 15)
## `geom_smooth()` using formula = 'y ~ x'

# Manually calculating the predictions
coeffs <- coefficients(mdl_mass_vs_both_inter)
intercept_bream <- coeffs[1]
intercept_perch <- coeffs[2]
intercept_pike <- coeffs[3]
intercept_roach <- coeffs[4]
slope_bream <- coeffs[5]
slope_perch <- coeffs[6]
slope_pike <- coeffs[7]
slope_roach <- coeffs[8]
explanatory_data %>%
mutate(
mass_g = case_when(
species == "Bream" ~ intercept_bream + slope_bream * length_cm,
species == "Perch" ~ intercept_perch + slope_perch * length_cm,
species == "Pike" ~ intercept_pike + slope_pike * length_cm,
species == "Roach" ~ intercept_roach + slope_roach * length_cm
)
)
# Predicting with interactions
mdl_price_vs_both_inter <- lm(price_twd_msq ~ house_age_years + n_convenience:house_age_years + 0, data = taiwan_real_estate)
explanatory_data <- expand_grid(
n_convenience = 0:10,
house_age_years = unique(taiwan_real_estate$house_age_years)
)
prediction_data <- explanatory_data %>%
mutate(price_twd_msq = predict(mdl_price_vs_both_inter, explanatory_data))
ggplot(taiwan_real_estate, aes(n_convenience, price_twd_msq, color = house_age_years)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_point(data = prediction_data, size = 5, shape = 15)
## `geom_smooth()` using formula = 'y ~ x'

# Manually calculating predictions with interactions
coeffs <- coefficients(mdl_price_vs_both_inter)
intercept_0_15 <- coeffs[1]
intercept_15_30 <- coeffs[2]
intercept_30_45 <- coeffs[3]
slope_0_15 <- coeffs[4]
slope_15_30 <- coeffs[5]
slope_30_45 <- coeffs[6]
prediction_data <- explanatory_data %>%
mutate(
price_twd_msq = case_when(
house_age_years == "0 to 15" ~ coeffs[1] + coeffs[4] * n_convenience,
house_age_years == "15 to 30" ~ coeffs[2] + coeffs[5] * n_convenience,
house_age_years == "30 to 45" ~ coeffs[3] + coeffs[6] * n_convenience
)
)
Simpson’s Paradox
- The difference between models of a whole dataset and individual models for each category
- Simpson's Paradox - Occurs when the trend of a model on the whole dataset is very different from the trends shown by models on subsets of the dataset (see the tiny illustration below)
  - trend = slope coefficient
- Reconciling the difference
  - Good advice - If possible, try to plot the dataset
  - Common advice - You can't choose the best model in general; it depends on the dataset and the question you are trying to answer
  - More good advice - Articulate a question before you start modeling
  - Usually (but not always) the grouped model contains more insight
  - Are you missing explanatory variables?
  - Context is important
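A tiny made-up illustration of the paradox (hypothetical data): the whole-dataset slope is negative even though the slope within each group is positive:
simpsons <- tibble(
  x = c(1, 2, 3, 6, 7, 8),
  y = c(5, 6, 7, 1, 2, 3),
  group = rep(c("a", "b"), each = 3)
)
# Whole dataset: downward trend
coefficients(lm(y ~ x, data = simpsons))["x"] # about -0.63
# Per group: upward trend (slope exactly 1 in both groups)
coefficients(lm(y ~ group + group:x + 0, data = simpsons))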
# Simpson's Paradox
glimpse(auctions)
## Rows: 343
## Columns: 3
## $ price <dbl> 260.00, 256.86, 260.00, 238.02, 231.50, 251.11, 247.50, 2…
## $ openbid <dbl> 0.01, 0.01, 0.01, 0.01, 1.00, 9.99, 215.00, 155.00, 50.00…
## $ auction_type <chr> "7 day auction", "3 day auction", "5 day auction", "7 day…
# Whole
mdl_price_vs_openbid <- lm(price ~ openbid, data = auctions)
mdl_price_vs_openbid
##
## Call:
## lm(formula = price ~ openbid, data = auctions)
##
## Coefficients:
## (Intercept) openbid
## 229.245667 -0.002098
g1 <- ggplot(auctions, aes(openbid, price)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# By group
mdl_price_vs_both <- lm(price ~ auction_type + openbid:auction_type + 0, data = auctions)
mdl_price_vs_both
##
## Call:
## lm(formula = price ~ auction_type + openbid:auction_type + 0,
## data = auctions)
##
## Coefficients:
## auction_type3 day auction auction_type5 day auction
## 226.369005 221.599320
## auction_type7 day auction auction_type3 day auction:openbid
## 231.602861 -0.029026
## auction_type5 day auction:openbid auction_type7 day auction:openbid
## 0.084014 0.003682
g2 <- ggplot(auctions, aes(openbid, price, color = auction_type)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
grid.arrange(g1, g2, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Multiple Linear Regression
- Two numeric explanatory variables
- Visualizing 3 numeric variables
  - 3D scatter plot
  - 2D scatter plot with response as color
Two numeric explanatory variables
# Multiple Linear Regression
# Two numeric explanatory variables
# 3D scatter plot
# The plot is impossible to interpret
scatter3D(fish$length_cm, fish$height_cm, fish$mass_g)

fish %$% # magrittr's dollar pipe
scatter3D(length_cm, height_cm, mass_g)

# 2D scatter plot, color for response
# Set of color scales called "viridis" that provide easier to distinguish colors
# `scale_color_viridis_c` is used for continuous scales, where you have numeric data
ggplot(fish, aes(length_cm, height_cm, color = mass_g)) +
geom_point() +
scale_color_viridis_c(option = "inferno") # Viridis color scales

# Modeling with 2 numeric explanatory variables
# A global intercept coefficient, and one slope coefficient for each explanatory variable
mdl_mass_vs_both <- lm(mass_g ~ length_cm + height_cm, data = fish)
mdl_mass_vs_both
##
## Call:
## lm(formula = mass_g ~ length_cm + height_cm, data = fish)
##
## Coefficients:
## (Intercept) length_cm height_cm
## -622.15 28.97 26.33
explanatory_data <- expand_grid(
length_cm = seq(5, 60, 5),
height_cm = seq(2, 20, 2)
)
prediction_data <- explanatory_data %>%
mutate(mass_g = predict(mdl_mass_vs_both, explanatory_data))
# The color grid gives a nice overview of how the response variable changes over the plane of the explanatory variables. The heaviest fish are in the top-right, where they are long and tall
ggplot(fish, aes(length_cm, height_cm, color = mass_g)) +
geom_point() +
scale_color_viridis_c(option = "inferno") +
geom_point(data = prediction_data, shape = 15, size = 3)

# Including an interaction
mdl_mass_vs_both_inter <- lm(mass_g ~ length_cm * height_cm, data = fish)
mdl_mass_vs_both_inter
##
## Call:
## lm(formula = mass_g ~ length_cm * height_cm, data = fish)
##
## Coefficients:
## (Intercept) length_cm height_cm
## 159.1075 0.3014 -78.1252
## length_cm:height_cm
## 3.5454
explanatory_data <- expand_grid(
length_cm = seq(5, 60, 5),
height_cm = seq(2, 20, 2)
)
prediction_data <- explanatory_data %>%
mutate(mass_g = predict(mdl_mass_vs_both_inter, explanatory_data))
ggplot(fish, aes(length_cm, height_cm, color = mass_g)) +
geom_point() +
scale_color_viridis_c(option = "inferno") +
geom_point(data = prediction_data, shape = 15, size = 3)

# 3D visualizations
taiwan_real_estate %$%
scatter3D(n_convenience, sqrt(dist_to_mrt_m), price_twd_msq)

ggplot(taiwan_real_estate, aes(n_convenience, sqrt(dist_to_mrt_m), color = price_twd_msq)) +
geom_point() +
scale_color_viridis_c(option = "plasma")

# Modeling 2 numeric explanatory variables
mdl_price_vs_conv_dist <- lm(price_twd_msq ~ n_convenience + sqrt(dist_to_mrt_m), data = taiwan_real_estate)
mdl_price_vs_conv_dist
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience + sqrt(dist_to_mrt_m),
## data = taiwan_real_estate)
##
## Coefficients:
## (Intercept) n_convenience sqrt(dist_to_mrt_m)
## 15.1038 0.2142 -0.1573
explanatory_data <- expand_grid(
n_convenience = 0:10,
dist_to_mrt_m = seq(0, 80, 10) ^ 2
)
prediction_data <- explanatory_data %>%
mutate(price_twd_msq = predict(mdl_price_vs_conv_dist, explanatory_data))
ggplot(taiwan_real_estate, aes(n_convenience, sqrt(dist_to_mrt_m), color = price_twd_msq)) +
geom_point() +
scale_color_viridis_c(option = "plasma") +
geom_point(data = prediction_data, size = 3, color = "yellow")

# Including an interaction
mdl_price_vs_conv_dist <- lm(price_twd_msq ~ n_convenience * sqrt(dist_to_mrt_m), data = taiwan_real_estate)
mdl_price_vs_conv_dist
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience * sqrt(dist_to_mrt_m),
## data = taiwan_real_estate)
##
## Coefficients:
## (Intercept) n_convenience
## 14.73733 0.42425
## sqrt(dist_to_mrt_m) n_convenience:sqrt(dist_to_mrt_m)
## -0.14121 -0.01124
explanatory_data <- expand_grid(
n_convenience = 0:10,
dist_to_mrt_m = seq(0, 80, 10) ^ 2
)
prediction_data <- explanatory_data %>%
mutate(price_twd_msq = predict(mdl_price_vs_conv_dist, explanatory_data))
ggplot(taiwan_real_estate, aes(n_convenience, sqrt(dist_to_mrt_m), color = price_twd_msq)) +
geom_point() +
scale_color_viridis_c(option = "plasma") +
geom_point(data = prediction_data, size = 3, color = "yellow")

More than 2 explanatory variables
- More than 2 explanatory variables
  - In general, while it is tricky to include more than three numeric variables in a scatter plot, you can include as many categorical variables as you like using faceting. However, more facets can make it harder to see an overall picture. Plotting rapidly becomes difficult as you increase the number of variables to display
  - In addition to using x and y scales for two numeric variables, you can use color for a third numeric variable, and you can use faceting for categorical variables. That's about the limit before the plots become too difficult to interpret
  - There are some specialist plot types, like correlation heatmaps and parallel coordinates plots, that will handle more variables, but they give you much less information about each variable, and they aren't great for visualizing model predictions (a correlation-heatmap sketch follows this list)
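For completeness, a minimal correlation-heatmap sketch (a sketch only, assuming the fish columns used above and ggplot2 loaded):
# Flatten the correlation matrix into Var1, Var2, Freq columns
corr_long <- as.data.frame(as.table(
  cor(fish[, c("mass_g", "length_cm", "height_cm")], use = "pairwise.complete.obs")
))
ggplot(corr_long, aes(Var1, Var2, fill = Freq)) +
  geom_tile()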
# More than 2 explanatory variables
ggplot(fish, aes(length_cm, height_cm, color = mass_g)) +
geom_point() +
scale_color_viridis_c(option = "inferno")

# Faceting by species
ggplot(fish, aes(length_cm, height_cm, color = mass_g)) +
geom_point() +
scale_color_viridis_c(option = "inferno") +
facet_wrap(vars(species))

# Modeling
# Different levels of interaction
## No interactions
lm(mass_g ~ length_cm + height_cm + species + 0, data = fish)
##
## Call:
## lm(formula = mass_g ~ length_cm + height_cm + species + 0, data = fish)
##
## Coefficients:
## length_cm height_cm speciesBream speciesPerch speciesPike
## 36.91 18.16 -776.65 -710.57 -989.40
## speciesRoach
## -731.64
## 2-way interactions between pairs of variables
lm(mass_g ~ length_cm + height_cm + species + length_cm:height_cm + length_cm:species + height_cm:species + 0, data = fish)
##
## Call:
## lm(formula = mass_g ~ length_cm + height_cm + species + length_cm:height_cm +
## length_cm:species + height_cm:species + 0, data = fish)
##
## Coefficients:
## length_cm height_cm speciesBream
## -37.046 -36.911 633.199
## speciesPerch speciesPike speciesRoach
## 137.801 -311.846 185.260
## length_cm:height_cm length_cm:speciesPerch length_cm:speciesPike
## 3.574 23.124 57.529
## length_cm:speciesRoach height_cm:speciesPerch height_cm:speciesPike
## 29.676 10.743 -100.358
## height_cm:speciesRoach
## -21.209
# Same as above
# The `^` operator has a special meaning in formulas; it doesn't square the explanatory variable values
lm(mass_g ~ (length_cm + height_cm + species)^2 + 0, data = fish)
##
## Call:
## lm(formula = mass_g ~ (length_cm + height_cm + species)^2 + 0,
## data = fish)
##
## Coefficients:
## length_cm height_cm speciesBream
## -37.046 -36.911 633.199
## speciesPerch speciesPike speciesRoach
## 137.801 -311.846 185.260
## length_cm:height_cm length_cm:speciesPerch length_cm:speciesPike
## 3.574 23.124 57.529
## length_cm:speciesRoach height_cm:speciesPerch height_cm:speciesPike
## 29.676 10.743 -100.358
## height_cm:speciesRoach
## -21.209
# lm(mass_g ~ I(length_cm^2) + height_cm + species + 0, data = fish) # Use I() to actually square an explanatory variable inside a formula
## 3-way interaction between all three variables
lm(mass_g ~ length_cm + height_cm + species + length_cm:height_cm + length_cm:species + height_cm:species + length_cm:height_cm:species + 0, data = fish)
##
## Call:
## lm(formula = mass_g ~ length_cm + height_cm + species + length_cm:height_cm +
## length_cm:species + height_cm:species + length_cm:height_cm:species +
## 0, data = fish)
##
## Coefficients:
## length_cm height_cm
## 6.2418 56.6590
## speciesBream speciesPerch
## -721.3943 123.4318
## speciesPike speciesRoach
## 802.2666 115.7905
## length_cm:height_cm length_cm:speciesPerch
## 0.6211 -19.7563
## length_cm:speciesPike length_cm:speciesRoach
## -10.1747 -10.4436
## height_cm:speciesPerch height_cm:speciesPike
## -80.1937 -346.2949
## height_cm:speciesRoach length_cm:height_cm:speciesPerch
## -103.7407 2.8790
## length_cm:height_cm:speciesPike length_cm:height_cm:speciesRoach
## 6.1729 2.4620
# Same as above
lm(mass_g ~ length_cm * height_cm * species + 0, data = fish)
##
## Call:
## lm(formula = mass_g ~ length_cm * height_cm * species + 0, data = fish)
##
## Coefficients:
## length_cm height_cm
## 6.2418 56.6590
## speciesBream speciesPerch
## -721.3943 123.4318
## speciesPike speciesRoach
## 802.2666 115.7905
## length_cm:height_cm length_cm:speciesPerch
## 0.6211 -19.7563
## length_cm:speciesPike length_cm:speciesRoach
## -10.1747 -10.4436
## height_cm:speciesPerch height_cm:speciesPike
## -80.1937 -346.2949
## height_cm:speciesRoach length_cm:height_cm:speciesPerch
## -103.7407 2.8790
## length_cm:height_cm:speciesPike length_cm:height_cm:speciesRoach
## 6.1729 2.4620
# Prediction
mdl_mass_vs_all <- lm(mass_g ~ length_cm * height_cm * species + 0, data = fish)
explanatory_data <- expand_grid(
length_cm = seq(5, 60, 6),
height_cm = seq(2, 20, 2),
species = unique(fish$species)
)
prediction_data <- explanatory_data %>%
mutate(mass_g = predict(mdl_mass_vs_all, explanatory_data))
# Visualizing predictions
ggplot(fish, aes(length_cm, height_cm, color = mass_g)) +
geom_point() +
scale_color_viridis_c(option = "inferno") +
facet_wrap(vars(species)) +
geom_point(data = prediction_data, size = 3, shape = 15)

# Visualizing many variables
ggplot(taiwan_real_estate, aes(sqrt(dist_to_mrt_m), n_convenience, color = price_twd_msq)) +
geom_point() +
scale_color_viridis_c(option = "plasma") +
facet_wrap(vars(house_age_years))

# Modeling
## No interactions
mdl_price_vs_all_no_inter <- lm(price_twd_msq ~ n_convenience + sqrt(dist_to_mrt_m) + house_age_years + 0, data = taiwan_real_estate)
mdl_price_vs_all_no_inter
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience + sqrt(dist_to_mrt_m) +
## house_age_years + 0, data = taiwan_real_estate)
##
## Coefficients:
## n_convenience sqrt(dist_to_mrt_m) house_age_years0 to 15
## 0.2577 -0.1481 15.4745
## house_age_years15 to 30 house_age_years30 to 45
## 14.1301 13.7655
## 3-way interaction between all three variables
mdl_price_vs_all_3_way_inter <- lm(price_twd_msq ~ n_convenience * sqrt(dist_to_mrt_m) * house_age_years + 0, data = taiwan_real_estate)
mdl_price_vs_all_3_way_inter
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience * sqrt(dist_to_mrt_m) *
## house_age_years + 0, data = taiwan_real_estate)
##
## Coefficients:
## n_convenience
## 0.374982
## sqrt(dist_to_mrt_m)
## -0.162944
## house_age_years0 to 15
## 16.046849
## house_age_years15 to 30
## 13.760066
## house_age_years30 to 45
## 12.088773
## n_convenience:sqrt(dist_to_mrt_m)
## -0.008393
## n_convenience:house_age_years15 to 30
## 0.078370
## n_convenience:house_age_years30 to 45
## 0.066720
## sqrt(dist_to_mrt_m):house_age_years15 to 30
## 0.036618
## sqrt(dist_to_mrt_m):house_age_years30 to 45
## 0.061281
## n_convenience:sqrt(dist_to_mrt_m):house_age_years15 to 30
## -0.003821
## n_convenience:sqrt(dist_to_mrt_m):house_age_years30 to 45
## 0.004401
## 2-way interactions between pairs of variables
mdl_price_vs_all_2_way_inter <- lm(price_twd_msq ~ (n_convenience + sqrt(dist_to_mrt_m) + house_age_years)^2 + 0, data = taiwan_real_estate)
mdl_price_vs_all_2_way_inter
##
## Call:
## lm(formula = price_twd_msq ~ (n_convenience + sqrt(dist_to_mrt_m) +
## house_age_years)^2 + 0, data = taiwan_real_estate)
##
## Coefficients:
## n_convenience
## 0.384914
## sqrt(dist_to_mrt_m)
## -0.162025
## house_age_years0 to 15
## 16.026633
## house_age_years15 to 30
## 13.880791
## house_age_years30 to 45
## 11.926904
## n_convenience:sqrt(dist_to_mrt_m)
## -0.008956
## n_convenience:house_age_years15 to 30
## -0.006894
## n_convenience:house_age_years30 to 45
## 0.143416
## sqrt(dist_to_mrt_m):house_age_years15 to 30
## 0.031600
## sqrt(dist_to_mrt_m):house_age_years30 to 45
## 0.068198
# Prediction
explanatory_data <- expand_grid(
dist_to_mrt_m = seq(0, 80, 10) ^ 2,
n_convenience = 0:10,
house_age_years = unique(taiwan_real_estate$house_age_years)
)
prediction_data <- explanatory_data %>%
mutate(price_twd_msq = predict(mdl_price_vs_all_3_way_inter, explanatory_data))
# Visualizing predictions
ggplot(taiwan_real_estate, aes(sqrt(dist_to_mrt_m), n_convenience, color = price_twd_msq)) +
geom_point() +
scale_color_viridis_c(option = "plasma") +
facet_wrap(vars(house_age_years)) +
geom_point(data = prediction_data, size = 3, shape = 15)

How linear regression works
- For the best fit, you need a metric that measures the size of all the residuals, and you want that metric to be as small as possible
- A metric for the best fit
  - The simplest idea (which doesn't work) - Take the sum of all the residuals; some residuals are negative, so they cancel out the positive ones (see the small example after this list)
  - The next simplest idea (which does work) - Take the square of each residual, and add up those squares; this is called the sum of squares
- To determine which intercept and slope coefficients result in the smallest sum of squares - numerical optimization
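A two-number illustration of why the plain sum fails (hypothetical residuals):
resids <- c(2, -2) # equal-sized misses in opposite directions
sum(resids) # 0 - the metric claims a perfect fit, which is wrong
sum(resids ^ 2) # 8 - the sum of squares penalizes both misses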
# How linear regression works
# Numerical optimization
calc_quadratic <- function(x) {
x ^ 2 - x + 10
}
optim(par = 3, fn = calc_quadratic) # The first argument is an initial guess at the answer
## Warning in optim(par = 3, fn = calc_quadratic): one-dimensional optimization by Nelder-Mead is unreliable:
## use "Brent" or optimize() directly
## $par
## [1] 0.4998047
##
## $value
## [1] 9.75
##
## $counts
## function gradient
## 30 NA
##
## $convergence
## [1] 0
##
## $message
## NULL
# Refinements
calc_quadratic <- function(coeffs) { # The function passed to optim is only allowed to have one argument
x <- coeffs[1]
x ^ 2 - x + 10
}
optim(par = c(x = 3), fn = calc_quadratic)
## Warning in optim(par = c(x = 3), fn = calc_quadratic): one-dimensional optimization by Nelder-Mead is unreliable:
## use "Brent" or optimize() directly
## $par
## x
## 0.4998047
##
## $value
## [1] 9.75
##
## $counts
## function gradient
## 30 NA
##
## $convergence
## [1] 0
##
## $message
## NULL
x_actual <- taiwan_real_estate$n_convenience
y_actual <- taiwan_real_estate$price_twd_msq
# Function to calculate the sum of squares metric
calc_sum_of_squares <- function(coeffs) {
intercept <- coeffs[1]
slope <- coeffs[2]
y_pred <- intercept + slope * x_actual
y_diff <- y_actual - y_pred
sum(y_diff ^ 2)
}
# Find where this function had its minimum value
optim(
par = c(intercept = 0, slope = 0),
fn = calc_sum_of_squares
)
## $par
## intercept slope
## 8.2260232 0.7977565
##
## $value
## [1] 4717.687
##
## $counts
## function gradient
## 87 NA
##
## $convergence
## [1] 0
##
## $message
## NULL
lm(price_twd_msq ~ n_convenience, data = taiwan_real_estate)
##
## Call:
## lm(formula = price_twd_msq ~ n_convenience, data = taiwan_real_estate)
##
## Coefficients:
## (Intercept) n_convenience
## 8.2242 0.7981
Multiple logistic regression
Multiple logistic regression
# Modeling
glm(response ~ explanatory, data = dataset, family = binomial)
glm(response ~ explanatory1 + explanatory2, data = dataset, family = binomial)
glm(response ~ explanatory1 * explanatory2, data = dataset, family = binomial)
# Prediction flow
explanatory_data <- expand_grid(
explanatory1 = some_values,
explanatory2 = some_values
)
prediction_data <- explanatory_data %>%
mutate(response = predict(mdl, explanatory_data, type = "response"))
# Confusion matrix
actual_response <- dataset$response
predicted_response <- round(fitted(mdl))
outcomes <- table(predicted_response, actual_response)
confusion <- conf_mat(outcomes)
autoplot(confusion)
summary(confusion, event_level = "second")
# Visualization
## Use faceting for categorical variables
## For 2 numeric explanatory variables, use color for response
## Give responses below 0.5 one color; responses above 0.5 another color
scale_color_gradient2(midpoint = 0.5)
# Multiple logistic regression
# Visualizing multiple explanatory variables
ggplot(churn, aes(time_since_first_purchase, time_since_last_purchase, color = has_churned)) +
geom_point(alpha = 0.5) +
scale_color_gradient2(midpoint = 0.5) +
theme_bw()

# Logistic regression with 2 explanatory variables
mdl_churn_vs_both_inter <- glm(has_churned ~ time_since_first_purchase * time_since_last_purchase, data = churn, family = binomial)
mdl_churn_vs_both_inter
##
## Call: glm(formula = has_churned ~ time_since_first_purchase * time_since_last_purchase,
## family = binomial, data = churn)
##
## Coefficients:
## (Intercept)
## -0.1505
## time_since_first_purchase
## -0.6376
## time_since_last_purchase
## 0.4233
## time_since_first_purchase:time_since_last_purchase
## 0.1123
##
## Degrees of Freedom: 399 Total (i.e. Null); 396 Residual
## Null Deviance: 554.5
## Residual Deviance: 519.8 AIC: 527.8
# Logistic regression prediction
explanatory_data <- expand_grid(
time_since_first_purchase = seq(-2, 4, 0.1),
time_since_last_purchase = seq(-1, 6, 0.1)
)
prediction_data <- explanatory_data %>%
mutate(has_churned = predict(mdl_churn_vs_both_inter, explanatory_data, type = "response"))
ggplot(churn, aes(time_since_first_purchase, time_since_last_purchase, color = has_churned)) +
geom_point(alpha = 0.5) +
scale_color_gradient2(midpoint = 0.5) +
theme_bw() +
geom_point(data = prediction_data, size = 3, shape = 15)

actual_response <- churn$has_churned
predicted_response <- round(fitted(mdl_churn_vs_both_inter))
outcomes <- table(predicted_response, actual_response)
confusion <- conf_mat(outcomes)
confusion
## actual_response
## predicted_response 0 1
## 0 102 53
## 1 98 147
autoplot(confusion)

summary(confusion, event_level = "second")
The logistic distribution
- Distribution function names (the numeric check below illustrates the "d" mnemonic)

| Function | Prefix | Gaussian | Logistic | Mnemonic |
|---|---|---|---|---|
| PDF | d | `dnorm()` | `dlogis()` | "d" for differentiate - you differentiate the CDF to get the PDF |
| CDF | p | `pnorm()` | `plogis()` | "p" is a backwards "q", so it's the inverse of the inverse CDF |
| Inv. CDF | q | `qnorm()` | `qlogis()` | "q" for quantile |
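A numerical check of the "d" mnemonic: a central difference through `pnorm()` recovers `dnorm()`:
h <- 1e-6
(pnorm(1 + h) - pnorm(1 - h)) / (2 * h) # approximately 0.2419707
dnorm(1)
## [1] 0.2419707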
# The logistic distribution
# Gaussian/Normal distribution
## Gaussian probability density function (PDF)
gaussian_distn <- tibble(
x = seq(-4, 4, 0.05),
gauss_pdf_x = dnorm(x)
)
pdf <- ggplot(gaussian_distn, aes(x, gauss_pdf_x)) +
geom_line()
# Gaussian cumulative distribution function (CDF)
## Transforms from x-values to probabilities
## A feature of the CDF curve for all distributions - the y-axis is near zero on the far left of the plot, and near one on the far right of the plot
gaussian_distn <- tibble(
x = seq(-4, 4, 0.05),
gauss_pdf_x = dnorm(x),
gauss_cdf_x = pnorm(x)
)
cdf <- ggplot(gaussian_distn, aes(x, gauss_cdf_x)) +
geom_line()
# Gaussian inverse CDF
## Get back from probabilities to x-values
gaussian_distn_inv <- tibble(
p = seq(0.001, 0.999, 0.001),
gauss_inv_cdf_p = qnorm(p)
)
inv_cdf <- ggplot(gaussian_distn_inv, aes(p, gauss_inv_cdf_p)) +
geom_line()
grid.arrange(pdf, cdf, inv_cdf, nrow = 1)

`glm()`'s family argument
- Linear regression: `lm(response ~ explanatory, data = dataset)` is equivalent to `glm(response ~ explanatory, data = dataset, family = gaussian)`
- Logistic regression: `glm(response ~ explanatory, data = dataset, family = binomial)`
- `str(gaussian())`
  - `linkfun` - Provides a transformation of the response variable
  - `linkinv` - Undoes that transformation (`gaussian()$linkinv`)
- Logistic distribution
  - A probability distribution of a single continuous variable
  - Affected by location and scale parameters
  - As `location` increases, the logistic CDF curve moves rightwards; as `scale` increases, the steepness of the slope decreases
- Visualization
- Logistic distribution CDF (the logistic function)
  - \(\text{cdf}(x) = \frac{1}{1 + \exp(-x)}\)
  - The plot has an S-shape, known as a sigmoid curve
  - It takes an input that can be any number from minus infinity to infinity, and returns a value between zero and one
  - Each x input value is transformed to a unique value, which means the transformation can be reversed (a quick round-trip check follows this list)
- Logistic distribution inverse CDF (the logit function / inverse logistic function)
  - \(\text{inverse\_cdf}(p) = \log\left(\frac{p}{1-p}\right)\)
  - The logit function takes values between zero and one, and returns values between minus infinity and infinity
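A quick round-trip check (base R) that the logit undoes the logistic CDF:
all.equal(qlogis(plogis(2)), 2)
## [1] TRUE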
# `glm()`'s family argument
str(gaussian())
## List of 12
## $ family : chr "gaussian"
## $ link : chr "identity"
## $ linkfun :function (mu)
## $ linkinv :function (eta)
## $ variance :function (mu)
## $ dev.resids:function (y, mu, wt)
## $ aic :function (y, n, mu, wt, dev)
## $ mu.eta :function (eta)
## $ initialize: expression({ n <- rep.int(1, nobs) if (is.null(etastart) && is.null(start) && is.null(mustart) && ((family$link| __truncated__
## $ validmu :function (mu)
## $ valideta :function (eta)
## $ dispersion: num NA
## - attr(*, "class")= chr "family"
# For the gaussian case, each transformation is just the identity function: it returns the same value that you put into it, since no special transformation is needed
gaussian()$linkfun
## function (mu)
## mu
## <environment: namespace:stats>
gaussian()$linkinv
## function (eta)
## eta
## <environment: namespace:stats>
# Logistic PDF
logistic_distn <- tibble(
x = seq(-6, 6, 0.05),
logistic_pdf_x = dlogis(x)
)
ggplot(logistic_distn, aes(x, logistic_pdf_x)) +
geom_line()

# Cumulative distribution function
logistic_distn_cdf <- tibble(
x = seq(-10, 10, 0.1),
logistic_x = plogis(x),
logistic_x_man = 1 / (1 + exp(-x))
)
all.equal(logistic_distn_cdf$logistic_x, logistic_distn_cdf$logistic_x_man)
## [1] TRUE
# Shape depending on location and scale
# As `location` increases, the logistic CDF curve moves rightwards
# As `scale` increases, the steepness of the slope decreases
ggplot(logistic_distn_cdf, aes(x, logistic_x)) +
geom_line()

# Inverse cumulative distribution function
logistic_distn_inv_cdf <- tibble(
p = seq(0.001, 0.999, 0.001),
logit_p = qlogis(p),
logit_p_man = log(p / (1 - p))
)
all.equal(logistic_distn_inv_cdf$logit_p, logistic_distn_inv_cdf$logit_p_man)
## [1] TRUE
ggplot(logistic_distn_inv_cdf, aes(p, logit_p)) +
geom_line()

# binomial family argument
str(binomial())
## List of 13
## $ family : chr "binomial"
## $ link : chr "logit"
## $ linkfun :function (mu)
## $ linkinv :function (eta)
## $ variance :function (mu)
## $ dev.resids:function (y, mu, wt)
## $ aic :function (y, n, mu, wt, dev)
## $ mu.eta :function (eta)
## $ initialize: language { if (NCOL(y) == 1) { ...
## $ validmu :function (mu)
## $ valideta :function (eta)
## $ simulate :function (object, nsim)
## $ dispersion: num 1
## - attr(*, "class")= chr "family"
x <- seq(0.01, 0.99, 0.01)
p <- seq(0.01, 0.99, 0.01)
all.equal(
binomial()$linkinv(x), # Corresponds to the CDF
plogis(x)
)
## [1] TRUE
all.equal(
binomial()$linkfun(p), # Corresponds to the inverse CDF
qlogis(p)
)
## [1] TRUE
How logistic regression works
- The principle is the same as for linear regression - choose a metric that measures how far the predicted responses are from the actual responses, and optimize that metric
- Likelihood - Find the maximum value of `sum(y_pred * y_actual + (1 - y_pred) * (1 - y_actual))`
  - `y_actual` is always 0 or 1
  - `y_pred` is between 0 and 1
- Computing the true likelihood involves multiplying many very small numbers, leading to numerical error; the log-likelihood is easier to compute (see the underflow example below)
- Log-likelihood - `log(y_pred) * y_actual + log(1 - y_pred) * (1 - y_actual)`
- Negative log-likelihood
  - Maximizing log-likelihood is the same as minimizing negative log-likelihood, `-sum(log_likelihoods)`
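A sketch of the underflow problem with made-up per-observation likelihoods: their product hits zero in floating point, while the sum of their logs stays finite:
p <- rep(1e-4, 100) # 100 hypothetical per-observation likelihoods
prod(p) # 0 - the true value, 1e-400, underflows double precision
sum(log(p)) # -921.034 - finite, and maximized by the same coefficients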
# How logistic regression works
# Logistic regression algorithm
x_actual <- churn$time_since_last_purchase
y_actual <- churn$has_churned
intercept <- 1 # try other values too, e.g. -0.05
slope <- 0.5 # try other values too, e.g. 0.275
# Manual calculation
# The logistic regression prediction formula combines a linear combination of predictor variables with the logistic function to model the probability of a binary outcome
y_pred <- plogis(intercept + slope * x_actual)
# Both likelihood and log-likelihood increase to a maximum value
likelihoods <- sum(y_pred * y_actual + (1 - y_pred) * (1 - y_actual)) # Likelihood
likelihoods
## [1] 205.0343
log_likelihoods <- log(y_pred) * y_actual + log(1 - y_pred) * (1 - y_actual) # Log-likelihood
sum(log_likelihoods)
## [1] -326.2599
# Simple version of glm()
calc_neg_log_likelihood <- function(coeffs) {
intercept <- coeffs[1]
slope <- coeffs[2]
y_pred <- plogis(intercept + slope * x_actual)
log_likelihoods <- log(y_pred) * y_actual + log(1 - y_pred) * (1 - y_actual)
-sum(log_likelihoods)
}
# Optimize the metric
optim(
# Initially guess 0 intercept and 1 slope
par = c(intercept = 0, slope = 1),
# Use calc_neg_log_likelihood as the optimization fn
fn = calc_neg_log_likelihood
)
## $par
## intercept slope
## -0.03478255 0.26890041
##
## $value
## [1] 273.2002
##
## $counts
## function gradient
## 51 NA
##
## $convergence
## [1] 0
##
## $message
## NULL
# Compare the coefficients to those calculated by glm()
glm(has_churned ~ time_since_last_purchase, data = churn, family = binomial)
##
## Call: glm(formula = has_churned ~ time_since_last_purchase, family = binomial,
## data = churn)
##
## Coefficients:
## (Intercept) time_since_last_purchase
## -0.03502 0.26921
##
## Degrees of Freedom: 399 Total (i.e. Null); 398 Residual
## Null Deviance: 554.5
## Residual Deviance: 546.4 AIC: 550.4