Part 1: Logistic Regression

For this part of the lab we will be working with the titanic.csv dataset which contains information on the fate of 1,043 passengers on the fatal maiden voyage of the ocean liner ‘Titanic’. Our interest is in predicting whether or not a passenger survived. Because the variable we want to predict is binary (survived = Yes if the passenger survived and survived = No if they did not), logistic regression is appropriate. For this session we’ll assess three predictor variables: pclass, sex, and age.

• survived (dependent variable) = Yes if the passenger survived and No if they did not

• pclass (predictor variable) = the economic class of the passenger (1st, 2nd, or 3rd class).

• sex (predictor variable) = reported sex of the passenger (“male”, “female”)

• age (predictor variable) = age of passenger. Note that ages <1 are for infants.

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ recipes      1.1.0
## ✔ dials        1.3.0     ✔ rsample      1.2.1
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
## ✔ infer        1.0.7     ✔ tune         1.2.1
## ✔ modeldata    1.4.0     ✔ workflows    1.1.4
## ✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.2     ✔ yardstick    1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ stringr   1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ readr::spec()       masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.2
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.4.2
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8
library(vip)
## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi
titanic <- read_csv("titanic.csv")
## Rows: 1043 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pclass, survived, sex
## dbl (1): age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Recode the response variable as a factor:

titanic <- mutate(titanic, survived = factor(survived))
  1. Compute the proportion of survived = Yes vs. survived = No in this data set. In other words, across all observations, what is the percentage of passengers that survived compared to those that did not?
table(titanic$survived)
## 
##  No Yes 
## 618 425
# total number of passengers
618 + 425
## [1] 1043
# proportion that did not survive
618 / 1043
## [1] 0.5925216
# proportion that survived
425 / 1043
## [1] 0.4074784
  2. Explore potential relationships between the predictor variables and the dependent variable. What patterns do you observe? Do any of the predictor variables appear to have an association with the dependent variable?
set.seed(123)
split <- initial_split(titanic, prop = 0.7, strata = "survived")
titanic_train <- training(split)
titanic_test <- testing(split)

lr_fit1 <- logistic_reg() %>%
  fit(survived ~ sex, data = titanic_train)

tidy(lr_fit1)
## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     1.18     0.142      8.30 1.04e-16
## 2 sexmale        -2.63     0.186    -14.2  1.71e-45
lr_fit2 <- logistic_reg() %>%
  fit(survived ~ pclass, data = titanic_train)

tidy(lr_fit2)
## # A tibble: 3 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    0.589     0.148      3.98 6.81e- 5
## 2 pclass2nd     -0.904     0.208     -4.35 1.37e- 5
## 3 pclass3rd     -1.63      0.193     -8.46 2.62e-17
lr_fit3 <- logistic_reg() %>%
  fit(survived ~ age, data = titanic_train)

tidy(lr_fit3)
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept) -0.185     0.173       -1.07   0.285
## 2 age         -0.00628   0.00521     -1.21   0.227
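Simple summary plots offer another way to explore these relationships before modeling; here is one possible sketch with ggplot2, using filled bar charts for the categorical predictors and a boxplot for age:

ggplot(titanic_train, aes(x = sex, fill = survived)) +
  geom_bar(position = "fill") +
  labs(y = "proportion")

ggplot(titanic_train, aes(x = pclass, fill = survived)) +
  geom_bar(position = "fill") +
  labs(y = "proportion")

ggplot(titanic_train, aes(x = survived, y = age)) +
  geom_boxplot()

The single-predictor model summaries point to clear patterns: sex and pclass are strongly associated with survival (the sexmale and pclass coefficients are negative and highly significant, so females and 1st-class passengers survived at higher rates), while age on its own shows little association (p ≈ 0.23).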
  3. Using stratified sampling, split the data into a training set and a test set. Use a 70/30 split (70% for the training data & 30% for the test set). Set the seed to 123 so that we all get the same split. Assess the distribution of the dependent variable – is there a similar distribution of the dependent variable between the training and test sets?
set.seed(123)
split <- initial_split(titanic, prop = 0.7, strata = "survived")
titanic_train <- training(split)
titanic_test <- testing(split)
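To confirm the stratification worked, compare the distribution of the dependent variable across the two sets; a quick check:

titanic_train %>%
  count(survived) %>%
  mutate(prop = n / sum(n))

titanic_test %>%
  count(survived) %>%
  mutate(prop = n / sum(n))

Because the split is stratified on survived, both sets should show approximately the same ~59% No / ~41% Yes distribution as the full data set.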
  4. Fill in the blanks to apply a logistic regression model using all predictor variables while performing a 5-fold cross validation procedure. What is the mean cross validation accuracy rate?
# create resampling procedure
set.seed(123)
kfold <- vfold_cv(titanic_train, v = 5)

# fit the model via 5-fold cross validation
results <- logistic_reg() %>%
  fit_resamples(survived ~ ., resamples = kfold)

# collect the average accuracy rate
collect_metrics(results)
## # A tibble: 3 × 6
##   .metric     .estimator  mean     n std_err .config             
##   <chr>       <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy    binary     0.789     5 0.0111  Preprocessor1_Model1
## 2 brier_class binary     0.146     5 0.00606 Preprocessor1_Model1
## 3 roc_auc     binary     0.851     5 0.0121  Preprocessor1_Model1
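The mean cross validation accuracy rate is the accuracy row above: roughly 0.789, i.e. the model correctly classifies about 79% of held-out passengers on average across the five folds.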
  5. Execute the code below and assess the coefficients of the model. How do we interpret these coefficients? Since the coefficient for age is negative, we can interpret it to mean that the probability of surviving decreases as age increases. In fact, we can state that for every year older a passenger is, the log odds of survival decrease by 0.033553. Or equivalently, each additional year multiplies the odds by e^−0.033553 ≈ 0.967.
# retrain our model across the entire training data
final_fit <- logistic_reg() %>%
  fit(survived ~ ., data = titanic_train)

tidy(final_fit)
## # A tibble: 5 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   3.67     0.406        9.03 1.68e-19
## 2 pclass2nd    -1.43     0.275       -5.20 1.99e- 7
## 3 pclass3rd    -2.41     0.284       -8.49 2.03e-17
## 4 sexmale      -2.69     0.205      -13.1  2.33e-39
## 5 age          -0.0336   0.00775     -4.33 1.51e- 5
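Exponentiating the estimates puts all of the coefficients on the odds scale; a quick sketch:

tidy(final_fit) %>%
  mutate(odds_ratio = exp(estimate))

For age, exp(−0.0336) ≈ 0.967, so each additional year of age multiplies the odds of survival by about 0.967. Likewise, exp(−2.69) ≈ 0.068 for sexmale: holding class and age fixed, the odds of survival for a male passenger are roughly 7% of the odds for a female passenger.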
  6. Create a confusion matrix with the test data. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression in this case. Does it experience more false negatives or false positives?
final_fit %>%
  predict(titanic_test) %>%
  bind_cols(titanic_test %>% select(survived)) %>%
  conf_mat(survived, estimate = .pred_class)
##           Truth
## Prediction  No Yes
##        No  151  44
##        Yes  35  84
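Treating survived = Yes as the class of interest, the model makes 44 false negatives (predicted No when the passenger actually survived) and 35 false positives (predicted Yes when the passenger did not). So in this case logistic regression experiences more false negatives than false positives: it misses survivors more often than it wrongly predicts survival.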
  7. Plot the feature importance for the three predictor variables. Which predictor variable is most influential?
vip(final_fit)
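By default, vip ranks GLM predictors by the absolute value of their z-statistics, so the plot should mirror the coefficient table above: sex is the most influential predictor (|z| ≈ 13.1), followed by pclass and then age.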

Part 2: Regularized Regression

For this part of the lab we’ll use the Boston housing data set. Recall that the purpose of this data set is to predict the median value of owner-occupied homes for various census tracts in the Boston area. Each row (observation) represents a given census tract, and the variable we wish to predict is cmedv (median value of owner-occupied homes in USD 1000s). The remaining variables are the predictors we will use to help predict cmedv:

• lon: longitude of census tract

• lat: latitude of census tract

• crim: per capita crime rate by town

• zn: proportion of residential land zoned for lots over 25,000 sq.ft

• indus: proportion of non-retail business acres per town

• chas: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)

• nox: nitric oxides concentration (parts per 10 million), a proxy for air pollution

• rm: average number of rooms per dwelling

• age: proportion of owner-occupied units built prior to 1940

• dis: weighted distances to five Boston employment centers

• rad: index of accessibility to radial highways

• tax: full-value property-tax rate per USD 10,000

• ptratio: pupil-teacher ratio by town

• lstat: percentage of lower status of the population

boston <- read_csv("boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  1. Split the data into a training set and test set using a 70/30 split. Be sure to include set.seed(123) so that your train and test sets are the same as mine.
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)
  2. Create a recipe that will model cmedv as a function of all predictor variables. Also, be sure to normalize all numeric predictor variables using a Yeo-Johnson transformation and then standardize them.
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())
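To sanity-check the recipe, prep it on the training data and bake it; the numeric predictors should come back transformed, centered, and scaled. A quick sketch:

boston_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()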
  3. Fit three ridge models, one for each of the penalty (λ) values below, and use a 5-fold cross-validation procedure to assess the generalization RMSE. Which penalty parameter value resulted in the lowest RMSE?

• penalty = 0.1

• penalty = 10

• penalty = 100

# ridge model (mixture = 0) with penalty = 0.1
ridge_mod <- linear_reg(penalty = 0.1, mixture = 0) %>%
  set_engine("glmnet")

# create recipe
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

# Fit model
set.seed(123)  # seed the folds so each penalty is compared on the same resamples
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

ridge_wf <- workflow() %>%
  add_recipe(boston_recipe) %>%
  add_model(ridge_mod) %>%
  fit_resamples(kfolds) %>%
  collect_metrics()

ridge_wf
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   4.81      5  0.432  Preprocessor1_Model1
## 2 rsq     standard   0.728     5  0.0373 Preprocessor1_Model1
# ridge model with penalty = 10
ridge_mod1 <- linear_reg(mixture = 0, penalty = 10) %>%
  set_engine("glmnet")

# create recipe
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Fit model
set.seed(123)  # same folds as above
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

ridge_wf1 <- workflow() %>%
   add_recipe(boston_recipe) %>%
   add_model(ridge_mod1) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

ridge_wf1
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   5.42      5  0.712  Preprocessor1_Model1
## 2 rsq     standard   0.695     5  0.0446 Preprocessor1_Model1
# ridge model with penalty = 100
ridge_mod2 <- linear_reg(mixture = 0, penalty = 100) %>%
  set_engine("glmnet")

# create recipe
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Fit model
set.seed(123)  # same folds as above
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

ridge_wf2 <- workflow() %>%
   add_recipe(boston_recipe) %>%
   add_model(ridge_mod2) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

ridge_wf2
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   7.66      5  0.317  Preprocessor1_Model1
## 2 rsq     standard   0.578     5  0.0580 Preprocessor1_Model1
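Across the three cross-validated fits, penalty = 0.1 yields the lowest RMSE (about 4.81, versus 5.42 at penalty = 10 and 7.66 at penalty = 100), so the smallest penalty performs best here. As an aside, rather than repeating the workflow three times, the same comparison can be run in one pass by marking the penalty for tuning; a sketch using tune_grid():

# mark the penalty as a tuning parameter
ridge_tune <- linear_reg(penalty = tune(), mixture = 0) %>%
  set_engine("glmnet")

# evaluate all three candidate penalties on the same folds
set.seed(123)
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

workflow() %>%
  add_recipe(boston_recipe) %>%
  add_model(ridge_tune) %>%
  tune_grid(resamples = kfolds, grid = tibble(penalty = c(0.1, 10, 100))) %>%
  collect_metrics()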
  4. Fit three Lasso models, one for each of the penalty (λ) values below, and use a 5-fold cross-validation procedure to compute the generalization RMSE. Which penalty parameter value resulted in the lowest RMSE?

• penalty = 0.01

• penalty = 0.1

• penalty = 0.5

# lasso model with penalty = 0.01
lasso_mod <- linear_reg(mixture = 1, penalty = 0.01) %>%
  set_engine("glmnet")

# create recipe
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Fit model
set.seed(123)  # same folds as above
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

lasso_wf <- workflow() %>%
   add_recipe(boston_recipe) %>%
   add_model(lasso_mod) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

lasso_wf
# lasso model with penalty = 0.1
lasso_mod1 <- linear_reg(mixture = 1, penalty = 0.1) %>%
  set_engine("glmnet")

# create recipe
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Fit model
set.seed(123)  # same folds as above
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

lasso_wf1 <- workflow() %>%
   add_recipe(boston_recipe) %>%
   add_model(lasso_mod1) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

lasso_wf1
# lasso model with penalty = 0.5
lasso_mod2 <- linear_reg(mixture = 1, penalty = 0.5) %>%
  set_engine("glmnet")

# create recipe
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Fit model
set.seed(123)  # same folds as above
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

lasso_wf2 <- workflow() %>%
   add_recipe(boston_recipe) %>%
   add_model(lasso_mod2) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

lasso_wf2
  5. Pick the Ridge or Lasso model from above that had the lowest generalization RMSE. Fit a final model across all the training data, extract that final fit from the workflow object, and then use the vip package to visualize the variable importance scores for all 15 features. Which feature is most influential in this final model?
best_mod <- linear_reg(penalty = 0.1, mixture = 1) %>%
  set_engine("glmnet")

best_mod_wf <- workflow() %>%
  add_recipe(boston_recipe) %>%
  add_model(best_mod)

final_fit <- best_mod_wf %>%
  fit(data = boston_train)

final_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 15)
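For a glmnet model, vip scores each feature by the absolute value of its coefficient at the specified penalty, which is why standardizing the predictors in the recipe matters for a fair comparison. You can also inspect the coefficients directly (a sketch; the lasso penalty may have shrunk some of them exactly to zero):

final_fit %>%
  extract_fit_parsnip() %>%
  tidy()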