Part 1: Logistic Regression

For this part of the lab we will be working with the titanic.csv dataset which contains information on the fate of 1,043 passengers on the fatal maiden voyage of the ocean liner ‘Titanic’. Our interest is in predicting whether or not a passenger survived. Because the variable we want to predict is binary (survived = Yes if the passenger survived and survived = No if they did not), logistic regression is appropriate. For this session we’ll assess three predictor variables: pclass, sex, and age.
• survived (dependent variable) = Yes if the passenger survived and No if they did not
• pclass (predictor variable) = the economic class of the passenger summarized as 1st, 2nd, 3rd, and Crew class.
• sex (predictor variable) = reported sex of the passenger (“male”, “female”)
• age (predictor variable) = age of passenger. Note that ages <1 are for infants.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.0 
## ✔ dials        1.2.1      ✔ tune         1.1.2 
## ✔ infer        1.0.6      ✔ workflows    1.1.4 
## ✔ modeldata    1.3.0      ✔ workflowsets 1.0.1 
## ✔ parsnip      1.2.0      ✔ yardstick    1.3.0 
## ✔ recipes      1.0.10
## Warning: package 'rsample' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(vip)
## Warning: package 'vip' was built under R version 4.3.3
## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi
library(readr)
titanic <- read_csv("D:/UC Course/Spring 2024/Data mining/HW/Lab 11/titanic.csv")
## Rows: 1043 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pclass, survived, sex
## dbl (1): age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Recode response variable as a factor
titanic <- mutate(titanic, survived = factor(survived))

titanic
## # A tibble: 1,043 × 4
##    pclass survived sex       age
##    <chr>  <fct>    <chr>   <dbl>
##  1 1st    Yes      female 29    
##  2 1st    Yes      male    0.917
##  3 1st    No       female  2    
##  4 1st    No       male   30    
##  5 1st    No       female 25    
##  6 1st    Yes      male   48    
##  7 1st    Yes      female 63    
##  8 1st    No       male   39    
##  9 1st    Yes      female 53    
## 10 1st    No       male   71    
## # ℹ 1,033 more rows

1. Compute the proportion of survived = Yes vs. survived = No in this data set. In other words, across all observations, what is the percentage of passengers that survived compared to those that did not?

table(titanic$survived)
## 
##  No Yes 
## 618 425
618 + 425
## [1] 1043
618 / 1043
## [1] 0.5925216
425 / 1043
## [1] 0.4074784

2. Explore potential relationships between the predictor variables and the dependent variable. What patterns do you observe? Do any of the predictor variables appear to have an association with the dependent variable? There are many ways to assess these relationships so I’m not expecting a “right” versus “wrong” way to do this. Basically, I want you to use summary statistics or visualization to understand how the predictor variables (age, sex, pclass) relate to the response variable (survived).

set.seed(123)
split <- initial_split(titanic, prop = .7, strata = "survived")
titanic_train <- training(split)
titanic_test <- testing(split)

lr_fit1 <- logistic_reg() %>%
  fit(survived ~ sex, data = titanic_train)

tidy(lr_fit1)
## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     1.18     0.142      8.30 1.04e-16
## 2 sexmale        -2.63     0.186    -14.2  1.71e-45
lr_fit2 <- logistic_reg() %>%
  fit(survived ~ pclass, data = titanic_train)

tidy(lr_fit2)
## # A tibble: 3 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    0.589     0.148      3.98 6.81e- 5
## 2 pclass2nd     -0.904     0.208     -4.35 1.37e- 5
## 3 pclass3rd     -1.63      0.193     -8.46 2.62e-17
lr_fit3 <- logistic_reg() %>%
  fit(survived ~ age, data = titanic_train)

tidy(lr_fit3)
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept) -0.185     0.173       -1.07   0.285
## 2 age         -0.00628   0.00521     -1.21   0.227

3. Using stratified sampling split the data into a training set and a test set. Use a 70/30 split (70% for the training data & 30% for the test set). Set the seed to 123 so that we all get the same split. Assess the distribution of the dependent variable – Is there a similar distribution of the dependent variable between the training and test sets?

set.seed(123)  
split <- initial_split(titanic, prop = 0.7, strata = "survived")
titanic_train <- training(split)
titanic_test  <- testing(split)

4. Fill in the blanks to apply a logistic regression model using all predictor variables while performing a 5-fold cross validation procedure. What is the mean cross validation accuracy rate?

# create resampling procedure
set.seed(123)
kfold <- vfold_cv(titanic_train, v = 5)

# titanic_train model via cross validation
results <- logistic_reg() %>%
  fit_resamples(survived ~ ., kfold)

# collect the average accuracy rate
collect_metrics(results)
## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.789     5  0.0111 Preprocessor1_Model1
## 2 roc_auc  binary     0.851     5  0.0121 Preprocessor1_Model1

5. Execute the code below and assess the coefficients of the model. How do you interpret these coefficients? How do we interpret these coefficients? Since the coefficient for age is negative we can interpret it as the probability of surviving decreases as age increases. In fact, we can state that for every year older a passenger is, the log odds decreases by -0.033553. Or equivalently, it multiplies the odds by e^−0.033553.

# retrain our model across the entire training data
final_fit <- logistic_reg() %>%
fit(survived ~ ., data = titanic_train)
tidy(final_fit)
## # A tibble: 5 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   3.67     0.406        9.03 1.68e-19
## 2 pclass2nd    -1.43     0.275       -5.20 1.99e- 7
## 3 pclass3rd    -2.41     0.284       -8.49 2.03e-17
## 4 sexmale      -2.69     0.205      -13.1  2.33e-39
## 5 age          -0.0336   0.00775     -4.33 1.51e- 5

6. Create a confusion matrix with the test data. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression in this case. Does it experience more false negatives or false positives?

final_fit %>%
predict(titanic_test) %>%
bind_cols(titanic_test %>% select(survived)) %>%
conf_mat(truth = survived, estimate = .pred_class)
##           Truth
## Prediction  No Yes
##        No  151  44
##        Yes  35  84

7. Plot the feature importance for the three predictor variables. Which predictor variable is most influential?

vip(final_fit$fit, num_features = 20)

Part 2: Regularized Regression

For this part of the lab we’ll use the Boston housing data set. Recall that the purpose of this data set is to predict the median value of owner-occupied homes for various census tracts in the Boston area. Each row (observation) represents a given census tract and the variable we wish to predict is cmedv (median value of owner-occupied homes in USD 1000’s). The other variables are variables we want to use to help make predictions of cmedv and include:

boston <- read_csv("boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1. Split the data into a training set and test set using a 70-30% split. Be sure to include the set.seed(123) so that your train and test sets are the same as mine.

set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)

2. Create a recipe that will model cmedv as a function of all predictor variables. Also, be sure to normalize all numeric predictor variables using a Yeo-Johnson transformation and then standardize all numeric predictor variables.

boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())

3. Fit three ridge models, one each with the below penalty (λ) values and use a 5-fold cross-validation procedure to assess the generalization RMSE. Which penalty parameter value resulted in the lowest RMSE?

# Step 1: create ridge model object
ridge_mod <- linear_reg(mixture = 0, penalty = .1) %>%
  set_engine("glmnet")

# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

ridge_wf <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(ridge_mod) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()
## Warning: package 'glmnet' was built under R version 4.3.3
## Warning: package 'Matrix' was built under R version 4.3.3
ridge_wf
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   4.78      5  0.452  Preprocessor1_Model1
## 2 rsq     standard   0.729     5  0.0388 Preprocessor1_Model1
# Step 1: create ridge model object
ridge_mod1 <- linear_reg(mixture = 0, penalty = 10) %>%
  set_engine("glmnet")

# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

ridge_wf1 <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(ridge_mod1) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

ridge_wf1
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   5.42      5  0.712  Preprocessor1_Model1
## 2 rsq     standard   0.695     5  0.0446 Preprocessor1_Model1
# Step 1: create ridge model object
ridge_mod2 <- linear_reg(mixture = 0, penalty = 100) %>%
  set_engine("glmnet")

# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

ridge_wf2 <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(ridge_mod2) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

ridge_wf2
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   7.66      5  0.317  Preprocessor1_Model1
## 2 rsq     standard   0.578     5  0.0580 Preprocessor1_Model1

4. Fit three Lasso models, one each with the below penalty (λ) values and use a 5-fold cross-validation procedure to compute the generalization RMSE. Which penalty parameter value resulted in the lowest RMSE?

# Step 1: create ridge model object
lasso_mod <- linear_reg(mixture = 1, penalty = .01) %>%
  set_engine("glmnet")

# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

lasso_wf <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(lasso_mod) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

lasso_wf
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   4.73      5  0.559  Preprocessor1_Model1
## 2 rsq     standard   0.733     5  0.0537 Preprocessor1_Model1
# Step 1: create ridge model object
lasso_mod1 <- linear_reg(mixture = 1, penalty = .1) %>%
  set_engine("glmnet")

# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

lasso_wf1 <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(lasso_mod1) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

lasso_wf1
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   4.94      5  0.366  Preprocessor1_Model1
## 2 rsq     standard   0.708     5  0.0461 Preprocessor1_Model1
# Step 1: create ridge model object
lasso_mod2 <- linear_reg(mixture = 1, penalty = 0.5) %>%
  set_engine("glmnet")

# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

lasso_wf2 <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(lasso_mod2) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

lasso_wf2
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   5.09      5  0.239  Preprocessor1_Model1
## 2 rsq     standard   0.699     5  0.0175 Preprocessor1_Model1

5. Pick the Ridge or Lasso model from above that had the lowest generalization RMSE. Fit a final model across all the training data, extract that final fit from the workflow object, and then use the vip package to visualize the variable importance scores for all 15 features. Which feature is most influential in this final model?

best_mod <- linear_reg(penalty = 0.01, mixture = 1) %>%
set_engine("glmnet")

best_mod_wf <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(best_mod)

final_fit <- best_mod_wf %>%
  fit(data = boston_train)

final_fit %>%
  extract_fit_parsnip %>%
  vip(num_features = 20)