For this part of the lab we will be working with the titanic.csv dataset which contains information on the fate of 1,043 passengers on the fatal maiden voyage of the ocean liner ‘Titanic’. Our interest is in predicting whether or not a passenger survived. Because the variable we want to predict is binary (survived = Yes if the passenger survived and survived = No if they did not), logistic regression is appropriate. For this session we’ll assess three predictor variables: pclass, sex, and age.
• survived (dependent variable) = Yes if the passenger survived and No if they did not
• pclass (predictor variable) = the ticket class of the passenger (a proxy for economic class): 1st, 2nd, or 3rd
• sex (predictor variable) = reported sex of the passenger (“male”, “female”)
• age (predictor variable) = age of passenger. Note that ages <1 are for infants.
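Recall the form of the logistic regression model: rather than modeling the probability of survival p directly, we model its log-odds as a linear function of the predictors,

$$\log\left(\frac{p}{1 - p}\right) = \beta_0 + \beta_1 x_1 + \cdots + \beta_k x_k,$$

so the coefficients reported below are on the log-odds scale.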
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ recipes 1.1.0
## ✔ dials 1.3.0 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.2
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.4.2
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(vip)
##
## Attaching package: 'vip'
##
## The following object is masked from 'package:utils':
##
## vi
titanic <- read_csv("titanic.csv")
## Rows: 1043 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pclass, survived, sex
## dbl (1): age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Recode the response variable as a factor:
titanic <- mutate(titanic, survived = factor(survived))
table(titanic$survived)
##
## No Yes
## 618 425
# total
618 + 425
## [1] 1043
# proportion that did not survive
618 / 1043
## [1] 0.5925216
# proportion that survived
425 / 1043
## [1] 0.4074784
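The same proportions can be computed directly from the data rather than by hand; a small dplyr sketch:

titanic %>%
  count(survived) %>%
  mutate(prop = n / sum(n))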
set.seed(123)
split <- initial_split(titanic, prop = 0.7, strata = "survived")
titanic_train <- training(split)
titanic_test <- testing(split)
lr_fit1 <- logistic_reg() %>%
fit(survived ~ sex, data = titanic_train)
tidy(lr_fit1)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1.18 0.142 8.30 1.04e-16
## 2 sexmale -2.63 0.186 -14.2 1.71e-45
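These estimates are on the log-odds scale; exponentiating converts them to odds ratios. For instance, exp(-2.63) ≈ 0.07, meaning the odds of survival for male passengers are only about 7% of the odds for female passengers. One way to get the odds ratios directly (a sketch; the exponentiate argument is passed through to broom's tidy method for glm):

tidy(lr_fit1, exponentiate = TRUE)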
lr_fit2 <- logistic_reg() %>%
fit(survived ~ pclass, data = titanic_train)
tidy(lr_fit2)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.589 0.148 3.98 6.81e- 5
## 2 pclass2nd -0.904 0.208 -4.35 1.37e- 5
## 3 pclass3rd -1.63 0.193 -8.46 2.62e-17
lr_fit3 <- logistic_reg() %>%
fit(survived ~ age, data = titanic_train)
tidy(lr_fit3)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.185 0.173 -1.07 0.285
## 2 age -0.00628 0.00521 -1.21 0.227
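Note that age on its own is not a significant predictor (p = 0.227). Any of these fits can also produce predicted probabilities with predict() and type = "prob"; a minimal sketch using the sex-only model (the two-row tibble of new data is made up for illustration):

predict(lr_fit1, new_data = tibble(sex = c("female", "male")), type = "prob")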
# create resampling procedure
set.seed(123)
kfold <- vfold_cv(titanic_train, v = 5)
# fit the model via 5-fold cross-validation on the training data
results <- logistic_reg() %>%
fit_resamples(survived ~ ., kfold)
# collect the average accuracy rate
collect_metrics(results)
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.789 5 0.0111 Preprocessor1_Model1
## 2 brier_class binary 0.146 5 0.00606 Preprocessor1_Model1
## 3 roc_auc binary 0.851 5 0.0121 Preprocessor1_Model1
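By default collect_metrics() averages over the five folds; to see the fold-by-fold values instead, you can pass summarize = FALSE (a quick sketch):

# per-fold metrics rather than cross-validated averages
collect_metrics(results, summarize = FALSE)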
# retrain our model on the entire training data
final_fit <- logistic_reg() %>%
fit(survived ~ ., data = titanic_train)
tidy(final_fit)
## # A tibble: 5 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.67 0.406 9.03 1.68e-19
## 2 pclass2nd -1.43 0.275 -5.20 1.99e- 7
## 3 pclass3rd -2.41 0.284 -8.49 2.03e-17
## 4 sexmale -2.69 0.205 -13.1 2.33e-39
## 5 age -0.0336 0.00775 -4.33 1.51e- 5
final_fit %>%
predict(titanic_test) %>%
bind_cols(titanic_test %>% select(survived)) %>%
conf_mat(survived, estimate = .pred_class)
##           Truth
## Prediction  No Yes
##        No  151  44
##        Yes  35  84
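From the confusion matrix, the test accuracy is (151 + 84) / 314 ≈ 0.75, slightly below the cross-validated estimate of 0.789. yardstick can compute this directly (a minimal sketch):

final_fit %>%
  predict(titanic_test) %>%
  bind_cols(titanic_test %>% select(survived)) %>%
  accuracy(survived, estimate = .pred_class)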
vip(final_fit)
For this part of the lab we’ll use the Boston housing data set. Recall that the goal with this data set is to predict the median value of owner-occupied homes for various census tracts in the Boston area. Each row (observation) represents a census tract, and the variable we wish to predict is cmedv (median value of owner-occupied homes in USD 1000s). The remaining variables are predictors we can use to help model cmedv:
• lon: longitude of census tract
• lat: latitude of census tract
• crim: per capita crime rate by town
• zn: proportion of residential land zoned for lots over 25,000 sq.ft
• indus: proportion of non-retail business acres per town
• chas: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
• nox: nitric oxides concentration (parts per 10 million); a measure of air pollution
• rm: average number of rooms per dwelling
• age: proportion of owner-occupied units built prior to 1940
• dis: weighted distances to five Boston employment centers
• rad: index of accessibility to radial highways
• tax: full-value property-tax rate per USD 10,000
• ptratio: pupil-teacher ratio by town
• lstat: percentage of lower status of the population
boston <- read_csv("boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
Next, fit a ridge regression model (mixture = 0) with 5-fold cross-validation for each of the following penalty values and compare the results:
• penalty = 0.1
• penalty = 10
• penalty = 100
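Recall what penalty controls: ridge regression minimizes the least-squares loss plus an L2 penalty on the coefficients, so larger penalty values (λ) shrink the coefficients more strongly toward zero:

$$\sum_{i=1}^{n}\left(y_i - \beta_0 - \sum_{j=1}^{p} \beta_j x_{ij}\right)^2 + \lambda \sum_{j=1}^{p} \beta_j^2$$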
# ridge model
ridge_mod <- linear_reg(mixture = 0, penalty = 0.1) %>%
set_engine("glmnet")
# Create Model & Recipe
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Fit Model
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
ridge_wf <- workflow() %>%
add_recipe(boston_recipe) %>%
add_model(ridge_mod) %>%
fit_resamples(kfolds) %>%
collect_metrics()
ridge_wf
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 4.81 5 0.432 Preprocessor1_Model1
## 2 rsq standard 0.728 5 0.0373 Preprocessor1_Model1
# ridge model
ridge_mod1 <- linear_reg(mixture = 0, penalty = 10) %>%
set_engine("glmnet")
# create model & recipe
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Fit model
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
ridge_wf1 <- workflow() %>%
add_recipe(boston_recipe) %>%
add_model(ridge_mod1) %>%
fit_resamples(kfolds) %>%
collect_metrics()
ridge_wf1
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 5.42 5 0.712 Preprocessor1_Model1
## 2 rsq standard 0.695 5 0.0446 Preprocessor1_Model1
# ridge model
ridge_mod2 <- linear_reg(mixture = 0, penalty = 100) %>%
set_engine("glmnet")
# create model & recipe
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Fit model
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
ridge_wf2 <- workflow() %>%
add_recipe(boston_recipe) %>%
add_model(ridge_mod2) %>%
fit_resamples(kfolds) %>%
collect_metrics()
ridge_wf2
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 7.66 5 0.317 Preprocessor1_Model1
## 2 rsq standard 0.578 5 0.0580 Preprocessor1_Model1
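The cross-validated RMSE worsens as the penalty grows (4.81 → 5.42 → 7.66), so heavier ridge shrinkage hurts this model. Rather than fitting one penalty at a time, we could also let tune_grid() search over a grid; a sketch under the same recipe and folds (the grid range, in log10 units, is an assumption for illustration):

# tune the ridge penalty over a regular grid
ridge_tune <- linear_reg(mixture = 0, penalty = tune()) %>%
  set_engine("glmnet")

workflow() %>%
  add_recipe(boston_recipe) %>%
  add_model(ridge_tune) %>%
  tune_grid(kfolds, grid = grid_regular(penalty(range = c(-2, 2)), levels = 20)) %>%
  show_best(metric = "rmse")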
Now repeat the comparison with the lasso (mixture = 1), again with 5-fold cross-validation, using the following penalty values:
• penalty = 0.01
• penalty = 0.1
• penalty = 0.5
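The lasso replaces the L2 penalty with an L1 penalty, which can shrink coefficients exactly to zero and therefore performs variable selection:

$$\sum_{i=1}^{n}\left(y_i - \beta_0 - \sum_{j=1}^{p} \beta_j x_{ij}\right)^2 + \lambda \sum_{j=1}^{p} |\beta_j|$$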
# lasso model
lasso_mod <- linear_reg(mixture = 1, penalty = 0.01) %>%
set_engine("glmnet")
# create model & recipe
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Fit model
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
lasso_wf <- workflow() %>%
add_recipe(boston_recipe ) %>%
add_model(lasso_mod) %>%
fit_resamples(kfolds) %>%
collect_metrics()
lasso_mod
## Linear Regression Model Specification (regression)
##
## Main Arguments:
## penalty = 0.01
## mixture = 1
##
## Computational engine: glmnet
# lasso model
lasso_mod1 <- linear_reg(mixture = 1, penalty = 0.1) %>%
set_engine("glmnet")
# create model & recipe
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Fit model
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
lasso_wf1 <- workflow() %>%
add_recipe(boston_recipe) %>%
add_model(lasso_mod1) %>%
fit_resamples(kfolds) %>%
collect_metrics()
lasso_mod1
## Linear Regression Model Specification (regression)
##
## Main Arguments:
## penalty = 0.1
## mixture = 1
##
## Computational engine: glmnet
# lasso model
lasso_mod2 <- linear_reg(mixture = 1, penalty = 0.5) %>%
set_engine("glmnet")
# create model & recipe
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Fit model
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
lasso_wf2 <- workflow() %>%
add_recipe(boston_recipe) %>%
add_model(lasso_mod2) %>%
fit_resamples(kfolds) %>%
collect_metrics()
lasso_mod2
## Linear Regression Model Specification (regression)
##
## Main Arguments:
## penalty = 0.5
## mixture = 1
##
## Computational engine: glmnet
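Note that printing lasso_mod, lasso_mod1, and lasso_mod2 only echoes the model specifications. The cross-validated metrics were collected into the corresponding workflow objects, so to compare the three penalties print those instead (a quick sketch):

lasso_wf
lasso_wf1
lasso_wf2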
# the chosen final model: lasso with penalty = 0.1
best_mod <- linear_reg(penalty = 0.1, mixture = 1) %>%
set_engine("glmnet")
best_mod_wf <- workflow() %>%
add_recipe(boston_recipe) %>%
add_model(best_mod)
final_fit <- best_mod_wf %>%
fit(data = boston_train)
final_fit %>%
extract_fit_parsnip() %>%
vip(num_features = 15)
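Finally, the fitted workflow can be evaluated on the held-out test set; a minimal sketch using yardstick's default regression metrics (RMSE, R-squared, and MAE):

final_fit %>%
  predict(boston_test) %>%
  bind_cols(boston_test %>% select(cmedv)) %>%
  metrics(truth = cmedv, estimate = .pred)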