Packages

# Helper packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Modeling packages
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
# Model interpretability packages
library(vip)
## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8
library(readr)
titanic <- read_csv("~/Desktop/BANA 4080 R/data_bana4080/titanic.csv")
## Rows: 1043 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pclass, survived, sex
## dbl (1): age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Part 1: Logistic Regression

titanic <- mutate(titanic, survived = factor(survived))
titanic
## # A tibble: 1,043 × 4
##    pclass survived sex       age
##    <chr>  <fct>    <chr>   <dbl>
##  1 1st    Yes      female 29    
##  2 1st    Yes      male    0.917
##  3 1st    No       female  2    
##  4 1st    No       male   30    
##  5 1st    No       female 25    
##  6 1st    Yes      male   48    
##  7 1st    Yes      female 63    
##  8 1st    No       male   39    
##  9 1st    Yes      female 53    
## 10 1st    No       male   71    
## # ℹ 1,033 more rows

Question 1:

table(titanic$survived)
## 
##  No Yes 
## 618 425
618 + 425
## [1] 1043
618 / 1043
## [1] 0.5925216
425 / 1043
## [1] 0.4074784

Question 2:

set.seed(123) 
titanic_split <- initial_split(titanic, prop = .7, strata = "survived")
titanic_train <- training(titanic_split)
titanic_test  <- testing(titanic_split)

lr_fit1 <- logistic_reg() %>%
  fit(survived ~ sex, data = titanic_train)
tidy(lr_fit1)
## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     1.18     0.142      8.30 1.04e-16
## 2 sexmale        -2.63     0.186    -14.2  1.71e-45
lr_fit2 <- logistic_reg() %>%
  fit(survived ~ pclass, data = titanic_train)

tidy(lr_fit2)
## # A tibble: 3 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    0.589     0.148      3.98 6.81e- 5
## 2 pclass2nd     -0.904     0.208     -4.35 1.37e- 5
## 3 pclass3rd     -1.63      0.193     -8.46 2.62e-17
lr_fit3 <- logistic_reg() %>%
  fit(survived ~ age, data = titanic_train)

tidy(lr_fit3)
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept) -0.185     0.173       -1.07   0.285
## 2 age         -0.00628   0.00521     -1.21   0.227

question 3:

set.seed(123)  
titanic_split <- initial_split(titanic, prop = .7, strata = "survived")
titanic_train <- training(titanic_split)
titanic_test  <- testing(titanic_split)

question 4:

set.seed(123)
kfold <- vfold_cv(titanic_train, v = 5)

results <- logistic_reg() %>%
  fit_resamples(survived ~ ., kfold)

collect_metrics(results)
## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.789     5  0.0111 Preprocessor1_Model1
## 2 roc_auc  binary     0.851     5  0.0121 Preprocessor1_Model1

question 5:

final_fit <- logistic_reg() %>%
fit(survived ~ ., data = titanic_train)
tidy(final_fit)
## # A tibble: 5 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   3.67     0.406        9.03 1.68e-19
## 2 pclass2nd    -1.43     0.275       -5.20 1.99e- 7
## 3 pclass3rd    -2.41     0.284       -8.49 2.03e-17
## 4 sexmale      -2.69     0.205      -13.1  2.33e-39
## 5 age          -0.0336   0.00775     -4.33 1.51e- 5

question 6:

final_fit %>%
predict(titanic_test) %>%
bind_cols(titanic_test %>% select(survived)) %>%
conf_mat(truth = survived, estimate = .pred_class)
##           Truth
## Prediction  No Yes
##        No  151  44
##        Yes  35  84

question 7:

vip(final_fit$fit, num_features = 20)

PART 2:

boston <- read_csv("~/Desktop/BANA 4080 R/data_bana4080/boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

question 1:

set.seed(123)  
split <- initial_split(boston, prop = 0.7, strata = "cmedv")
boston_train  <- training(split)
boston_test   <- testing(split)

question 2:

model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

question 3:

ridge_mod <- linear_reg(mixture = 0, penalty = .1) %>%
  set_engine("glmnet")

model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

ridge_wf <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(ridge_mod) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

ridge_wf
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   4.78      5  0.452  Preprocessor1_Model1
## 2 rsq     standard   0.729     5  0.0388 Preprocessor1_Model1
ridge_mod1 <- linear_reg(mixture = 0, penalty = 10) %>%
  set_engine("glmnet")

model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

ridge_wf1 <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(ridge_mod1) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

ridge_wf1
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   5.42      5  0.712  Preprocessor1_Model1
## 2 rsq     standard   0.695     5  0.0446 Preprocessor1_Model1
ridge_mod2 <- linear_reg(mixture = 0, penalty = 100) %>%
  set_engine("glmnet")

model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

ridge_wf2 <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(ridge_mod2) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

ridge_wf2
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   7.66      5  0.317  Preprocessor1_Model1
## 2 rsq     standard   0.578     5  0.0580 Preprocessor1_Model1

question 4:

lasso_mod <- linear_reg(mixture = 1, penalty = .01) %>%
  set_engine("glmnet")

model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

lasso_wf <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(lasso_mod) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

lasso_wf
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   4.73      5  0.559  Preprocessor1_Model1
## 2 rsq     standard   0.733     5  0.0537 Preprocessor1_Model1
lasso_mod1 <- linear_reg(mixture = 1, penalty = .1) %>%
  set_engine("glmnet")

model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

lasso_wf1 <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(lasso_mod1) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

lasso_wf1
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   4.94      5  0.366  Preprocessor1_Model1
## 2 rsq     standard   0.708     5  0.0461 Preprocessor1_Model1
lasso_mod2 <- linear_reg(mixture = 1, penalty = 0.5) %>%
  set_engine("glmnet")

model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

lasso_wf2 <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(lasso_mod2) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

lasso_wf2
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   5.09      5  0.239  Preprocessor1_Model1
## 2 rsq     standard   0.699     5  0.0175 Preprocessor1_Model1

question 5:

best_mod <- linear_reg(penalty = 0.01, mixture = 1) %>%
set_engine("glmnet")

best_mod_wf <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(best_mod)

final_fit <- best_mod_wf %>%
  fit(data = boston_train)

final_fit %>%
  extract_fit_parsnip %>%
  vip(num_features = 20)