Packages
# Helper packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Modeling packages
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
# Model interpretability packages
library(vip)
##
## Attaching package: 'vip'
##
## The following object is masked from 'package:utils':
##
## vi
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(readr)
titanic <- read_csv("~/Desktop/BANA 4080 R/data_bana4080/titanic.csv")
## Rows: 1043 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pclass, survived, sex
## dbl (1): age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Part 1: Logistic Regression
# Encode the response as a factor so the model is treated as classification
titanic <- mutate(titanic, survived = factor(survived))
titanic
## # A tibble: 1,043 × 4
## pclass survived sex age
## <chr> <fct> <chr> <dbl>
## 1 1st Yes female 29
## 2 1st Yes male 0.917
## 3 1st No female 2
## 4 1st No male 30
## 5 1st No female 25
## 6 1st Yes male 48
## 7 1st Yes female 63
## 8 1st No male 39
## 9 1st Yes female 53
## 10 1st No male 71
## # ℹ 1,033 more rows
Question 1:
table(titanic$survived)
##
## No Yes
## 618 425
618 + 425
## [1] 1043
618 / 1043
## [1] 0.5925216
425 / 1043
## [1] 0.4074784
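So roughly 59% of the 1,043 passengers did not survive and about 41% did. The same proportions can be computed in one step; a minimal equivalent to the manual division above:
# Class balance as proportions
prop.table(table(titanic$survived))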
Question 2:
set.seed(123)
titanic_split <- initial_split(titanic, prop = .7, strata = "survived")
titanic_train <- training(titanic_split)
titanic_test <- testing(titanic_split)
# Simple logistic regression: survival modeled on sex alone
lr_fit1 <- logistic_reg() %>%
  fit(survived ~ sex, data = titanic_train)
tidy(lr_fit1)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1.18 0.142 8.30 1.04e-16
## 2 sexmale -2.63 0.186 -14.2 1.71e-45
lr_fit2 <- logistic_reg() %>%
  fit(survived ~ pclass, data = titanic_train)
tidy(lr_fit2)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.589 0.148 3.98 6.81e- 5
## 2 pclass2nd -0.904 0.208 -4.35 1.37e- 5
## 3 pclass3rd -1.63 0.193 -8.46 2.62e-17
lr_fit3 <- logistic_reg() %>%
  fit(survived ~ age, data = titanic_train)
tidy(lr_fit3)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.185 0.173 -1.07 0.285
## 2 age -0.00628 0.00521 -1.21 0.227
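Sex and passenger class are each strongly associated with survival, while age on its own is not significant (p ≈ 0.23). To read a fitted model on the odds scale, the coefficients can be exponentiated; a quick sketch (broom's tidy() should pass exponentiate = TRUE through to the underlying glm):
# Odds ratios instead of log-odds, e.g. exp(-2.63) ≈ 0.07 for sexmale
tidy(lr_fit1, exponentiate = TRUE)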
Question 3:
The data are split 70/30, stratified on survived so both sets keep the same class balance (the same split was created ahead of the models in Question 2):
set.seed(123)
titanic_split <- initial_split(titanic, prop = .7, strata = "survived")
titanic_train <- training(titanic_split)
titanic_test <- testing(titanic_split)
Question 4:
set.seed(123)
kfold <- vfold_cv(titanic_train, v = 5)
# Estimate out-of-sample performance of the full model with 5-fold cross-validation
results <- logistic_reg() %>%
  fit_resamples(survived ~ ., kfold)
collect_metrics(results)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.789 5 0.0111 Preprocessor1_Model1
## 2 roc_auc binary 0.851 5 0.0121 Preprocessor1_Model1
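The fold-by-fold values behind these averages can be inspected by turning off summarizing; a quick sketch:
# Per-fold accuracy and ROC AUC rather than the 5-fold means
collect_metrics(results, summarize = FALSE)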
Question 5:
# Refit the full model on all of the training data
final_fit <- logistic_reg() %>%
  fit(survived ~ ., data = titanic_train)
tidy(final_fit)
## # A tibble: 5 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.67 0.406 9.03 1.68e-19
## 2 pclass2nd -1.43 0.275 -5.20 1.99e- 7
## 3 pclass3rd -2.41 0.284 -8.49 2.03e-17
## 4 sexmale -2.69 0.205 -13.1 2.33e-39
## 5 age -0.0336 0.00775 -4.33 1.51e- 5
In the combined model every predictor is significant: being male, traveling in a lower class, and being older all reduce the odds of survival.
Question 6:
# Confusion matrix of test-set predictions against the observed outcomes
final_fit %>%
  predict(titanic_test) %>%
  bind_cols(titanic_test %>% select(survived)) %>%
  conf_mat(truth = survived, estimate = .pred_class)
## Truth
## Prediction No Yes
## No 151 44
## Yes 35 84
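The matrix implies an overall test accuracy of (151 + 84) / 294 ≈ 0.80. yardstick can derive accuracy, sensitivity, specificity, and related statistics directly from the same confusion matrix; a sketch:
# Summary statistics computed from the confusion matrix
final_fit %>%
  predict(titanic_test) %>%
  bind_cols(titanic_test %>% select(survived)) %>%
  conf_mat(truth = survived, estimate = .pred_class) %>%
  summary()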
Question 7:
vip(final_fit$fit, num_features = 20)
(Variable importance plot; consistent with the |z| statistics above, sex is the most influential predictor, followed by passenger class and then age.)
Part 2: Regularized Regression
boston <- read_csv("~/Desktop/BANA 4080 R/data_bana4080/boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Question 1:
set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = "cmedv")
boston_train <- training(split)
boston_test <- testing(split)
Question 2:
# Recipe: normalize the numeric predictors, then dummy-encode any nominal ones
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())
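To check what the recipe will feed the model, it can be prepped and baked on the training data; a quick sketch:
# Inspect the preprocessed predictors the recipe produces
model_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()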
Question 3:
# Ridge regression (mixture = 0) evaluated at three candidate penalties,
# reusing the same recipe and folds throughout so the comparison is fair
ridge_mod <- linear_reg(mixture = 0, penalty = .1) %>%
  set_engine("glmnet")

set.seed(123)
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

ridge_wf <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(ridge_mod) %>%
  fit_resamples(kfolds) %>%
  collect_metrics()
ridge_wf
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 4.78 5 0.452 Preprocessor1_Model1
## 2 rsq standard 0.729 5 0.0388 Preprocessor1_Model1
ridge_mod1 <- linear_reg(mixture = 0, penalty = 10) %>%
  set_engine("glmnet")

ridge_wf1 <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(ridge_mod1) %>%
  fit_resamples(kfolds) %>%
  collect_metrics()
ridge_wf1
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 5.42 5 0.712 Preprocessor1_Model1
## 2 rsq standard 0.695 5 0.0446 Preprocessor1_Model1
ridge_mod2 <- linear_reg(mixture = 0, penalty = 100) %>%
  set_engine("glmnet")

ridge_wf2 <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(ridge_mod2) %>%
  fit_resamples(kfolds) %>%
  collect_metrics()
ridge_wf2
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 7.66 5 0.317 Preprocessor1_Model1
## 2 rsq standard 0.578 5 0.0580 Preprocessor1_Model1
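Hand-picking penalties works, but the penalty can also be tuned over a grid against the same folds; a minimal sketch (the 20-level grid is an arbitrary choice):
# Tune the ridge penalty over a log-spaced grid of candidate values
ridge_tune <- linear_reg(mixture = 0, penalty = tune()) %>%
  set_engine("glmnet")

ridge_res <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(ridge_tune) %>%
  tune_grid(resamples = kfolds, grid = grid_regular(penalty(), levels = 20))

# Penalties with the lowest cross-validated RMSE
show_best(ridge_res, metric = "rmse")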
Question 4:
# Lasso regression (mixture = 1) evaluated at three candidate penalties
lasso_mod <- linear_reg(mixture = 1, penalty = .01) %>%
  set_engine("glmnet")

lasso_wf <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(lasso_mod) %>%
  fit_resamples(kfolds) %>%
  collect_metrics()
lasso_wf
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 4.73 5 0.559 Preprocessor1_Model1
## 2 rsq standard 0.733 5 0.0537 Preprocessor1_Model1
lasso_mod1 <- linear_reg(mixture = 1, penalty = .1) %>%
  set_engine("glmnet")

lasso_wf1 <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(lasso_mod1) %>%
  fit_resamples(kfolds) %>%
  collect_metrics()
lasso_wf1
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 4.94 5 0.366 Preprocessor1_Model1
## 2 rsq standard 0.708 5 0.0461 Preprocessor1_Model1
lasso_mod2 <- linear_reg(mixture = 1, penalty = 0.5) %>%
  set_engine("glmnet")

lasso_wf2 <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(lasso_mod2) %>%
  fit_resamples(kfolds) %>%
  collect_metrics()
lasso_wf2
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 5.09 5 0.239 Preprocessor1_Model1
## 2 rsq standard 0.699 5 0.0175 Preprocessor1_Model1
Question 5:
Of the six penalties tried above, the lasso with penalty = 0.01 gave the lowest cross-validated RMSE (4.73), so it is refit on the full training set.
best_mod <- linear_reg(penalty = 0.01, mixture = 1) %>%
  set_engine("glmnet")

best_mod_wf <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(best_mod)

final_fit <- best_mod_wf %>%
  fit(data = boston_train)

final_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 20)
(Variable importance plot of the fitted lasso coefficients.)

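As a final check, the selected workflow can be scored on the held-out test set; a sketch:
# Test-set RMSE for the final lasso workflow
final_fit %>%
  predict(boston_test) %>%
  bind_cols(boston_test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)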