# Module 11 Lab

# Package setup ----

# Use the HTTPS CRAN mirror so package downloads are not fetched in clear text.
options(repos = c(CRAN = "https://cran.rstudio.com"))

# Packages this lab depends on.
required_packages <- c(
  "tidymodels", "tidyverse", "rsample", "vip",
  "readr", "glmnet", "yardstick", "dplyr"
)

# Install only what is missing. Reinstalling a package that is already
# installed (and possibly loaded) on every run is slow and, on Windows, fails
# with "cannot remove prior installation ... Permission denied" because the
# package DLL is locked by the running session.
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(rsample)

## Warning: package 'rsample' was built under R version 4.3.2

library(vip)

## Warning: package 'vip' was built under R version 4.3.2

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.2

## Warning: package 'readr' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)
library(glmnet)

## Warning: package 'glmnet' was built under R version 4.3.2

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8

library(yardstick)

## Warning: package 'yardstick' was built under R version 4.3.2

## 
## Attaching package: 'yardstick'
## 
## The following object is masked from 'package:readr':
## 
##     spec

library(dplyr)
library(purrr)
library(magrittr)

## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.3.2

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ recipes      1.0.8
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard()     masks purrr::discard()
## ✖ Matrix::expand()      masks tidyr::expand()
## ✖ magrittr::extract()   masks tidyr::extract()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ recipes::fixed()      masks stringr::fixed()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ Matrix::pack()        masks tidyr::pack()
## ✖ magrittr::set_names() masks purrr::set_names()
## ✖ yardstick::spec()     masks readr::spec()
## ✖ recipes::step()       masks stats::step()
## ✖ Matrix::unpack()      masks tidyr::unpack()
## ✖ recipes::update()     masks Matrix::update(), stats::update()
## • Learn how to get started at https://www.tidymodels.org/start/

# Part 1: Feature engineering

# Load the passenger data and store the response variable as a factor
titanic <- read_csv("titanic.csv") %>%
  mutate(survived = factor(survived))

## Rows: 1043 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pclass, survived, sex
## dbl (1): age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# 1. Share of passengers with survived = Yes (the remainder is survived = No)
proportion_survived <- summarise(
  titanic,
  proportion = mean(survived == "Yes")
)

# 2. Explore potential relationships

# Survival rate within each sex
summarize(
  group_by(titanic, sex),
  SurvivalRate = mean(survived == "Yes")
)

# Survival rate within each passenger class
summarize(
  group_by(titanic, pclass),
  SurvivalRate = mean(survived == "Yes")
)

# Stacked proportion bars: survival split by sex
titanic %>%
  ggplot(aes(sex, fill = survived)) +
  geom_bar(position = "fill") +
  labs(fill = "Survived", y = "Proportion", title = "Survival Rate by Gender")

# Stacked proportion bars: survival split by passenger class
titanic %>%
  ggplot(aes(factor(pclass), fill = survived)) +
  geom_bar(position = "fill") +
  labs(
    fill = "Survived",
    y = "Proportion",
    title = "Survival Rate by Passenger Class"
  )

# 3. Split the data

# Stratify on the response so both partitions keep a similar survival mix;
# 70% of the rows go to training and the rest to testing.
set.seed(123)
split <- titanic %>%
  initial_split(strata = "survived", prop = 0.7)
titanic_train <- split %>% training()
titanic_test <- split %>% testing()

# Check that the response is distributed alike in the two partitions
proportions(table(titanic_train[["survived"]]))

## 
##        No       Yes 
## 0.5925926 0.4074074

proportions(table(titanic_test[["survived"]]))

## 
##        No       Yes 
## 0.5923567 0.4076433

# 4. Estimate accuracy with 5-fold cross-validation

# create resampling procedure
set.seed(123)
kfold <- vfold_cv(titanic_train, v = 5)

# Fit the logistic regression on every fold. fit_resamples() takes its rows
# from the `resamples` object, so no `data` argument is passed (supplying one
# only triggers an "objects were passed: 'data'" warning and is ignored).
results <- logistic_reg() %>%
  fit_resamples(survived ~ ., resamples = kfold)

# collect the average accuracy rate across the folds
mean_accuracy <- results %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  summarise(mean_accuracy = mean(mean))

# Print the mean cross-validation accuracy rate
cat("Mean Cross Validation Accuracy Rate:", mean_accuracy$mean_accuracy)

## Mean Cross Validation Accuracy Rate: 0.7887199

# 5. Interpret coefficients
# Refit the logistic regression on the full training set, then list the
# estimated coefficients.
final_fit <- fit(logistic_reg(), survived ~ ., data = titanic_train)
final_fit %>% tidy()

# 6. Create confusion matrix

# Pair each predicted class for the test set with the observed outcome,
# then cross-tabulate predictions against the truth.
conf_matrix <- conf_mat(
  bind_cols(
    predict(final_fit, titanic_test),
    select(titanic_test, survived)
  ),
  truth = survived,
  estimate = .pred_class
)

# Print the confusion matrix
print(conf_matrix)

##           Truth
## Prediction  No Yes
##        No  151  44
##        Yes  35  84

# 7. Plot feature importance
# Variable importance plot for the fitted logistic regression
vip::vip(final_fit)

# Part 2: Regularized Regression
# 1. Split the data
set.seed(123)
boston <- read_csv("boston.csv")

## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# 70/30 train/test split, stratified on the response cmedv
split <- boston %>%
  initial_split(strata = cmedv, prop = 0.7)
boston_train <- split %>% training()
boston_test <- split %>% testing()

# 2. Create recipe
# Define the preprocessing on the training set only, so nothing about the
# held-out rows is used when the recipe is specified. The steps are applied to
# the numeric predictors only: selecting all_numeric() would also transform and
# normalise the outcome (cmedv), putting predictions and RMSE on a transformed
# scale and making the recipe depend on the outcome column at prediction time.
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())

# Fit ridge models and perform cross-validation

# 5-fold cross-validation folds built from the training data
set.seed(123)
boston_kfold <- vfold_cv(boston_train, v = 5)

# Mean cross-validated RMSE of a glmnet linear model for one penalty value.
#
# penalty: regularization strength passed to linear_reg().
# mixture: 0 fits a ridge model (the default here), 1 fits a lasso.
# Returns a single number: the RMSE averaged over the folds in boston_kfold.
cv_rmse <- function(penalty, mixture = 0) {
  model_spec <- linear_reg(penalty = penalty, mixture = mixture) %>%
    set_engine("glmnet")

  workflow() %>%
    add_recipe(boston_recipe) %>%
    add_model(model_spec) %>%
    fit_resamples(resamples = boston_kfold) %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    pull(mean)
}

# First grid of penalty values.
# rmse_values must hold the cross-validated RMSE of each penalty; starting
# from a vector of zeros and never filling it makes which.min() always return
# the first penalty regardless of the data.
penalty_values <- c(100, 10, 0.1)
rmse_values <- vapply(penalty_values, cv_rmse, numeric(1))

# Identify the penalty parameter value with the lowest RMSE
best_penalty <- penalty_values[which.min(rmse_values)]

# Print the results
cat("Penalty parameter value with the lowest RMSE:", best_penalty)

# Second, finer grid of penalty values
penalty_values <- c(0.01, 0.1, 0.5)
rmse_values <- vapply(penalty_values, cv_rmse, numeric(1))

# Identify the penalty parameter value with the lowest RMSE
best_penalty <- penalty_values[which.min(rmse_values)]

# Print the results
cat("Penalty parameter value with the lowest RMSE:", best_penalty)

# 5. Fit the final model and plot feature importance

# Penalty used for the final model; kept in one place so the fitted model and
# the importance plot refer to the same value.
final_penalty <- 0.1

# mixture = 1 requests a lasso penalty from the glmnet engine
best_mod <- linear_reg(penalty = final_penalty, mixture = 1) %>%
  set_engine("glmnet")

best_mod_wf <- workflow() %>%
  add_recipe(boston_recipe) %>%
  add_model(best_mod)

final_fit <- best_mod_wf %>%
  fit(data = boston_train)

# glmnet stores coefficients for a whole sequence of penalties. Pass the chosen
# penalty as `lambda` so the importances come from the model that was actually
# selected, not from whichever penalty vip picks by default.
# A second vip() call on the workflow itself was dropped: it repeated this plot.
final_fit %>%
  extract_fit_parsnip() %>%
  vip::vip(lambda = final_penalty)