# Package setup ----
# Use the HTTPS CRAN mirror so packages are not downloaded over plain HTTP.
options(repos = c(CRAN = "https://cran.rstudio.com"))

# Packages this script installs before attaching them with library().
required_packages <- c(
  "tidymodels", "tidyverse", "rsample", "vip",
  "readr", "glmnet", "yardstick", "dplyr"
)

# Install only what is missing. Reinstalling everything on each run
# re-downloads every package, and in a recorded run it produced
# "cannot remove prior installation ... Permission denied" warnings for
# readr, glmnet, yardstick and dplyr (most likely because their DLLs were
# already in use by the session).
is_installed <- vapply(
  required_packages,
  function(pkg) length(find.package(pkg, quiet = TRUE)) > 0,
  logical(1)
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(rsample)
## Warning: package 'rsample' was built under R version 4.3.2
library(vip)
## Warning: package 'vip' was built under R version 4.3.2
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'readr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.2
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(yardstick)
## Warning: package 'yardstick' was built under R version 4.3.2
##
## Attaching package: 'yardstick'
##
## The following object is masked from 'package:readr':
##
## spec
library(dplyr)
library(purrr)
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ recipes 1.0.8
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ Matrix::expand() masks tidyr::expand()
## ✖ magrittr::extract() masks tidyr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ Matrix::pack() masks tidyr::pack()
## ✖ magrittr::set_names() masks purrr::set_names()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## ✖ Matrix::unpack() masks tidyr::unpack()
## ✖ recipes::update() masks Matrix::update(), stats::update()
## • Learn how to get started at https://www.tidymodels.org/start/
# Part 1: Feature engineering ----
# Load the Titanic passengers and store the response as a factor in one pass.
# Recorded run: 1043 rows x 4 columns (pclass, survived and sex read as chr;
# age read as dbl).
titanic <- read_csv("titanic.csv") %>%
  mutate(survived = factor(survived))
# 1. Share of passengers with survived == "Yes"
proportion_survived <- summarise(
  titanic,
  proportion = mean(survived == "Yes")
)

# 2. Explore potential relationships
# Survival rate within each sex
summarise(
  group_by(titanic, sex),
  SurvivalRate = mean(survived == "Yes")
)

# Survival rate within each passenger class
summarise(
  group_by(titanic, pclass),
  SurvivalRate = mean(survived == "Yes")
)

# Stacked proportion bars: survival split within each sex
ggplot(data = titanic, mapping = aes(x = sex, fill = survived)) +
  geom_bar(position = "fill") +
  labs(
    title = "Survival Rate by Gender",
    y = "Proportion",
    fill = "Survived"
  )

# Stacked proportion bars: survival split within each passenger class
ggplot(data = titanic, mapping = aes(x = factor(pclass), fill = survived)) +
  geom_bar(position = "fill") +
  labs(
    title = "Survival Rate by Passenger Class",
    y = "Proportion",
    fill = "Survived"
  )

# 3. Split the data
# Seeded 70/30 train/test split, stratified on the response.
set.seed(123)
split <- titanic %>%
  initial_split(prop = 0.7, strata = "survived")
titanic_train <- split %>% training()
titanic_test <- split %>% testing()

# Class balance of the response in each partition.
# Recorded run, training set: No 0.5925926 / Yes 0.4074074
prop.table(table(titanic_train[["survived"]]))
# Recorded run, test set: No 0.5923567 / Yes 0.4076433
prop.table(table(titanic_test[["survived"]]))
# 4. Estimate accuracy with 5-fold cross-validation
# Build the resampling scheme from the training data only.
set.seed(123)
kfold <- vfold_cv(titanic_train, v = 5)

# Fit a logistic regression on every fold. The folds in `kfold` already carry
# the training data, so fit_resamples() takes no `data` argument; passing one
# only triggered the "`...` are not used in this function" warning.
results <- logistic_reg() %>%
  fit_resamples(survived ~ ., resamples = kfold)

# Keep the accuracy row of the collected metrics
mean_accuracy <- results %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  summarise(mean_accuracy = mean(mean))

# Print the mean cross-validation accuracy rate
# Recorded run: 0.7887199
cat("Mean Cross Validation Accuracy Rate:", mean_accuracy$mean_accuracy)
# 5. Interpret coefficients
# Refit the logistic regression on the whole training set, then list its
# coefficient table.
final_fit <- fit(logistic_reg(), survived ~ ., data = titanic_train)
tidy(final_fit)
# 6. Create confusion matrix
# Predict classes for the test set, pair them with the observed outcome, and
# cross-tabulate predicted against true class.
conf_matrix <- conf_mat(
  bind_cols(
    predict(final_fit, titanic_test),
    select(titanic_test, survived)
  ),
  truth = survived,
  estimate = .pred_class
)

# Print the confusion matrix
# Recorded run:
#             Truth
#   Prediction  No Yes
#          No  151  44
#          Yes  35  84
print(conf_matrix)
# 7. Plot feature importance for the fitted logistic regression
vip::vip(final_fit)

# Part 2: Regularized Regression ----
# 1. Split the data
# Seed the RNG before the split below.
set.seed(123)

# Recorded run: 506 rows x 16 columns, all read as dbl.
boston <- read_csv("boston.csv")

# 70/30 train/test split, stratified on the outcome cmedv
split <- boston %>%
  initial_split(prop = 0.7, strata = cmedv)
boston_train <- split %>% training()
boston_test <- split %>% testing()
# 2. Create recipe
# Model cmedv as a function of every other column. The recipe is templated on
# the training set rather than the full data, and the Yeo-Johnson and
# normalization steps are restricted to the predictors: all_numeric() also
# selected the outcome, so cmedv itself was being transformed and rescaled.
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())
# Fit ridge models and perform cross-validation
# Previously `rmse_values` was only initialised to zeros and no model was ever
# fitted, so which.min() always returned the first penalty in each grid (the
# earlier recorded answers of 100 and 0.01 were artefacts of that). The RMSE
# is now estimated by 5-fold cross-validation for every candidate penalty.
set.seed(123)
boston_kfold <- vfold_cv(boston_train, v = 5)

# Cross-validated RMSE of a glmnet linear regression for one penalty value.
#   penalty: regularization strength passed to linear_reg()
#   mixture: 0 for ridge (the default here), 1 for lasso
# Returns the mean RMSE across the folds of `boston_kfold`.
cv_rmse <- function(penalty, mixture = 0) {
  candidate_mod <- linear_reg(penalty = penalty, mixture = mixture) %>%
    set_engine("glmnet")
  workflow() %>%
    add_recipe(boston_recipe) %>%
    add_model(candidate_mod) %>%
    fit_resamples(resamples = boston_kfold) %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    pull(mean)
}

# First grid of ridge penalties
penalty_values <- c(100, 10, 0.1)
rmse_values <- vapply(penalty_values, cv_rmse, numeric(1))
# Identify the penalty parameter value with the lowest RMSE
best_penalty <- penalty_values[which.min(rmse_values)]
# Print the results
cat("Penalty parameter value with the lowest RMSE:", best_penalty, "\n")

# Second grid of penalties
# NOTE(review): the original comment labels this grid as ridge too, but the
# final model below uses mixture = 1 (lasso). If this grid was meant for the
# lasso, pass `mixture = 1` to cv_rmse() here -- confirm against the assignment.
penalty_values <- c(0.01, 0.1, 0.5)
rmse_values <- vapply(penalty_values, cv_rmse, numeric(1))
# Identify the penalty parameter value with the lowest RMSE
best_penalty <- penalty_values[which.min(rmse_values)]
# Print the results
cat("Penalty parameter value with the lowest RMSE:", best_penalty, "\n")
# 5. Final model: glmnet linear regression with penalty = 0.1 and mixture = 1
best_mod <- set_engine(
  linear_reg(penalty = 0.1, mixture = 1),
  "glmnet"
)

# Bundle the preprocessing recipe and the model specification
best_mod_wf <- add_model(
  add_recipe(workflow(), boston_recipe),
  best_mod
)

# Fit the workflow on the training data
final_fit <- fit(best_mod_wf, data = boston_train)

# Feature importance from the extracted parsnip fit
vip::vip(extract_fit_parsnip(final_fit))

# Feature importance requested directly on the fitted workflow
vip::vip(final_fit)
