# Use the HTTPS CRAN mirror so packages are downloaded over an encrypted
# connection.
options(repos = c(CRAN = "https://cran.rstudio.com"))

# Packages this script installs before loading them.
required_packages <- c(
  "tidymodels", "tidyverse", "rsample", "vip",
  "readr", "glmnet", "yardstick", "dplyr"
)

# Install only what is missing. Reinstalling everything on each run is slow
# and can fail on Windows with "cannot remove prior installation ...
# Permission denied" (presumably because the package is already in use).
is_installed <- vapply(
  required_packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(rsample)
## Warning: package 'rsample' was built under R version 4.3.2
library(vip)
## Warning: package 'vip' was built under R version 4.3.2
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'readr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.3.2
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8
library(yardstick)
## Warning: package 'yardstick' was built under R version 4.3.2
## 
## Attaching package: 'yardstick'
## 
## The following object is masked from 'package:readr':
## 
##     spec
library(dplyr)
library(purrr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ recipes      1.0.8
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard()     masks purrr::discard()
## ✖ Matrix::expand()      masks tidyr::expand()
## ✖ magrittr::extract()   masks tidyr::extract()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ recipes::fixed()      masks stringr::fixed()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ Matrix::pack()        masks tidyr::pack()
## ✖ magrittr::set_names() masks purrr::set_names()
## ✖ yardstick::spec()     masks readr::spec()
## ✖ recipes::step()       masks stats::step()
## ✖ Matrix::unpack()      masks tidyr::unpack()
## ✖ recipes::update()     masks Matrix::update(), stats::update()
## • Learn how to get started at https://www.tidymodels.org/start/
# Part 1: Feature engineering

# Read the passenger data from the working directory
titanic <- read_csv("titanic.csv")
## Rows: 1043 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pclass, survived, sex
## dbl (1): age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Treat the response as a categorical variable
titanic <- titanic %>%
  mutate(survived = factor(survived))
# 1. Share of passengers with survived == "Yes" (stored, not printed)
proportion_survived <- summarise(titanic, proportion = mean(survived == "Yes"))
# 2. Explore potential relationships

# Survival rate within each sex
summarise(group_by(titanic, sex), SurvivalRate = mean(survived == "Yes"))
# Survival rate within each passenger class
summarise(group_by(titanic, pclass), SurvivalRate = mean(survived == "Yes"))
# Stacked proportion bars: survival outcome by sex
titanic %>%
  ggplot(aes(x = sex, fill = survived)) +
  geom_bar(position = "fill") +
  labs(fill = "Survived", y = "Proportion", title = "Survival Rate by Gender")

# Stacked proportion bars: survival outcome by passenger class
titanic %>%
  ggplot(aes(x = factor(pclass), fill = survived)) +
  geom_bar(position = "fill") +
  labs(
    fill = "Survived",
    y = "Proportion",
    title = "Survival Rate by Passenger Class"
  )

# 3. Split the data

# 70/30 train/test split stratified on the response; the seed makes the
# partition reproducible
set.seed(123)
split <- titanic %>%
  initial_split(prop = 0.7, strata = "survived")
titanic_train <- split %>% training()
titanic_test <- split %>% testing()

# Compare the class balance of the response in the two partitions
proportions(table(titanic_train$survived))
## 
##        No       Yes 
## 0.5925926 0.4074074
proportions(table(titanic_test$survived))
## 
##        No       Yes 
## 0.5923567 0.4076433
# 4. Estimate accuracy with 5-fold cross-validation on the training set

# Create the resampling procedure (seeded so the folds are reproducible)
set.seed(123)
kfold <- vfold_cv(titanic_train, v = 5)

# Fit a logistic regression on each fold. fit_resamples() draws its data from
# `resamples`, so no `data` argument is passed.
results <- logistic_reg() %>%
  fit_resamples(survived ~ ., resamples = kfold)

# Collect the accuracy averaged across folds; kept as a one-row tibble with a
# `mean_accuracy` column
mean_accuracy <- results %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  summarise(mean_accuracy = mean(mean))

# Print the mean cross-validation accuracy rate
cat("Mean Cross Validation Accuracy Rate:", mean_accuracy$mean_accuracy)
## Mean Cross Validation Accuracy Rate: 0.7887199
# 5. Interpret coefficients
# Refit the logistic regression on the whole training set, then list the
# estimated coefficients
final_fit <- fit(logistic_reg(), survived ~ ., data = titanic_train)
final_fit %>%
  tidy()
# 6. Create confusion matrix

# Pair the predicted classes for the test set with the observed outcome,
# then cross-tabulate prediction against truth
conf_matrix <- bind_cols(
  predict(final_fit, titanic_test),
  select(titanic_test, survived)
) %>%
  conf_mat(truth = survived, estimate = .pred_class)

# Show the confusion matrix
conf_matrix
##           Truth
## Prediction  No Yes
##        No  151  44
##        Yes  35  84
# 7. Plot feature importance for the model fitted on the training set
vip::vip(final_fit)

# Part 2: Regularized Regression
# 1. Read boston.csv and make a 70/30 train/test split stratified on cmedv
set.seed(123)
boston <- read_csv("boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
split <- boston %>%
  initial_split(prop = 0.7, strata = cmedv)
boston_train <- split %>% training()
boston_test <- split %>% testing()
# 2. Create recipe
# Model cmedv on all other columns. The transformations are limited to the
# numeric predictors so the outcome stays on its original scale, which keeps
# predictions and RMSE in the units of cmedv. The training set is used as the
# template so the held-out rows play no part in defining the recipe.
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())
# Folds shared by every candidate penalty so their RMSEs are comparable.
# NOTE(review): 5 folds mirrors the Titanic cross-validation in Part 1;
# confirm the intended number of folds.
set.seed(123)
boston_kfold <- vfold_cv(boston_train, v = 5)

# Cross-validated RMSE of a glmnet linear model for one penalty value.
#
# penalty:   regularization strength passed to linear_reg().
# mixture:   0 for ridge, 1 for lasso (passed to linear_reg()).
# rec:       recipe used to preprocess each resample.
# resamples: resampling object, e.g. from vfold_cv().
# Returns the RMSE averaged over the resamples as a single number.
cv_rmse <- function(penalty, mixture, rec, resamples) {
  model <- linear_reg(penalty = penalty, mixture = mixture) %>%
    set_engine("glmnet")

  workflow() %>%
    add_recipe(rec) %>%
    add_model(model) %>%
    fit_resamples(resamples = resamples, metrics = metric_set(rmse)) %>%
    collect_metrics() %>%
    filter(.metric == "rmse") %>%
    pull(mean)
}

# Fit ridge models and perform cross-validation
penalty_values <- c(100, 10, 0.1)
rmse_values <- vapply(
  penalty_values,
  cv_rmse,
  numeric(1),
  mixture = 0,
  rec = boston_recipe,
  resamples = boston_kfold
)

# Identify the penalty parameter value with the lowest RMSE
best_penalty <- penalty_values[which.min(rmse_values)]

# Print the results
cat("Penalty parameter value with the lowest RMSE:", best_penalty)

# Fit ridge models and perform cross-validation on a finer penalty grid
# NOTE(review): the final model further down uses mixture = 1 (lasso); if this
# second grid was meant to tune the lasso, change `mixture` to 1 here.
penalty_values <- c(0.01, 0.1, 0.5)
rmse_values <- vapply(
  penalty_values,
  cv_rmse,
  numeric(1),
  mixture = 0,
  rec = boston_recipe,
  resamples = boston_kfold
)

# Identify the penalty parameter value with the lowest RMSE
best_penalty <- penalty_values[which.min(rmse_values)]

# Print the results
cat("Penalty parameter value with the lowest RMSE:", best_penalty)
# 5. Fit the chosen model on the training set and plot feature importance

# Penalty of the final model; named once so the importance plot below reads
# the coefficients at this same value.
final_penalty <- 0.1

# Lasso (mixture = 1) linear regression fitted with glmnet
best_mod <- linear_reg(penalty = final_penalty, mixture = 1) %>%
  set_engine("glmnet")

# Bundle preprocessing and model so both are estimated together
best_mod_wf <- workflow() %>%
  add_recipe(boston_recipe) %>%
  add_model(best_mod)

final_fit <- best_mod_wf %>%
  fit(data = boston_train)

# A glmnet fit holds coefficients for a whole sequence of penalties. Per the
# vip documentation, omitting `lambda` does not select the model's own
# penalty, so it is passed explicitly.
final_fit %>%
  extract_fit_parsnip() %>%
  vip::vip(lambda = final_penalty)