Prereqs

# Helper packages
library(tidyverse)

# Modeling packages
library(tidymodels)

# Model interpretability packages
library(vip)
library(readr)
titanic <- read_csv("data/titanic.csv")

titanic <- mutate(titanic, survived = factor(survived))
titanic
## # A tibble: 1,043 × 4
##    pclass survived sex       age
##    <chr>  <fct>    <chr>   <dbl>
##  1 1st    Yes      female 29    
##  2 1st    Yes      male    0.917
##  3 1st    No       female  2    
##  4 1st    No       male   30    
##  5 1st    No       female 25    
##  6 1st    Yes      male   48    
##  7 1st    Yes      female 63    
##  8 1st    No       male   39    
##  9 1st    Yes      female 53    
## 10 1st    No       male   71    
## # … with 1,033 more rows

#1. Compute the proportion of survived = Yes vs. survived = No in this data set. In other words, across all observations, what is the percentage of passengers that survived compared to those that did not?


table(titanic$survived)
## 
##  No Yes 
## 618 425

618 + 425
## [1] 1043

618 / 1043
## [1] 0.5925216
425 / 1043
## [1] 0.4074784

survived 41% didn’t survive 59%

#2. Explore potential relationships between the predictor variables and the dependent variable. What patterns do you observe? Do any of the predictor variables appear to have an association with the dependent variable?

  set.seed(123) 
titanic_split <- initial_split(titanic, prop = .7, strata = "survived")
titanic_train <- training(titanic_split)
titanic_test  <- testing(titanic_split)

lr_fit1 <- logistic_reg() %>%
  fit(survived ~ sex, data = titanic_train)

tidy(lr_fit1)
## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     1.18     0.142      8.30 1.04e-16
## 2 sexmale        -2.63     0.186    -14.2  1.71e-45

lr_fit2 <- logistic_reg() %>%
  fit(survived ~ pclass, data = titanic_train)

tidy(lr_fit2)
## # A tibble: 3 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    0.589     0.148      3.98 6.81e- 5
## 2 pclass2nd     -0.904     0.208     -4.35 1.37e- 5
## 3 pclass3rd     -1.63      0.193     -8.46 2.62e-17

lr_fit3 <- logistic_reg() %>%
  fit(survived ~ age, data = titanic_train)

tidy(lr_fit3)
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept) -0.185     0.173       -1.07   0.285
## 2 age         -0.00628   0.00521     -1.21   0.227

One thing that I noticed is that 1st class tend to have survived more than 2nd & 3rd

#3. Using stratified sampling split the data into a training set and a test set. Use a 70/30 split (70% for the training data & 30% for the test set). Set the seed to 123 so that we all get the same split. Assess the distribution of the dependent variable – Is there a similar distribution of the dependent variable between the training and test sets?


set.seed(123)  
titanic_split <- initial_split(titanic, prop = .7, strata = "survived")
titanic_train <- training(titanic_split)
titanic_test  <- testing(titanic_split)

#4. Fill in the blanks to apply a logistic regression model using all predictor variables while performing a 5-fold cross validation procedure. What is the mean cross validation accuracy rate?


set.seed(123)
kfold <- vfold_cv(titanic_train, v = 5)

results <- logistic_reg() %>%
  fit_resamples(survived ~ ., kfold)

collect_metrics(results)
## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.789     5  0.0111 Preprocessor1_Model1
## 2 roc_auc  binary     0.851     5  0.0121 Preprocessor1_Model1

#5. Execute the code below and assess the coefficients of the model. How do you interpret these coefficients? How do we interpret these coefficients? Since the coefficient for age is negative we can interpret it as the probability of surviving decreases as age increases. In fact, we can state that for every year older a passenger is, the log odds decreases by -0.033553. Or equivalently, it multiplies the odds by e−0.033553


final_fit <- logistic_reg() %>%
fit(survived ~ ., data = titanic_train)
tidy(final_fit)
## # A tibble: 5 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   3.67     0.406        9.03 1.68e-19
## 2 pclass2nd    -1.43     0.275       -5.20 1.99e- 7
## 3 pclass3rd    -2.41     0.284       -8.49 2.03e-17
## 4 sexmale      -2.69     0.205      -13.1  2.33e-39
## 5 age          -0.0336   0.00775     -4.33 1.51e- 5

I would assume that the negative numbers represent an inverse relations ship with survival. Example, for sex females have a much higher survival rate.

#6. Create a confusion matrix with the test data. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression in this case. Does it experience more false negatives or false positives?

final_fit %>%
predict(titanic_test) %>%
bind_cols(titanic_test %>% select(survived)) %>%
conf_mat(truth = survived, estimate = .pred_class)
##           Truth
## Prediction  No Yes
##        No  151  44
##        Yes  35  84

more false negatives

#7. Plot the feature importance for the three predictor variables. Which predictor variable is most influential?


vip(final_fit$fit, num_features = 20)

The sex is the most influential

##Part 2

boston <- read_csv("data/boston.csv")

#1. Split the data into a training set and test set using a 70-30% split. Be sure to include the set.seed(123) so that your train and test sets are the same as mine.

set.seed(123)  # for reproducibility
split <- initial_split(boston, prop = 0.7, strata = "cmedv")
boston_train  <- training(split)
boston_test   <- testing(split)

#2. Create a recipe that will model cmedv as a function of all predictor variables. Also, be sure to normalize all numeric predictor variables using a Yeo-Johnson transformation and then standardize all numeric predictor variables.

model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

#3. Fit three ridge models, one each with the below penalty (λ) values and use a 5-fold cross-validation procedure to assess the generalization RMSE. Which penalty parameter value resulted in the lowest RMSE?

# Step 1: create ridge model object
ridge_mod <- linear_reg(mixture = 0, penalty = .1) %>%
  set_engine("glmnet")

# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

ridge_wf <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(ridge_mod) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

ridge_wf
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   4.78      5  0.452  Preprocessor1_Model1
## 2 rsq     standard   0.729     5  0.0388 Preprocessor1_Model1

# Step 1: create ridge model object
ridge_mod1 <- linear_reg(mixture = 0, penalty = 10) %>%
  set_engine("glmnet")

# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

ridge_wf1 <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(ridge_mod1) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

ridge_wf1
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   5.42      5  0.712  Preprocessor1_Model1
## 2 rsq     standard   0.695     5  0.0446 Preprocessor1_Model1
# Step 1: create ridge model object
ridge_mod2 <- linear_reg(mixture = 0, penalty = 100) %>%
  set_engine("glmnet")

# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

ridge_wf2 <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(ridge_mod2) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

ridge_wf2
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   7.66      5  0.317  Preprocessor1_Model1
## 2 rsq     standard   0.578     5  0.0580 Preprocessor1_Model1

Penalty 0f 0.1 has the lowest rmse with 4.9499/4.73111

#4. Fit three Lasso models, one each with the below penalty (λ) values and use a 5-fold cross-validation procedure to compute the generalization RMSE. Which penalty parameter value resulted in the lowest RMSE?

# Step 1: create ridge model object
lasso_mod <- linear_reg(mixture = 1, penalty = .01) %>%
  set_engine("glmnet")

# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

lasso_wf <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(lasso_mod) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

lasso_wf
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   4.73      5  0.559  Preprocessor1_Model1
## 2 rsq     standard   0.733     5  0.0537 Preprocessor1_Model1

# Step 1: create ridge model object
lasso_mod1 <- linear_reg(mixture = 1, penalty = .1) %>%
  set_engine("glmnet")

# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

lasso_wf1 <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(lasso_mod1) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

lasso_wf1
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   4.94      5  0.366  Preprocessor1_Model1
## 2 rsq     standard   0.708     5  0.0461 Preprocessor1_Model1
# Step 1: create ridge model object
lasso_mod2 <- linear_reg(mixture = 1, penalty = 0.5) %>%
  set_engine("glmnet")

# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)

lasso_wf2 <- workflow() %>%
   add_recipe(model_recipe) %>%
   add_model(lasso_mod2) %>%
   fit_resamples(kfolds) %>%
   collect_metrics()

lasso_wf2
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   5.09      5  0.239  Preprocessor1_Model1
## 2 rsq     standard   0.699     5  0.0175 Preprocessor1_Model1

#5. Pick the Ridge or Lasso model from above that had the lowest generalization RMSE. Fit a final model across all the training data, extract that final fit from the workflow object, and then use the vip package to visualize the variable importance scores for all 15 features. Which feature is most influential in this final model?

best_mod <- linear_reg(penalty = 0.01, mixture = 1) %>%
set_engine("glmnet")

best_mod_wf <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(best_mod)

final_fit <- best_mod_wf %>%
  fit(data = boston_train)

final_fit %>%
  extract_fit_parsnip %>%
  vip(num_features = 20)

lstat is the most influential