# Helper packages
library(tidyverse)
# Modeling packages
library(tidymodels)
# Model interpretability packages
library(vip)
library(readr)
titanic <- read_csv("data/titanic.csv")
titanic <- mutate(titanic, survived = factor(survived))
titanic
## # A tibble: 1,043 × 4
## pclass survived sex age
## <chr> <fct> <chr> <dbl>
## 1 1st Yes female 29
## 2 1st Yes male 0.917
## 3 1st No female 2
## 4 1st No male 30
## 5 1st No female 25
## 6 1st Yes male 48
## 7 1st Yes female 63
## 8 1st No male 39
## 9 1st Yes female 53
## 10 1st No male 71
## # … with 1,033 more rows
#1. Compute the proportion of survived = Yes vs. survived = No in this data set. In other words, across all observations, what is the percentage of passengers that survived compared to those that did not?
table(titanic$survived)
##
## No Yes
## 618 425
618 + 425
## [1] 1043
618 / 1043
## [1] 0.5925216
425 / 1043
## [1] 0.4074784
survived 41% didn’t survive 59%
#2. Explore potential relationships between the predictor variables and the dependent variable. What patterns do you observe? Do any of the predictor variables appear to have an association with the dependent variable?
set.seed(123)
titanic_split <- initial_split(titanic, prop = .7, strata = "survived")
titanic_train <- training(titanic_split)
titanic_test <- testing(titanic_split)
lr_fit1 <- logistic_reg() %>%
fit(survived ~ sex, data = titanic_train)
tidy(lr_fit1)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1.18 0.142 8.30 1.04e-16
## 2 sexmale -2.63 0.186 -14.2 1.71e-45
lr_fit2 <- logistic_reg() %>%
fit(survived ~ pclass, data = titanic_train)
tidy(lr_fit2)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.589 0.148 3.98 6.81e- 5
## 2 pclass2nd -0.904 0.208 -4.35 1.37e- 5
## 3 pclass3rd -1.63 0.193 -8.46 2.62e-17
lr_fit3 <- logistic_reg() %>%
fit(survived ~ age, data = titanic_train)
tidy(lr_fit3)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.185 0.173 -1.07 0.285
## 2 age -0.00628 0.00521 -1.21 0.227
One thing that I noticed is that 1st class tend to have survived more than 2nd & 3rd
#3. Using stratified sampling split the data into a training set and a test set. Use a 70/30 split (70% for the training data & 30% for the test set). Set the seed to 123 so that we all get the same split. Assess the distribution of the dependent variable – Is there a similar distribution of the dependent variable between the training and test sets?
set.seed(123)
titanic_split <- initial_split(titanic, prop = .7, strata = "survived")
titanic_train <- training(titanic_split)
titanic_test <- testing(titanic_split)
#4. Fill in the blanks to apply a logistic regression model using all predictor variables while performing a 5-fold cross validation procedure. What is the mean cross validation accuracy rate?
set.seed(123)
kfold <- vfold_cv(titanic_train, v = 5)
results <- logistic_reg() %>%
fit_resamples(survived ~ ., kfold)
collect_metrics(results)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.789 5 0.0111 Preprocessor1_Model1
## 2 roc_auc binary 0.851 5 0.0121 Preprocessor1_Model1
#5. Execute the code below and assess the coefficients of the model. How do you interpret these coefficients? How do we interpret these coefficients? Since the coefficient for age is negative we can interpret it as the probability of surviving decreases as age increases. In fact, we can state that for every year older a passenger is, the log odds decreases by -0.033553. Or equivalently, it multiplies the odds by e−0.033553
final_fit <- logistic_reg() %>%
fit(survived ~ ., data = titanic_train)
tidy(final_fit)
## # A tibble: 5 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.67 0.406 9.03 1.68e-19
## 2 pclass2nd -1.43 0.275 -5.20 1.99e- 7
## 3 pclass3rd -2.41 0.284 -8.49 2.03e-17
## 4 sexmale -2.69 0.205 -13.1 2.33e-39
## 5 age -0.0336 0.00775 -4.33 1.51e- 5
I would assume that the negative numbers represent an inverse relations ship with survival. Example, for sex females have a much higher survival rate.
#6. Create a confusion matrix with the test data. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression in this case. Does it experience more false negatives or false positives?
final_fit %>%
predict(titanic_test) %>%
bind_cols(titanic_test %>% select(survived)) %>%
conf_mat(truth = survived, estimate = .pred_class)
## Truth
## Prediction No Yes
## No 151 44
## Yes 35 84
more false negatives
#7. Plot the feature importance for the three predictor variables. Which predictor variable is most influential?
vip(final_fit$fit, num_features = 20)
The sex is the most influential
##Part 2
boston <- read_csv("data/boston.csv")
#1. Split the data into a training set and test set using a 70-30% split. Be sure to include the set.seed(123) so that your train and test sets are the same as mine.
set.seed(123) # for reproducibility
split <- initial_split(boston, prop = 0.7, strata = "cmedv")
boston_train <- training(split)
boston_test <- testing(split)
#2. Create a recipe that will model cmedv as a function of all predictor variables. Also, be sure to normalize all numeric predictor variables using a Yeo-Johnson transformation and then standardize all numeric predictor variables.
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
#3. Fit three ridge models, one each with the below penalty (λ) values and use a 5-fold cross-validation procedure to assess the generalization RMSE. Which penalty parameter value resulted in the lowest RMSE?
# Step 1: create ridge model object
ridge_mod <- linear_reg(mixture = 0, penalty = .1) %>%
set_engine("glmnet")
# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
ridge_wf <- workflow() %>%
add_recipe(model_recipe) %>%
add_model(ridge_mod) %>%
fit_resamples(kfolds) %>%
collect_metrics()
ridge_wf
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 4.78 5 0.452 Preprocessor1_Model1
## 2 rsq standard 0.729 5 0.0388 Preprocessor1_Model1
# Step 1: create ridge model object
ridge_mod1 <- linear_reg(mixture = 0, penalty = 10) %>%
set_engine("glmnet")
# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
ridge_wf1 <- workflow() %>%
add_recipe(model_recipe) %>%
add_model(ridge_mod1) %>%
fit_resamples(kfolds) %>%
collect_metrics()
ridge_wf1
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 5.42 5 0.712 Preprocessor1_Model1
## 2 rsq standard 0.695 5 0.0446 Preprocessor1_Model1
# Step 1: create ridge model object
ridge_mod2 <- linear_reg(mixture = 0, penalty = 100) %>%
set_engine("glmnet")
# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
ridge_wf2 <- workflow() %>%
add_recipe(model_recipe) %>%
add_model(ridge_mod2) %>%
fit_resamples(kfolds) %>%
collect_metrics()
ridge_wf2
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 7.66 5 0.317 Preprocessor1_Model1
## 2 rsq standard 0.578 5 0.0580 Preprocessor1_Model1
Penalty 0f 0.1 has the lowest rmse with 4.9499/4.73111
#4. Fit three Lasso models, one each with the below penalty (λ) values and use a 5-fold cross-validation procedure to compute the generalization RMSE. Which penalty parameter value resulted in the lowest RMSE?
# Step 1: create ridge model object
lasso_mod <- linear_reg(mixture = 1, penalty = .01) %>%
set_engine("glmnet")
# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
lasso_wf <- workflow() %>%
add_recipe(model_recipe) %>%
add_model(lasso_mod) %>%
fit_resamples(kfolds) %>%
collect_metrics()
lasso_wf
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 4.73 5 0.559 Preprocessor1_Model1
## 2 rsq standard 0.733 5 0.0537 Preprocessor1_Model1
# Step 1: create ridge model object
lasso_mod1 <- linear_reg(mixture = 1, penalty = .1) %>%
set_engine("glmnet")
# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
lasso_wf1 <- workflow() %>%
add_recipe(model_recipe) %>%
add_model(lasso_mod1) %>%
fit_resamples(kfolds) %>%
collect_metrics()
lasso_wf1
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 4.94 5 0.366 Preprocessor1_Model1
## 2 rsq standard 0.708 5 0.0461 Preprocessor1_Model1
# Step 1: create ridge model object
lasso_mod2 <- linear_reg(mixture = 1, penalty = 0.5) %>%
set_engine("glmnet")
# Step 2: create model & preprocessing recipe
model_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Step 3: fit model workflow
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
lasso_wf2 <- workflow() %>%
add_recipe(model_recipe) %>%
add_model(lasso_mod2) %>%
fit_resamples(kfolds) %>%
collect_metrics()
lasso_wf2
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 5.09 5 0.239 Preprocessor1_Model1
## 2 rsq standard 0.699 5 0.0175 Preprocessor1_Model1
#5. Pick the Ridge or Lasso model from above that had the lowest generalization RMSE. Fit a final model across all the training data, extract that final fit from the workflow object, and then use the vip package to visualize the variable importance scores for all 15 features. Which feature is most influential in this final model?
best_mod <- linear_reg(penalty = 0.01, mixture = 1) %>%
set_engine("glmnet")
best_mod_wf <- workflow() %>%
add_recipe(model_recipe) %>%
add_model(best_mod)
final_fit <- best_mod_wf %>%
fit(data = boston_train)
final_fit %>%
extract_fit_parsnip %>%
vip(num_features = 20)
lstat is the most influential