This report summarizes a DataCamp lesson by Dima.
### List Column Workflow
gap_nested <- gapminder %>%
  group_by(country) %>%
  nest()
# Explore gap_nested
head(gap_nested)
## # A tibble: 6 × 2
## # Groups: country [6]
## country data
## <fct> <list>
## 1 Algeria <tibble [52 × 6]>
## 2 Argentina <tibble [52 × 6]>
## 3 Australia <tibble [52 × 6]>
## 4 Austria <tibble [52 × 6]>
## 5 Bangladesh <tibble [52 × 6]>
## 6 Belgium <tibble [52 × 6]>
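Each cell of the data column is an ordinary tibble, so a single country's rows can be pulled out by position (a quick sketch; output omitted):
# Extract the nested tibble for the first country (Algeria)
algeria_df <- gap_nested$data[[1]]
head(algeria_df)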
# Calculate the mean population for each country
pop_nested <- gap_nested %>%
mutate(mean_pop = map(data, ~mean(.x$population)))
# Take a look at pop_nested
head(pop_nested)
## # A tibble: 6 × 3
## # Groups: country [6]
## country data mean_pop
## <fct> <list> <list>
## 1 Algeria <tibble [52 × 6]> <dbl [1]>
## 2 Argentina <tibble [52 × 6]> <dbl [1]>
## 3 Australia <tibble [52 × 6]> <dbl [1]>
## 4 Austria <tibble [52 × 6]> <dbl [1]>
## 5 Bangladesh <tibble [52 × 6]> <dbl [1]>
## 6 Belgium <tibble [52 × 6]> <dbl [1]>
# Extract the mean_pop value by using unnest
pop_mean <- pop_nested %>%
unnest(mean_pop)
# Take a look at pop_mean
head(pop_mean)
## # A tibble: 6 × 3
## # Groups: country [6]
## country data mean_pop
## <fct> <list> <dbl>
## 1 Algeria <tibble [52 × 6]> 23129438.
## 2 Argentina <tibble [52 × 6]> 30783053.
## 3 Australia <tibble [52 × 6]> 16074837.
## 4 Austria <tibble [52 × 6]> 7746272.
## 5 Bangladesh <tibble [52 × 6]> 97649407.
## 6 Belgium <tibble [52 × 6]> 9983596.
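As an aside, purrr's map_dbl() returns a numeric vector directly, skipping the unnest() step; a minimal sketch of the same computation:
# Same result in one step: map_dbl() simplifies the result to doubles
pop_mean_direct <- gap_nested %>%
  mutate(mean_pop = map_dbl(data, ~mean(.x$population)))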
# Build a linear model for each country
gap_models <- gap_nested %>%
mutate(model = map(data, ~lm(formula = life_expectancy~year, data = .x)))
# Extract the model for Algeria
algeria_model <- gap_models$model[[1]]
# View the summary for the Algeria model
summary(algeria_model)
##
## Call:
## lm(formula = life_expectancy ~ year, data = .x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.044 -1.577 -0.543 1.700 3.843
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.197e+03 3.994e+01 -29.96 <2e-16 ***
## year 6.349e-01 2.011e-02 31.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.177 on 50 degrees of freedom
## Multiple R-squared: 0.9522, Adjusted R-squared: 0.9513
## F-statistic: 996.2 on 1 and 50 DF, p-value: < 2.2e-16
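The printed summary above is awkward to work with programmatically; the broom package returns the same information as tidy tibbles through the three verbs listed next. A minimal sketch of the first two on the Algeria fit (output omitted):
library(broom)
tidy(algeria_model)    # coefficient table as a tibble (one row per term)
glance(algeria_model)  # one-row model summary (r.squared, sigma, ...)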
* tidy(): returns the statistical findings of the model (such as coefficients)
* glance(): returns a concise one-row summary of the model
* augment(): adds prediction columns to the data being modeled
# Build the augmented data frame
algeria_fitted <- augment(algeria_model)
# Compare the predicted values with the actual values of life expectancy
algeria_fitted %>%
ggplot(aes(x = year)) +
geom_point(aes(y = life_expectancy)) +
geom_line(aes(y = .fitted), color = "red")
# Extract the coefficient statistics of each model into nested data frames
model_coef_nested <- gap_models %>%
mutate(coef = map(model, ~tidy(.x)))
# Simplify the coef data frames for each model
model_coef <- model_coef_nested %>%
unnest(coef)
# Plot a histogram of the coefficient estimates for year
model_coef %>%
filter(term == "year") %>%
ggplot(aes(x = estimate)) +
geom_histogram()
### Evaluating the fit of many models
# Extract the fit statistics of each model into data frames
model_perf_nested <- gap_models %>%
mutate(fit = map(model, ~glance(.x)))
# Simplify the fit data frames for each model
model_perf <- model_perf_nested %>%
unnest(fit)
# Look at the first six rows of model_perf
head(model_perf, 6)
## # A tibble: 6 × 15
## # Groups: country [6]
## country data model r.squared adj.r.squared sigma statistic p.value df
## <fct> <list> <lis> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Algeria <tibble> <lm> 0.952 0.951 2.18 996. 1.11e-34 1
## 2 Argenti… <tibble> <lm> 0.984 0.984 0.431 3137. 8.78e-47 1
## 3 Austral… <tibble> <lm> 0.983 0.983 0.511 2905. 5.83e-46 1
## 4 Austria <tibble> <lm> 0.987 0.986 0.438 3702. 1.48e-48 1
## 5 Banglad… <tibble> <lm> 0.949 0.947 1.83 921. 7.10e-34 1
## 6 Belgium <tibble> <lm> 0.990 0.990 0.331 5094. 5.54e-52 1
## # ℹ 6 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
## # df.residual <int>, nobs <int>
# Plot a histogram of rsquared for the 77 models
model_perf %>%
ggplot(aes(x = r.squared)) +
geom_histogram()
# Extract the 4 best fitting models
best_fit <- model_perf %>%
ungroup() %>%
slice_max(r.squared, n = 4)
# Extract the 4 models with the worst fit
worst_fit <- model_perf %>%
ungroup() %>%
slice_min(r.squared, n = 4)
best_augmented <- best_fit %>%
# Build the augmented data frame for each country model
mutate(augmented = map(model, ~ augment(.x))) %>%
# Expand the augmented data frames
unnest(augmented)
worst_augmented <- worst_fit %>%
# Build the augmented data frame for each country model
mutate(augmented = map(model, ~ augment(.x))) %>%
# Expand the augmented data frames
unnest(augmented)
head(best_augmented)
## # A tibble: 6 × 23
## country data model r.squared adj.r.squared sigma statistic p.value df
## <fct> <list> <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Italy <tibble> <lm> 0.997 0.997 0.226 15665. 4.14e-64 1
## 2 Italy <tibble> <lm> 0.997 0.997 0.226 15665. 4.14e-64 1
## 3 Italy <tibble> <lm> 0.997 0.997 0.226 15665. 4.14e-64 1
## 4 Italy <tibble> <lm> 0.997 0.997 0.226 15665. 4.14e-64 1
## 5 Italy <tibble> <lm> 0.997 0.997 0.226 15665. 4.14e-64 1
## 6 Italy <tibble> <lm> 0.997 0.997 0.226 15665. 4.14e-64 1
## # ℹ 14 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
## # df.residual <int>, nobs <int>, life_expectancy <dbl>, year <int>,
## # .fitted <dbl>, .resid <dbl>, .hat <dbl>, .sigma <dbl>, .cooksd <dbl>,
## # .std.resid <dbl>
best_augmented %>%
ggplot(aes(x = year)) +
geom_point(aes(y = life_expectancy)) +
geom_line(aes(y = .fitted), color = "red") +
facet_wrap(~country, scales = "free_y")
##### Worst fitting
worst_augmented %>%
ggplot(aes(x = year)) +
geom_point(aes(y = life_expectancy)) +
geom_line(aes(y = .fitted), color = "red") +
facet_wrap(~country, scales = "free_y")
#### Improve the fit of your models
# Build a linear model for each country using all features
gap_fullmodel <- gap_nested %>%
mutate(model = map(data, ~lm(formula = life_expectancy ~ ., data = .x)))
fullmodel_perf <- gap_fullmodel %>%
# Extract the fit statistics of each model into data frames
mutate(fit = map(model, ~glance(.x))) %>%
# Simplify the fit data frames for each model
unnest(fit)
# View the full-model performance for the four countries whose simple models fit worst
fullmodel_perf %>%
filter(country %in% worst_fit$country) %>%
select(country, adj.r.squared)
## # A tibble: 4 × 2
## # Groups: country [4]
## country adj.r.squared
## <fct> <dbl>
## 1 Botswana 0.844
## 2 Lesotho 0.908
## 3 Zambia 0.706
## 4 Zimbabwe 0.978
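To see the improvement side by side, the simple-model and full-model adjusted R-squared values can be joined on country (a sketch; simple_adj_r2 and full_adj_r2 are made-up column names):
# Compare the year-only models against the all-feature models
worst_fit %>%
  select(country, simple_adj_r2 = adj.r.squared) %>%
  inner_join(fullmodel_perf %>%
               select(country, full_adj_r2 = adj.r.squared),
             by = "country")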
gap_split <- initial_split(gapminder, prop = 0.75)
training_data <- training(gap_split)
testing_data <- testing(gap_split)
nrow(training_data)
## [1] 3003
nrow(testing_data)
## [1] 1001
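Note that initial_split() samples rows at random, so the split above is not reproducible as written; seeding the RNG first fixes that (a sketch repeating the same call):
set.seed(42)  # fix the RNG so the 75/25 split is reproducible
gap_split <- initial_split(gapminder, prop = 0.75)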
vfold_cv(data = ..., v = ...)
cv_split <- vfold_cv(training_data, v = 5)
cv_split
## # 5-fold cross-validation
## # A tibble: 5 × 2
## splits id
## <list> <chr>
## 1 <split [2402/601]> Fold1
## 2 <split [2402/601]> Fold2
## 3 <split [2402/601]> Fold3
## 4 <split [2403/600]> Fold4
## 5 <split [2403/600]> Fold5
cv_data <- cv_split %>%
mutate(train = map(splits, ~training(.x)),
validate = map(splits, ~testing(.x)))
head(cv_data)
## # 5-fold cross-validation
## # A tibble: 5 × 4
## splits id train validate
## <list> <chr> <list> <list>
## 1 <split [2402/601]> Fold1 <tibble [2,402 × 7]> <tibble [601 × 7]>
## 2 <split [2402/601]> Fold2 <tibble [2,402 × 7]> <tibble [601 × 7]>
## 3 <split [2402/601]> Fold3 <tibble [2,402 × 7]> <tibble [601 × 7]>
## 4 <split [2403/600]> Fold4 <tibble [2,403 × 7]> <tibble [600 × 7]>
## 5 <split [2403/600]> Fold5 <tibble [2,403 × 7]> <tibble [600 × 7]>
MAE (Mean Absolute Error), \(\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\lvert y_i - \hat{y}_i \rvert\), measures how far the model's predictions are from the actual values on average; smaller is better.
# Build a model using the train data for each fold of the cross validation
cv_models_lm <- cv_data %>%
mutate(model = map(train, ~lm(life_expectancy ~ ., data = .x)))
cv_prep_lm <- cv_models_lm %>%
mutate(
# Extract the recorded life expectancy for the records in the validate data frames
validate_actual = map(validate, ~.x$life_expectancy),
# Predict life expectancy for each validate set using its corresponding model
validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y))
)
cv_prep_lm
## # 5-fold cross-validation
## # A tibble: 5 × 7
## splits id train validate model validate_actual
## <list> <chr> <list> <list> <list> <list>
## 1 <split [2402/601]> Fold1 <tibble [2,402 × 7]> <tibble> <lm> <dbl [601]>
## 2 <split [2402/601]> Fold2 <tibble [2,402 × 7]> <tibble> <lm> <dbl [601]>
## 3 <split [2402/601]> Fold3 <tibble [2,402 × 7]> <tibble> <lm> <dbl [601]>
## 4 <split [2403/600]> Fold4 <tibble [2,403 × 7]> <tibble> <lm> <dbl [600]>
## 5 <split [2403/600]> Fold5 <tibble [2,403 × 7]> <tibble> <lm> <dbl [600]>
## # ℹ 1 more variable: validate_predicted <list>
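The map2() call above walks two list columns in parallel, pairing each model with its own validate set; a toy illustration of the pattern (hypothetical values):
# map2() applies a two-argument function elementwise over two lists
purrr::map2(list(1, 2), list(10, 20), ~ .x + .y)  # returns list(11, 22)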
# Calculate the mean absolute error for each validate fold
cv_eval_lm <- cv_prep_lm %>%
mutate(validate_mae = map2_dbl(validate_actual, validate_predicted, ~mae(actual = .x, predicted = .y)))
# Print the validate_mae column
cv_eval_lm$validate_mae
## [1] 1.544342 1.500521 1.505744 1.516214 1.546751
# Calculate the mean of validate_mae column
mean(cv_eval_lm$validate_mae)
## [1] 1.522714
The cv_eval_lm results show that the linear model's predictions are off by about 1.5 years on average. Can a more accurate model do better?
A random forest is an ensemble learning method that trains many decision trees and combines their predictions.
rf_model <- ranger::ranger(formula = …, data = …, seed = …, mtry = …, num.trees = …)
* seed: set so that results are reproducible
* mtry: ranges from 1 to the number of features (default: regression - number of features / 3, classification - \(\sqrt{\text{number of features}}\)). At each split, mtry variables are randomly selected and the best split among them is chosen; a smaller mtry gives a more random model (lower performance), while a larger mtry gives a more deterministic model (higher risk of overfitting).
* num.trees: ranges from 1 to \(\infty\) (default: 500)
prediction <- predict(rf_model, new_data)$predictions
library(ranger)
# Build a random forest model for each fold
cv_model_rf <- cv_data %>%
mutate(model = map(train, ~ ranger(formula = life_expectancy ~ .,
data = .x,
seed = 42,
num.trees = 100)))
# Generate predictions using the random forest model
cv_prep_rf <- cv_model_rf %>%
mutate(validate_predicted = map2(model, validate,
~ predict(.x, .y)$predictions),
validate_actual = map(validate, ~.x$life_expectancy))
# Calculate validate MAE for each fold
cv_eval_rf <- cv_prep_rf %>%
mutate(validate_mae = map2_dbl(validate_actual, validate_predicted, ~ mae(actual = .x, predicted = .y)))
# Print the validate_mae column
cv_eval_rf$validate_mae
## [1] 0.7884257 0.8071857 0.7737300 0.7586612 0.9464448
# Calculate the mean of validate_mae column
mean(cv_eval_rf$validate_mae)
## [1] 0.8148895
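Putting the two cross-validated errors side by side (values taken from the outputs above):
# Mean validate MAE: the random forest roughly halves the linear model's error
c(lm = mean(cv_eval_lm$validate_mae),  # ~1.52 years
  rf = mean(cv_eval_rf$validate_mae))  # ~0.81 years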
To tune mtry, use tidyr's crossing() to pair every fold with every candidate mtry value.
# Prepare for tuning your cross validation folds by varying mtry
cv_tune <- cv_data %>%
crossing(mtry = 2:5)
cv_tune
## # A tibble: 20 × 5
## splits id train validate mtry
## <list> <chr> <list> <list> <int>
## 1 <split [2402/601]> Fold1 <tibble [2,402 × 7]> <tibble [601 × 7]> 2
## 2 <split [2402/601]> Fold1 <tibble [2,402 × 7]> <tibble [601 × 7]> 3
## 3 <split [2402/601]> Fold1 <tibble [2,402 × 7]> <tibble [601 × 7]> 4
## 4 <split [2402/601]> Fold1 <tibble [2,402 × 7]> <tibble [601 × 7]> 5
## 5 <split [2402/601]> Fold2 <tibble [2,402 × 7]> <tibble [601 × 7]> 2
## 6 <split [2402/601]> Fold2 <tibble [2,402 × 7]> <tibble [601 × 7]> 3
## 7 <split [2402/601]> Fold2 <tibble [2,402 × 7]> <tibble [601 × 7]> 4
## 8 <split [2402/601]> Fold2 <tibble [2,402 × 7]> <tibble [601 × 7]> 5
## 9 <split [2402/601]> Fold3 <tibble [2,402 × 7]> <tibble [601 × 7]> 2
## 10 <split [2402/601]> Fold3 <tibble [2,402 × 7]> <tibble [601 × 7]> 3
## 11 <split [2402/601]> Fold3 <tibble [2,402 × 7]> <tibble [601 × 7]> 4
## 12 <split [2402/601]> Fold3 <tibble [2,402 × 7]> <tibble [601 × 7]> 5
## 13 <split [2403/600]> Fold4 <tibble [2,403 × 7]> <tibble [600 × 7]> 2
## 14 <split [2403/600]> Fold4 <tibble [2,403 × 7]> <tibble [600 × 7]> 3
## 15 <split [2403/600]> Fold4 <tibble [2,403 × 7]> <tibble [600 × 7]> 4
## 16 <split [2403/600]> Fold4 <tibble [2,403 × 7]> <tibble [600 × 7]> 5
## 17 <split [2403/600]> Fold5 <tibble [2,403 × 7]> <tibble [600 × 7]> 2
## 18 <split [2403/600]> Fold5 <tibble [2,403 × 7]> <tibble [600 × 7]> 3
## 19 <split [2403/600]> Fold5 <tibble [2,403 × 7]> <tibble [600 × 7]> 4
## 20 <split [2403/600]> Fold5 <tibble [2,403 × 7]> <tibble [600 × 7]> 5
# Build a model for each fold & mtry combination
cv_model_tunerf <- cv_tune %>%
mutate(model = map2(train, mtry, ~ ranger(formula = life_expectancy ~ .,
data = .x,
mtry = .y,
num.trees = 100)))
# Generate predictions using the random forest model
cv_prep_tunerf <- cv_model_tunerf %>%
mutate(validate_predicted = map2(model, validate,
~ predict(.x, .y)$predictions),
validate_actual = map(validate, ~.x$life_expectancy))
# Calculate validate MAE for each fold
cv_eval_tunerf <- cv_prep_tunerf %>%
mutate(validate_mae = map2_dbl(validate_actual, validate_predicted, ~ mae(actual = .x, predicted = .y)))
# Calculate the mean validate_mae for each mtry used
cv_eval_tunerf %>%
group_by(mtry) %>%
summarise(mean_mae = mean(validate_mae))
## # A tibble: 4 × 2
## mtry mean_mae
## <int> <dbl>
## 1 2 0.821
## 2 3 0.808
## 3 4 0.807
## 4 5 0.805
The mean MAE barely changes across these mtry values (0.805-0.821); we take ranger with mtry = 4 and num.trees = 100 as the best model and refit it on all the training data.
best_model <- ranger(formula = life_expectancy ~ .,
data = training_data,
mtry = 4,
num.trees = 100,
seed = 42)
test_actual <- testing_data$life_expectancy
test_predict <- predict(best_model, testing_data)$predictions
mae(test_actual, test_predict)
## [1] 0.6326096
For binary classification (predicting Attrition in the attrition data), fit a logistic regression with glm():
glm(formula = …, data = …, family = "binomial")
set.seed(42)
# Prepare the initial split object
data_split <- initial_split(attrition, prop = 0.75)
# Extract the training data frame
training_data <- training(data_split)
# Extract the testing data frame
testing_data <- testing(data_split)
cv_split <- vfold_cv(training_data, v = 5)
cv_data <- cv_split %>%
mutate(
# Extract the train data frame for each split
train = map(splits, ~training(.x)),
# Extract the validate data frame for each split
validate = map(splits, ~testing(.x))
)
cv_data
## # 5-fold cross-validation
## # A tibble: 5 × 4
## splits id train validate
## <list> <chr> <list> <list>
## 1 <split [881/221]> Fold1 <df [881 × 31]> <df [221 × 31]>
## 2 <split [881/221]> Fold2 <df [881 × 31]> <df [221 × 31]>
## 3 <split [882/220]> Fold3 <df [882 × 31]> <df [220 × 31]>
## 4 <split [882/220]> Fold4 <df [882 × 31]> <df [220 × 31]>
## 5 <split [882/220]> Fold5 <df [882 × 31]> <df [220 × 31]>
cv_models_lr <- cv_data %>%
mutate(model = map(train, ~ glm(formula = Attrition ~ .,
data = .x,
family = "binomial")))
cv_models_lr
## # 5-fold cross-validation
## # A tibble: 5 × 5
## splits id train validate model
## <list> <chr> <list> <list> <list>
## 1 <split [881/221]> Fold1 <df [881 × 31]> <df [221 × 31]> <glm>
## 2 <split [881/221]> Fold2 <df [881 × 31]> <df [221 × 31]> <glm>
## 3 <split [882/220]> Fold3 <df [882 × 31]> <df [220 × 31]> <glm>
## 4 <split [882/220]> Fold4 <df [882 × 31]> <df [220 × 31]> <glm>
## 5 <split [882/220]> Fold5 <df [882 × 31]> <df [220 × 31]> <glm>
Because the Attrition values are the strings "Yes" and "No", they must be converted to TRUE and FALSE:
validate_actual <- validate$Attrition == "Yes"
Since predict() returns a vector of probabilities, it must be converted to a binary vector:
validate_prob <- predict(model, validate, type = "response")
validate_predicted <- validate_prob > 0.5
* table(validate_actual, validate_predicted): compare the actual and predicted classes in a confusion table
* accuracy(validate_actual, validate_predicted): how well your model predicted both the TRUE and FALSE classes
* precision(validate_actual, validate_predicted): how often the model is correct at predicting the TRUE class (of the observations predicted TRUE, how many are actually TRUE?)
* recall(validate_actual, validate_predicted): compares the number of observations the model has correctly identified as TRUE to the total number of actual TRUE observations (of the actual TRUE observations, how many did the model predict as TRUE?)
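A toy example of these metrics on hand-made vectors, assuming they come from the Metrics package (the source of mae() used earlier):
library(Metrics)
actual    <- c(TRUE, TRUE, FALSE, FALSE, TRUE)
predicted <- c(TRUE, FALSE, FALSE, TRUE, TRUE)
table(actual, predicted)     # 2 TP, 1 TN, 1 FP, 1 FN
accuracy(actual, predicted)  # (2 + 1) / 5 = 0.6
precision(actual, predicted) # 2 / (2 + 1) = 0.667 (TP / predicted TRUE)
recall(actual, predicted)    # 2 / (2 + 1) = 0.667 (TP / actual TRUE)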
cv_prep_lr <- cv_models_lr %>%
mutate(
# Prepare binary vector of actual Attrition values in validate
validate_actual = map(validate, ~.x$Attrition == "Yes"),
# Prepare binary vector of predicted Attrition values for validate
validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y, type = "response") > 0.5)
)
cv_prep_lr
## # 5-fold cross-validation
## # A tibble: 5 × 7
## splits id train validate model validate_actual
## <list> <chr> <list> <list> <list> <list>
## 1 <split [881/221]> Fold1 <df [881 × 31]> <df [221 × 31]> <glm> <lgl [221]>
## 2 <split [881/221]> Fold2 <df [881 × 31]> <df [221 × 31]> <glm> <lgl [221]>
## 3 <split [882/220]> Fold3 <df [882 × 31]> <df [220 × 31]> <glm> <lgl [220]>
## 4 <split [882/220]> Fold4 <df [882 × 31]> <df [220 × 31]> <glm> <lgl [220]>
## 5 <split [882/220]> Fold5 <df [882 × 31]> <df [220 × 31]> <glm> <lgl [220]>
## # ℹ 1 more variable: validate_predicted <list>
# Calculate the validate recall for each cross validation fold
cv_perf_recall <- cv_prep_lr %>%
mutate(validate_recall = map2_dbl(.x = validate_actual,
.y = validate_predicted,
.f = ~ recall(actual = .x, predicted = .y)))
# Print the validate_recall column
cv_perf_recall$validate_recall
## [1] 0.3157895 0.2727273 0.4651163 0.4705882 0.5172414
# Calculate the average of the validate_recall column
mean(cv_perf_recall$validate_recall)
## [1] 0.4082925
library(ranger)
# Prepare for tuning your cross validation folds by varying mtry
cv_tune <- cv_data %>%
crossing(mtry = c(2, 4, 8, 16))
# Build a cross validation model for each fold & mtry combination
cv_models_rf <- cv_tune %>%
mutate(model = map2(train, mtry, ~ranger(formula = Attrition~.,
data = .x, mtry = .y,
num.trees = 100, seed = 42)))
cv_prep_rf <- cv_models_rf %>%
mutate(
# Prepare binary vector of actual Attrition values in validate
validate_actual = map(validate, ~.x$Attrition == "Yes"),
# Prepare binary vector of predicted Attrition values for validate
validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y, type = "response")$predictions == "Yes")
)
# Calculate the validate recall for each cross validation fold
cv_perf_recall <- cv_prep_rf %>%
mutate(recall = map2_dbl(.x = validate_actual, .y = validate_predicted, ~recall(actual = .x, predicted = .y)))
# Calculate the mean recall for each mtry used
cv_perf_recall %>%
group_by(mtry) %>%
summarise(mean_recall = mean(recall))
## # A tibble: 4 × 2
## mtry mean_recall
## <dbl> <dbl>
## 1 2 0.0818
## 2 4 0.137
## 3 8 0.155
## 4 16 0.177
With a recall of roughly 0.41, the logistic regression model clearly outperforms the best random forest (0.177), so we use logistic regression as the final model.
# Build the logistic regression model using all training data
best_model <- glm(formula = Attrition ~ .,
data = training_data, family = "binomial")
# Prepare binary vector of actual Attrition values for testing_data
test_actual <- testing_data$Attrition == "Yes"
# Prepare binary vector of predicted Attrition values for testing_data
test_predicted <- predict(best_model, testing_data, type = "response") > 0.5
# Compare the actual & predicted performance visually using a table
table(test_actual, test_predicted)
## test_predicted
## test_actual FALSE TRUE
## FALSE 301 7
## TRUE 33 27
# Calculate the test accuracy
accuracy(test_actual, test_predicted)
## [1] 0.8913043
# Calculate the test precision
precision(test_actual, test_predicted)
## [1] 0.7941176
# Calculate the test recall
recall(test_actual, test_predicted)
## [1] 0.45
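As a check, all three test metrics can be read straight off the confusion table above:
(301 + 27) / (301 + 7 + 33 + 27)  # accuracy  = 0.891 (correct / total)
27 / (7 + 27)                     # precision = 0.794 (within predicted TRUE)
27 / (33 + 27)                    # recall    = 0.45  (within actual TRUE)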