library(readr)
library(janitor)
library(tidyverse)
library(tidymodels)
library(skimr)

STEP 1: LOAD AND PREPROCESS DATA

1.1 loading data

test <- read_csv("test.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   PassengerId = col_double(),
##   Pclass = col_double(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_double(),
##   SibSp = col_double(),
##   Parch = col_double(),
##   Ticket = col_character(),
##   Fare = col_double(),
##   Cabin = col_character(),
##   Embarked = col_character()
## )
train <- read_csv("train.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   PassengerId = col_double(),
##   Survived = col_double(),
##   Pclass = col_double(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_double(),
##   SibSp = col_double(),
##   Parch = col_double(),
##   Ticket = col_character(),
##   Fare = col_double(),
##   Cabin = col_character(),
##   Embarked = col_character()
## )

1.2 preprocess training and tesing data

full_dat <- (bind_rows(train, test)) %>% 
  clean_names() %>%
  mutate(index = case_when(is.na(survived)=="FALSE"~"train",
                           TRUE ~ "test"),
         survived = factor(survived,
                           levels = c("0", "1"),
                           labels = c("0", "1")),
         pclass = factor(pclass,
                         levels = c("1", "2", "3"),
                         labels = c("1st", "2nd", "3rd")),
         embarked = as.factor(embarked),
         sex = as.factor(sex))

preprocess_fulldat <- recipe(~., data = full_dat) %>%
  step_knnimpute(embarked) %>%
  step_meanimpute(fare, age) %>%
  prep()

fulldat2<- (bake(preprocess_fulldat, new_data = NULL))
  
train2 <- fulldat2[fulldat2$index=="train",]
test2 <- fulldat2[fulldat2$index=="test",]

explore survival status by other factors

skim(train2)
Data summary
Name train2
Number of rows 891
Number of columns 13
_______________________
Column type frequency:
factor 8
numeric 5
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
survived 0 1.00 FALSE 2 0: 549, 1: 342
pclass 0 1.00 FALSE 3 3rd: 491, 1st: 216, 2nd: 184
name 0 1.00 FALSE 891 Abb: 1, Abb: 1, Abb: 1, Abe: 1
sex 0 1.00 FALSE 2 mal: 577, fem: 314
ticket 0 1.00 FALSE 681 160: 7, 347: 7, CA.: 7, 310: 6
cabin 687 0.23 FALSE 147 B96: 4, C23: 4, G6: 4, C22: 3
embarked 0 1.00 FALSE 3 S: 644, C: 170, Q: 77
index 0 1.00 FALSE 1 tra: 891, tes: 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
passenger_id 0 1 446.00 257.35 1.00 223.50 446.00 668.5 891.00 ▇▇▇▇▇
age 0 1 29.74 13.00 0.42 22.00 29.88 35.0 80.00 ▂▇▃▁▁
sib_sp 0 1 0.52 1.10 0.00 0.00 0.00 1.0 8.00 ▇▁▁▁▁
parch 0 1 0.38 0.81 0.00 0.00 0.00 0.0 6.00 ▇▁▁▁▁
fare 0 1 32.20 49.69 0.00 7.91 14.45 31.0 512.33 ▇▁▁▁▁
train2 %>% ggplot(aes(survived, fare, fill = sex)) +
  geom_boxplot() +
  scale_y_log10()

train2 %>% ggplot(aes(survived, age, fill = sex)) +
  geom_boxplot()

train2 %>% ggplot(aes(survived, age, fill= survived)) +
  geom_boxplot() +
  facet_wrap(~embarked)

train2 %>% 
  group_by(survived, sex) %>%
  summarise(mean_age = mean(age, na.rm = T)) %>%
  pivot_wider(names_from = sex,
              values_from = mean_age)
## # A tibble: 2 x 3
## # Groups:   survived [2]
##   survived female  male
##   <fct>     <dbl> <dbl>
## 1 0          26.1  31.2
## 2 1          29.0  27.7
train2 %>% 
  group_by(survived, sex) %>%
  summarise(mean_fare = mean(fare, na.rm = T)) %>%
  pivot_wider(names_from = sex,
              values_from = mean_fare)
## # A tibble: 2 x 3
## # Groups:   survived [2]
##   survived female  male
##   <fct>     <dbl> <dbl>
## 1 0          23.0  22.0
## 2 1          51.9  40.8
train2 %>% 
  group_by(survived, sex) %>%
  summarise(mean_parch = mean(parch, na.rm = T)) %>%
  pivot_wider(names_from = sex,
              values_from = mean_parch)
## # A tibble: 2 x 3
## # Groups:   survived [2]
##   survived female  male
##   <fct>     <dbl> <dbl>
## 1 0         1.04  0.207
## 2 1         0.515 0.358

#STEP 2: BUILDING MODEL

2.1 spliting dat

set.seed(111)
ttn_split <- initial_split(train2, strata = survived)
ttn_training <- training(ttn_split)
ttn_tesing <- testing(ttn_split)

2.2 resampling

set.seed(123)
ttn_resampling <- bootstraps(ttn_training, strata = survived)

##2.3 buid preprocessing and function

library(themis) # to downsample
## Registered S3 methods overwritten by 'themis':
##   method                  from   
##   bake.step_downsample    recipes
##   bake.step_upsample      recipes
##   prep.step_downsample    recipes
##   prep.step_upsample      recipes
##   tidy.step_downsample    recipes
##   tidy.step_upsample      recipes
##   tunable.step_downsample recipes
##   tunable.step_upsample   recipes
## 
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
## 
##     step_downsample, step_upsample
ttn_rcp <- recipe(survived ~ ., data = ttn_training) %>%
  update_role(passenger_id, new_role = "id") %>%
  step_downsample(survived) %>% # we need to upsample since died and survived class are imbalance but we want to retain obs as much as possible
  step_rm(cabin, ticket, name) # convert NA value to "unknown"

ttn_rcp 
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         id          1
##    outcome          1
##  predictor         11
## 
## Operations:
## 
## Down-sampling based on survived
## Delete terms cabin, ticket, name

2.4 build model engine

library(ranger)
rf_model <- rand_forest (trees = tune(),
                         mtry = tune(),
                         min_n = tune()) %>%
  set_engine("ranger") %>%
  set_mode("classification")
rf_model
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = tune()
##   min_n = tune()
## 
## Computational engine: ranger

2.5 build workflows

ttn_wf <- workflow ()%>%
  add_recipe(ttn_rcp) %>%
  add_model(rf_model)
ttn_wf
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: rand_forest()
## 
## -- Preprocessor ----------------------------------------------------------------
## 2 Recipe Steps
## 
## * step_downsample()
## * step_rm()
## 
## -- Model -----------------------------------------------------------------------
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = tune()
##   min_n = tune()
## 
## Computational engine: ranger

2.6 tune themodel

grid_srch <- grid_regular(min_n(range = c(1,20)),
                          mtry(range = c(1,8)),# Maximum is the number of variable
                          trees(range = c(50,1000)),
                          levels = 5)

tuned_model <- tune_grid(ttn_wf,
                         resamples = ttn_resampling,
                         grid = grid_srch,
                         metrics = metric_set(accuracy, roc_auc, sensitivity, specificity), 
                         control = control_resamples(save_pred = T))
## Warning: package 'rlang' was built under R version 4.0.4
## Warning: package 'vctrs' was built under R version 4.0.4

2.7 best model selection based on ROC-AUC

show_best(tuned_model, "roc_auc")
## # A tibble: 5 x 9
##    mtry trees min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     2   762     5 roc_auc binary     0.845    25 0.00353 Preprocessor1_Model0~
## 2     2  1000    10 roc_auc binary     0.845    25 0.00365 Preprocessor1_Model1~
## 3     2   762     1 roc_auc binary     0.845    25 0.00342 Preprocessor1_Model0~
## 4     2   525     1 roc_auc binary     0.845    25 0.00350 Preprocessor1_Model0~
## 5     2  1000     5 roc_auc binary     0.845    25 0.00359 Preprocessor1_Model1~

2.8 Best model selection based on accuracy

show_best(tuned_model, "accuracy")
## # A tibble: 5 x 9
##    mtry trees min_n .metric  .estimator  mean     n std_err .config             
##   <int> <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1     2  1000     1 accuracy binary     0.788    25 0.00375 Preprocessor1_Model~
## 2     2   525     1 accuracy binary     0.788    25 0.00364 Preprocessor1_Model~
## 3     4   762    20 accuracy binary     0.788    25 0.00407 Preprocessor1_Model~
## 4     4   287    15 accuracy binary     0.788    25 0.00386 Preprocessor1_Model~
## 5     4  1000    15 accuracy binary     0.787    25 0.00392 Preprocessor1_Model~

2.9 finalize the model based on best roc_auc

finalized_model <- finalize_model(rf_model, select_best(tuned_model, "accuracy"))
finalized_model
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = 2
##   trees = 1000
##   min_n = 1
## 
## Computational engine: ranger

2.10 rebuild the model using the best_tunned_specs

tunned_wf <- workflow() %>%
  add_recipe(ttn_rcp) %>%
  add_model(finalized_model)

2.11 final fit the model on training data and evaluate the model on tesing data using finalized model

final_fit <-tunned_wf %>% last_fit(ttn_split,
                                   metrics = metric_set(accuracy, roc_auc))

2.12 collect the predicted value on testing

final_fit %>% conf_mat_resampled()
## # A tibble: 4 x 3
##   Prediction Truth  Freq
##   <fct>      <fct> <dbl>
## 1 0          0       118
## 2 0          1        13
## 3 1          0        19
## 4 1          1        72

STEP 3: PREDICT SURVIVAL STATUS

##3.1 Pull the workfows and fit it on testing data

saved_model <- final_fit$.workflow[[1]]
suvival_prediction <- predict(saved_model, test2)
write_csv(suvival_prediction, "submission.csv")