STEP 1: LOAD AND PREPROCESS DATA

1.1 loading data

test <- read_csv("test.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   PassengerId = col_double(),
##   Pclass = col_double(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_double(),
##   SibSp = col_double(),
##   Parch = col_double(),
##   Ticket = col_character(),
##   Fare = col_double(),
##   Cabin = col_character(),
##   Embarked = col_character()
## )

train <- read_csv("train.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   PassengerId = col_double(),
##   Survived = col_double(),
##   Pclass = col_double(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_double(),
##   SibSp = col_double(),
##   Parch = col_double(),
##   Ticket = col_character(),
##   Fare = col_double(),
##   Cabin = col_character(),
##   Embarked = col_character()
## )

1.2 preprocess training and tesing data

full_dat <- (bind_rows(train, test)) %>% 
  clean_names() %>%
  mutate(index = case_when(is.na(survived)=="FALSE"~"train",
                           TRUE ~ "test"),
         survived = factor(survived,
                           levels = c("0", "1"),
                           labels = c("0", "1")),
         pclass = factor(pclass,
                         levels = c("1", "2", "3"),
                         labels = c("1st", "2nd", "3rd")),
         embarked = as.factor(embarked),
         sex = as.factor(sex))

preprocess_fulldat <- recipe(~., data = full_dat) %>%
  step_knnimpute(embarked) %>%
  step_meanimpute(fare, age) %>%
  prep()

fulldat2<- (bake(preprocess_fulldat, new_data = NULL))
  
train2 <- fulldat2[fulldat2$index=="train",]
test2 <- fulldat2[fulldat2$index=="test",]

explore survival status by other factors

skim(train2)

Data summary
Name	train2
Number of rows	891
Number of columns	13
_______________________
Column type frequency:
factor	8
numeric	5
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
survived	0	1.00	FALSE	2	0: 549, 1: 342
pclass	0	1.00	FALSE	3	3rd: 491, 1st: 216, 2nd: 184
name	0	1.00	FALSE	891	Abb: 1, Abb: 1, Abb: 1, Abe: 1
sex	0	1.00	FALSE	2	mal: 577, fem: 314
ticket	0	1.00	FALSE	681	160: 7, 347: 7, CA.: 7, 310: 6
cabin	687	0.23	FALSE	147	B96: 4, C23: 4, G6: 4, C22: 3
embarked	0	1.00	FALSE	3	S: 644, C: 170, Q: 77
index	0	1.00	FALSE	1	tra: 891, tes: 0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
passenger_id	1	446.00	257.35	1.00	223.50	446.00	668.5	891.00	▇▇▇▇▇
age	1	29.74	13.00	0.42	22.00	29.88	35.0	80.00	▂▇▃▁▁
sib_sp	1	0.52	1.10	0.00	0.00	0.00	1.0	8.00	▇▁▁▁▁
parch	1	0.38	0.81	0.00	0.00	0.00	0.0	6.00	▇▁▁▁▁
fare	1	32.20	49.69	0.00	7.91	14.45	31.0	512.33	▇▁▁▁▁

train2 %>% ggplot(aes(survived, fare, fill = sex)) +
  geom_boxplot() +
  scale_y_log10()

train2 %>% ggplot(aes(survived, age, fill = sex)) +
  geom_boxplot()

train2 %>% ggplot(aes(survived, age, fill= survived)) +
  geom_boxplot() +
  facet_wrap(~embarked)

train2 %>% 
  group_by(survived, sex) %>%
  summarise(mean_age = mean(age, na.rm = T)) %>%
  pivot_wider(names_from = sex,
              values_from = mean_age)

## # A tibble: 2 x 3
## # Groups:   survived [2]
##   survived female  male
##   <fct>     <dbl> <dbl>
## 1 0          26.1  31.2
## 2 1          29.0  27.7

train2 %>% 
  group_by(survived, sex) %>%
  summarise(mean_fare = mean(fare, na.rm = T)) %>%
  pivot_wider(names_from = sex,
              values_from = mean_fare)

## # A tibble: 2 x 3
## # Groups:   survived [2]
##   survived female  male
##   <fct>     <dbl> <dbl>
## 1 0          23.0  22.0
## 2 1          51.9  40.8

train2 %>% 
  group_by(survived, sex) %>%
  summarise(mean_parch = mean(parch, na.rm = T)) %>%
  pivot_wider(names_from = sex,
              values_from = mean_parch)

## # A tibble: 2 x 3
## # Groups:   survived [2]
##   survived female  male
##   <fct>     <dbl> <dbl>
## 1 0         1.04  0.207
## 2 1         0.515 0.358

#STEP 2: BUILDING MODEL

2.1 spliting dat

set.seed(111)
ttn_split <- initial_split(train2, strata = survived)
ttn_training <- training(ttn_split)
ttn_tesing <- testing(ttn_split)

2.2 resampling

set.seed(123)
ttn_resampling <- bootstraps(ttn_training, strata = survived)

##2.3 buid preprocessing and function

library(themis) # to downsample

## Registered S3 methods overwritten by 'themis':
##   method                  from   
##   bake.step_downsample    recipes
##   bake.step_upsample      recipes
##   prep.step_downsample    recipes
##   prep.step_upsample      recipes
##   tidy.step_downsample    recipes
##   tidy.step_upsample      recipes
##   tunable.step_downsample recipes
##   tunable.step_upsample   recipes

## 
## Attaching package: 'themis'

## The following objects are masked from 'package:recipes':
## 
##     step_downsample, step_upsample

ttn_rcp <- recipe(survived ~ ., data = ttn_training) %>%
  update_role(passenger_id, new_role = "id") %>%
  step_downsample(survived) %>% # we need to upsample since died and survived class are imbalance but we want to retain obs as much as possible
  step_rm(cabin, ticket, name) # convert NA value to "unknown"

ttn_rcp

## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         id          1
##    outcome          1
##  predictor         11
## 
## Operations:
## 
## Down-sampling based on survived
## Delete terms cabin, ticket, name

2.4 build model engine

library(ranger)
rf_model <- rand_forest (trees = tune(),
                         mtry = tune(),
                         min_n = tune()) %>%
  set_engine("ranger") %>%
  set_mode("classification")
rf_model

## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = tune()
##   min_n = tune()
## 
## Computational engine: ranger

2.5 build workflows

ttn_wf <- workflow ()%>%
  add_recipe(ttn_rcp) %>%
  add_model(rf_model)
ttn_wf

## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: rand_forest()
## 
## -- Preprocessor ----------------------------------------------------------------
## 2 Recipe Steps
## 
## * step_downsample()
## * step_rm()
## 
## -- Model -----------------------------------------------------------------------
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = tune()
##   min_n = tune()
## 
## Computational engine: ranger

2.6 tune themodel

grid_srch <- grid_regular(min_n(range = c(1,20)),
                          mtry(range = c(1,8)),# Maximum is the number of variable
                          trees(range = c(50,1000)),
                          levels = 5)

tuned_model <- tune_grid(ttn_wf,
                         resamples = ttn_resampling,
                         grid = grid_srch,
                         metrics = metric_set(accuracy, roc_auc, sensitivity, specificity), 
                         control = control_resamples(save_pred = T))

## Warning: package 'rlang' was built under R version 4.0.4

## Warning: package 'vctrs' was built under R version 4.0.4

2.7 best model selection based on ROC-AUC

show_best(tuned_model, "roc_auc")

## # A tibble: 5 x 9
##    mtry trees min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     2   762     5 roc_auc binary     0.845    25 0.00353 Preprocessor1_Model0~
## 2     2  1000    10 roc_auc binary     0.845    25 0.00365 Preprocessor1_Model1~
## 3     2   762     1 roc_auc binary     0.845    25 0.00342 Preprocessor1_Model0~
## 4     2   525     1 roc_auc binary     0.845    25 0.00350 Preprocessor1_Model0~
## 5     2  1000     5 roc_auc binary     0.845    25 0.00359 Preprocessor1_Model1~

2.8 Best model selection based on accuracy

show_best(tuned_model, "accuracy")

## # A tibble: 5 x 9
##    mtry trees min_n .metric  .estimator  mean     n std_err .config             
##   <int> <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1     2  1000     1 accuracy binary     0.788    25 0.00375 Preprocessor1_Model~
## 2     2   525     1 accuracy binary     0.788    25 0.00364 Preprocessor1_Model~
## 3     4   762    20 accuracy binary     0.788    25 0.00407 Preprocessor1_Model~
## 4     4   287    15 accuracy binary     0.788    25 0.00386 Preprocessor1_Model~
## 5     4  1000    15 accuracy binary     0.787    25 0.00392 Preprocessor1_Model~

2.9 finalize the model based on best roc_auc

finalized_model <- finalize_model(rf_model, select_best(tuned_model, "accuracy"))
finalized_model

## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = 2
##   trees = 1000
##   min_n = 1
## 
## Computational engine: ranger

2.10 rebuild the model using the best_tunned_specs

tunned_wf <- workflow() %>%
  add_recipe(ttn_rcp) %>%
  add_model(finalized_model)

2.11 final fit the model on training data and evaluate the model on tesing data using finalized model

final_fit <-tunned_wf %>% last_fit(ttn_split,
                                   metrics = metric_set(accuracy, roc_auc))

2.12 collect the predicted value on testing

final_fit %>% conf_mat_resampled()

## # A tibble: 4 x 3
##   Prediction Truth  Freq
##   <fct>      <fct> <dbl>
## 1 0          0       118
## 2 0          1        13
## 3 1          0        19
## 4 1          1        72

TITANIC SUVIVAL PREDICTION USING RANDOM FOREST

HUNG.NGUYEN

4/13/2020

STEP 1: LOAD AND PREPROCESS DATA

1.1 loading data

1.2 preprocess training and tesing data

2.1 spliting dat

2.2 resampling

2.4 build model engine

2.5 build workflows

2.6 tune themodel

2.7 best model selection based on ROC-AUC

2.8 Best model selection based on accuracy

2.9 finalize the model based on best roc_auc

2.10 rebuild the model using the best_tunned_specs

2.11 final fit the model on training data and evaluate the model on tesing data using finalized model

2.12 collect the predicted value on testing

STEP 3: PREDICT SURVIVAL STATUS