Data Preparation

Introduction

In the previous post, I predicted which Spaceship Titanic passengers were transported to an alternate dimension using logistic regression and KNN. This time, I intend to do the same with different tools.

Import Data

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readr)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(DataExplorer)
library(missRanger)
library(performance)
library(see)
library(class)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(gtools)
library(ROCR)
## 
## Attaching package: 'ROCR'
## The following object is masked from 'package:performance':
## 
##     performance
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:gtools':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(broom) 
library(earth) # mars
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
library(e1071) # bayes
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:gtools':
## 
##     permutations
library(partykit) # decision tree
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(ranger) # fast random forest
train <- read_csv("train.csv")
## Rows: 8693 Columns: 14
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (3): CryoSleep, VIP, Transported
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv("test.csv")
## Rows: 4277 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (2): CryoSleep, VIP
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(train, 3)
## # A tibble: 3 x 14
##   PassengerId HomePlanet CryoSleep Cabin Destination   Age VIP   RoomService
##   <chr>       <chr>      <lgl>     <chr> <chr>       <dbl> <lgl>       <dbl>
## 1 0001_01     Europa     FALSE     B/0/P TRAPPIST-1e    39 FALSE           0
## 2 0002_01     Earth      FALSE     F/0/S TRAPPIST-1e    24 FALSE         109
## 3 0003_01     Europa     FALSE     A/0/S TRAPPIST-1e    58 TRUE           43
## # ... with 6 more variables: FoodCourt <dbl>, ShoppingMall <dbl>, Spa <dbl>,
## #   VRDeck <dbl>, Name <chr>, Transported <lgl>
head(test, 3)
## # A tibble: 3 x 13
##   PassengerId HomePlanet CryoSleep Cabin Destination   Age VIP   RoomService
##   <chr>       <chr>      <lgl>     <chr> <chr>       <dbl> <lgl>       <dbl>
## 1 0013_01     Earth      TRUE      G/3/S TRAPPIST-1e    27 FALSE           0
## 2 0018_01     Earth      FALSE     F/4/S TRAPPIST-1e    19 FALSE           0
## 3 0019_01     Europa     TRUE      C/0/S 55 Cancri e    31 FALSE           0
## # ... with 5 more variables: FoodCourt <dbl>, ShoppingMall <dbl>, Spa <dbl>,
## #   VRDeck <dbl>, Name <chr>

To understand and process the data as a whole, we first need to join the two sets together.

whole <- bind_rows(train, test)

Initial Cleansing

First, clean the column names.

whole <- whole %>% clean_names()
str(whole)
## spec_tbl_df [12,970 x 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ passenger_id : chr [1:12970] "0001_01" "0002_01" "0003_01" "0003_02" ...
##  $ home_planet  : chr [1:12970] "Europa" "Earth" "Europa" "Europa" ...
##  $ cryo_sleep   : logi [1:12970] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ cabin        : chr [1:12970] "B/0/P" "F/0/S" "A/0/S" "A/0/S" ...
##  $ destination  : chr [1:12970] "TRAPPIST-1e" "TRAPPIST-1e" "TRAPPIST-1e" "TRAPPIST-1e" ...
##  $ age          : num [1:12970] 39 24 58 33 16 44 26 28 35 14 ...
##  $ vip          : logi [1:12970] FALSE FALSE TRUE FALSE FALSE FALSE ...
##  $ room_service : num [1:12970] 0 109 43 0 303 0 42 0 0 0 ...
##  $ food_court   : num [1:12970] 0 9 3576 1283 70 ...
##  $ shopping_mall: num [1:12970] 0 25 0 371 151 0 3 0 17 0 ...
##  $ spa          : num [1:12970] 0 549 6715 3329 565 ...
##  $ vr_deck      : num [1:12970] 0 44 49 193 2 0 0 NA 0 0 ...
##  $ name         : chr [1:12970] "Maham Ofracculy" "Juanna Vines" "Altark Susent" "Solam Susent" ...
##  $ transported  : logi [1:12970] FALSE TRUE FALSE FALSE TRUE TRUE ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   PassengerId = col_character(),
##   ..   HomePlanet = col_character(),
##   ..   CryoSleep = col_logical(),
##   ..   Cabin = col_character(),
##   ..   Destination = col_character(),
##   ..   Age = col_double(),
##   ..   VIP = col_logical(),
##   ..   RoomService = col_double(),
##   ..   FoodCourt = col_double(),
##   ..   ShoppingMall = col_double(),
##   ..   Spa = col_double(),
##   ..   VRDeck = col_double(),
##   ..   Name = col_character(),
##   ..   Transported = col_logical()
##   .. )
##  - attr(*, "problems")=<externalptr>

Let’s check for missing values.

plot_missing(whole)

The missing values are not a big problem; the percentages are small enough that the affected rows could simply be omitted. To be safe, though, I will impute them later. For now, let’s count the unique values in each column to decide whether some variables need to be converted to factors.

sapply(whole, function(x) n_distinct(x))
##  passenger_id   home_planet    cryo_sleep         cabin   destination 
##         12970             4             3          9826             4 
##           age           vip  room_service    food_court shopping_mall 
##            81             3          1579          1954          1368 
##           spa       vr_deck          name   transported 
##          1680          1643         12630             3

Since cryo_sleep and vip are already of class logical, we only need to convert home_planet and destination from character to factor.

whole <- whole %>% 
  mutate(home_planet = as.factor(home_planet),
         destination = as.factor(destination)
         )

Impute the missing values with missRanger, using a formula whose left-hand side targets every column except passenger_id and transported. For the predictors on the right-hand side, we also drop the high-cardinality text columns (cabin and name).

init_imp = 
  home_planet + cryo_sleep + cabin + destination + age + vip + room_service + food_court + shopping_mall + spa + vr_deck + name ~ home_planet + cryo_sleep + destination + vip + room_service + food_court + shopping_mall + spa + vr_deck

whole_init_imp <- missRanger(data = whole,
                             formula = init_imp,
                             pmm.k = 3, # to round imputation result with predictive mean matching
                             verbose = 2, 
                             num.trees = 50, # tweaks to make things faster
                             sample.fraction = 0.1, # tweaks to make things faster
                             splitrule = "extratrees", # tweaks to make things faster
                             max.depth = 6, # tweaks to make things faster
                             min.node.size = 10000, # tweaks to make things faster
                             maxiter = 2)# tweaks to make things faster
## 
## Missing value imputation by random forests
## 
##   Variables to impute:       home_planet, cryo_sleep, cabin, destination, age, vip, room_service, food_court, shopping_mall, spa, vr_deck, name
##   Variables used to impute:  home_planet, cryo_sleep, destination, vip, room_service, food_court, shopping_mall, spa, vr_deck
##  rm_srv  vr_dck  age dstntn  spa hm_pln  fd_crt  name    vip cabin   shppn_  cry_sl
## iter 1:  1.0000  1.0001  1.0001  0.3013  1.0001  0.4587  1.0001  1.0000  0.0215  1.0000  1.0000  0.3618  
## iter 2:  1.0001  1.0001  1.0002  0.3013  1.0000  0.4587  1.0000  1.0000  0.0215  0.9998  1.0001  0.3618  
colSums(is.na(whole_init_imp))
##  passenger_id   home_planet    cryo_sleep         cabin   destination 
##             0             0             0             0             0 
##           age           vip  room_service    food_court shopping_mall 
##             0             0             0             0             0 
##           spa       vr_deck          name   transported 
##             0             0             0          4277

Feature Engineering

We can extract the group and the order within the group from passenger_id. Then we can extract the deck, number, and side from the cabin code. Furthermore, we can derive the average, sum, IQR, and more from how the passengers spent their money. We can also split name into first and last names.

lux <- c("room_service", "food_court", "shopping_mall", "spa", "vr_deck")

whole_init_exp <- whole_init_imp %>% 
  mutate(passenger_id = as.factor(passenger_id), # turn into factor so it isn't removed later
         passenger_group = str_sub(passenger_id, 1, 4),
         passenger_number_within_group = 
           as.factor(str_sub(passenger_id, 6, 7)),
         cabin_deck = as.factor(str_sub(cabin, 1, 1)),
         cabin_number = as.factor(str_sub(cabin, 3, 3)),
         cabin_side = as.factor(str_sub(cabin, -1))) %>% 
  separate(col = name,
           sep = " ",
           into = c("name_first","name_last"),) %>% 
  mutate(name_last = as.factor(name_last)) %>% 
  rowwise() %>% 
  mutate(lux_spent_average = mean(c_across(cols = all_of(lux))),
         lux_spent_sum = sum(c_across(cols = all_of(lux))),
         lux_spent_median = median(c_across(cols = all_of(lux))),
         lux_spent_var = var(c_across(cols = all_of(lux))),
         lux_spent_sd = sd(c_across(cols = all_of(lux))),
         lux_spent_iqr = IQR(c_across(cols = all_of(lux)))
         ) %>% 
  ungroup()

head(whole_init_exp)
## # A tibble: 6 x 26
##   passenger_id home_planet cryo_sleep cabin destination   age vip   room_service
##   <fct>        <fct>       <lgl>      <chr> <fct>       <dbl> <lgl>        <dbl>
## 1 0001_01      Europa      FALSE      B/0/P TRAPPIST-1e    39 FALSE            0
## 2 0002_01      Earth       FALSE      F/0/S TRAPPIST-1e    24 FALSE          109
## 3 0003_01      Europa      FALSE      A/0/S TRAPPIST-1e    58 TRUE            43
## 4 0003_02      Europa      FALSE      A/0/S TRAPPIST-1e    33 FALSE            0
## 5 0004_01      Earth       FALSE      F/1/S TRAPPIST-1e    16 FALSE          303
## 6 0005_01      Earth       FALSE      F/0/P PSO J318.5~    44 FALSE            0
## # ... with 18 more variables: food_court <dbl>, shopping_mall <dbl>, spa <dbl>,
## #   vr_deck <dbl>, name_first <chr>, name_last <fct>, transported <lgl>,
## #   passenger_group <chr>, passenger_number_within_group <fct>,
## #   cabin_deck <fct>, cabin_number <fct>, cabin_side <fct>,
## #   lux_spent_average <dbl>, lux_spent_sum <dbl>, lux_spent_median <dbl>,
## #   lux_spent_var <dbl>, lux_spent_sd <dbl>, lux_spent_iqr <dbl>

Remove the character-class columns from the data frame. The passenger_id feature is kept around (as a factor) for the test submission later.

whole_init_exp <- whole_init_exp %>% select(-where(is.character)) # passenger id is kept 
head(whole_init_exp,2)
## # A tibble: 2 x 23
##   passenger_id home_planet cryo_sleep destination   age vip   room_service
##   <fct>        <fct>       <lgl>      <fct>       <dbl> <lgl>        <dbl>
## 1 0001_01      Europa      FALSE      TRAPPIST-1e    39 FALSE            0
## 2 0002_01      Earth       FALSE      TRAPPIST-1e    24 FALSE          109
## # ... with 16 more variables: food_court <dbl>, shopping_mall <dbl>, spa <dbl>,
## #   vr_deck <dbl>, name_last <fct>, transported <lgl>,
## #   passenger_number_within_group <fct>, cabin_deck <fct>, cabin_number <fct>,
## #   cabin_side <fct>, lux_spent_average <dbl>, lux_spent_sum <dbl>,
## #   lux_spent_median <dbl>, lux_spent_var <dbl>, lux_spent_sd <dbl>,
## #   lux_spent_iqr <dbl>

Analysis

Exploratory Data Analysis

Let’s see the expanded dataset’s summary.

plot_intro(whole_init_exp)

Let’s visualize the discrete features that have fewer than 50 categories.

plot_bar(whole_init_exp, 
         by = "transported", 
         by_position = "fill",
         nrow = 2,
         ncol = 2)
## 2 columns ignored with more than 50 categories.
## passenger_id: 12970 categories
## name_last: 2406 categories

From the quick peek above, VIP passengers who slept in cryo were much more likely not to be transported away. The same goes for those with a cabin on deck E or with cabin number 0.

Then we take a look at the continuous numbers.

plot_boxplot(whole_init_exp,
             by = "transported", 
             nrow = 2, 
             ncol = 2)

Passengers who spent their money in crowded open spaces like the food_court and shopping_mall were more likely to vanish. Meanwhile, spending money in more private settings like room_service, spa, and vr_deck improved their chance of not being taken away into some alternate dimension. Was there something contagious about the way the vanishing happened?

They’re mostly not normally distributed, but to be sure, let’s look at them in QQ plots.

plot_qq(whole_init_exp, 
        by = "transported",
        nrow = 2,
        ncol = 2
        )

Only age bears any resemblance to a normal distribution; the rest are skewed.

Cross-Validation

We have to split the train data into two parts:

  • Train - Learn : we use this to train the model
  • Train - Validation : we then evaluate the model on this data set

train_expanded <- whole_init_exp %>% filter(!is.na(transported))
test_expanded <- whole_init_exp %>% filter(is.na(transported))
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(1)
idx <- sample(nrow(train_expanded),
              0.8 * nrow(train_expanded))
train_exp_learn <- train_expanded[idx,-1]
train_exp_val <- train_expanded[-idx,-1]

Let’s check the target balance.

prop.table(table(train_exp_learn$transported))
## 
##     FALSE      TRUE 
## 0.4982744 0.5017256

Cool, it’s already balanced.

Modeling and Validation

Features Selection

I’m going to use the earth library to look for important variables. The package uses a non-parametric regression technique called multivariate adaptive regression splines (MARS), which automatically models nonlinearities and interactions between variables.

mars_train_learn <- earth(transported ~ ., 
                          data=train_exp_learn) # build model
ev <- evimp(mars_train_learn)
ev
##                        nsubsets   gcv    rss
## lux_spent_average            23 100.0  100.0
## food_court                   22  68.0   68.7
## shopping_mall                21  60.7   61.5
## lux_spent_iqr                20  53.5   54.5
## cabin_deckG                  19  41.2   42.6
## cryo_sleepTRUE               18  36.1   37.7
## cabin_sideS                  17  32.7   34.4
## home_planetEuropa            14  26.5   28.2
## cabin_number9                13  23.4   25.3
## age                          12  21.5   23.4
## lux_spent_median             10  17.9   19.8
## cabin_deckE                   9  16.1   18.0
## home_planetMars               5   9.7   11.5
## destinationTRAPPIST-1e        5   9.7   11.5
## name_lastMcbriddley           4   8.5   10.1
## cabin_deckC                   2   5.1    6.4

MARS’ automated feature selection above ranks the important features by the number of model subsets they appear in (nsubsets), generalized cross-validation (gcv), and the residual sum of squares (rss).

topf <- c("lux_spent_average","food_court","lux_spent_iqr","shopping_mall","cryo_sleep","home_planet","lux_spent_sum","cabin_side","lux_spent_median","cabin_number","age","cabin_deck","destination","name_last","room_service","spa")
formula_topf <- as.formula(paste("transported ~ ", paste(topf, collapse = "+"))) 
formula_topf
## transported ~ lux_spent_average + food_court + lux_spent_iqr + 
##     shopping_mall + cryo_sleep + home_planet + lux_spent_sum + 
##     cabin_side + lux_spent_median + cabin_number + age + cabin_deck + 
##     destination + name_last + room_service + spa

Naive Bayes Model

Naive Bayes assumes there is a dependent relationship between the response variable and the explanatory variables. The ‘naive’ part of its name refers to the assumption that the predictors are independent of one another and weighted equally. Naive Bayes is weakened when a predictor level never co-occurs with a class (a zero frequency), so we need to handle that with Laplace smoothing.
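
To make the laplace argument concrete, here is a minimal sketch (my own addition, using only the home_planet predictor) comparing the conditional probability tables with and without smoothing:

nb_raw <- naiveBayes(transported ~ home_planet, data = train_exp_learn, laplace = 0)
nb_lap <- naiveBayes(transported ~ home_planet, data = train_exp_learn, laplace = 5)
nb_raw$tables$home_planet # raw conditional proportions; a level never seen with a class would get 0
nb_lap$tables$home_planet # laplace = 5 adds pseudo-counts, pulling the proportions toward uniform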

Fit the model.

naive_spaceship <- naiveBayes(formula_topf, train_exp_learn, laplace=5)

Predict the result.

naive_result <- train_exp_val %>% 
  mutate(pred = predict(naive_spaceship, train_exp_val, type = "class")) %>% 
  select(transported, pred)
naive_result[1:20,]
## # A tibble: 20 x 2
##    transported pred 
##    <lgl>       <fct>
##  1 TRUE        TRUE 
##  2 FALSE       FALSE
##  3 FALSE       TRUE 
##  4 TRUE        TRUE 
##  5 FALSE       TRUE 
##  6 FALSE       TRUE 
##  7 FALSE       TRUE 
##  8 FALSE       FALSE
##  9 FALSE       FALSE
## 10 FALSE       FALSE
## 11 FALSE       FALSE
## 12 TRUE        TRUE 
## 13 FALSE       TRUE 
## 14 FALSE       FALSE
## 15 TRUE        TRUE 
## 16 TRUE        TRUE 
## 17 FALSE       FALSE
## 18 TRUE        TRUE 
## 19 TRUE        TRUE 
## 20 TRUE        TRUE

Let’s see the confusion matrix.

naive_confmat <- confusionMatrix(data = naive_result$pred, reference = as.factor(naive_result$transported), positive = "TRUE")

naive_confmat
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   341   46
##      TRUE    509  843
##                                           
##                Accuracy : 0.6809          
##                  95% CI : (0.6584, 0.7027)
##     No Information Rate : 0.5112          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3537          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9483          
##             Specificity : 0.4012          
##          Pos Pred Value : 0.6235          
##          Neg Pred Value : 0.8811          
##              Prevalence : 0.5112          
##          Detection Rate : 0.4848          
##    Detection Prevalence : 0.7775          
##       Balanced Accuracy : 0.6747          
##                                           
##        'Positive' Class : TRUE            
## 

The accuracy doesn’t look so good, but the sensitivity is high, and in this case of predicting calamity, greater sensitivity at the cost of precision is perhaps preferable; we can’t be too careful.
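
To put a single number on that preference, here is a small sketch (my own addition) of an F2 score, which weights recall (sensitivity) twice as heavily as precision:

prec <- naive_confmat$byClass["Precision"]
rec  <- naive_confmat$byClass["Recall"]
f2 <- (1 + 2^2) * prec * rec / (2^2 * prec + rec) # F-beta with beta = 2 favours recall
unname(f2)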

Next, we have to see the ROC and AUC.

naive_result <- naive_result %>% 
  mutate(num_pred = predict(naive_spaceship, train_exp_val, type = "raw"),
         num_trans = ifelse(transported == TRUE, 1, 0))

naive_roc <- prediction(predictions = naive_result$num_pred[,"TRUE"], 
                        labels = naive_result[,"num_trans"])
# ROC curve
perf_naive_roc <- ROCR::performance(
                  prediction.obj = naive_roc, 
                  measure = "tpr", 
                  x.measure = "fpr")
plot(perf_naive_roc, main="Naive Bayes ROC Curve")
abline(0,1,lty = 8)

naive_auc <- ROCR::performance(prediction.obj = naive_roc, measure = "auc")
naive_auc@y.values
## [[1]]
## [1] 0.8537365

Decision Tree Model

A decision tree is grown from the data by recursive splitting; classic implementations choose the split that maximizes information gain (reduces entropy), while ctree() from partykit selects splits with conditional inference (permutation) tests.

Fit a tree. The features transported and cryo_sleep are converted to factors so that ctree() fits a classification tree.

# tree_spaceship <- ctree(formula_topf,
#                         data = train_exp_learn %>% mutate(transported = as.factor(transported),
#                                                           cryo_sleep = as.factor(cryo_sleep))
#                         )
# saveRDS(object = tree_spaceship,file = "tree_spaceship.RDS")

Whoa, the training took a while, so I saved the model to an RDS file, commented out the fitting code, and read the RDS back in. This way I don’t need to re-train the tree every time, because it’s so time-consuming.

tree_spaceship <- readRDS("tree_spaceship.RDS")
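
As a side note, a minimal sketch of wrapping this cache-or-fit pattern in one chunk (assuming the same file name) would be:

if (file.exists("tree_spaceship.RDS")) {
  tree_spaceship <- readRDS("tree_spaceship.RDS")   # reuse the cached model
} else {
  tree_spaceship <- ctree(formula_topf,
                          data = train_exp_learn %>%
                            mutate(transported = as.factor(transported),
                                   cryo_sleep = as.factor(cryo_sleep)))
  saveRDS(tree_spaceship, "tree_spaceship.RDS")      # cache it for the next knit
}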

The tree

tree_spaceship
## 
## Model formula:
## transported ~ lux_spent_average + food_court + lux_spent_iqr + 
##     shopping_mall + cryo_sleep + home_planet + lux_spent_sum + 
##     cabin_side + lux_spent_median + cabin_number + age + cabin_deck + 
##     destination + name_last + room_service + spa
## 
## Fitted party:
## [1] root
## |   [2] cryo_sleep in FALSE
## |   |   [3] food_court <= 2165
## |   |   |   [4] shopping_mall <= 627
## |   |   |   |   [5] lux_spent_average <= 5.2
## |   |   |   |   |   [6] cabin_deck in A, B, C, F
## |   |   |   |   |   |   [7] age <= 30
## |   |   |   |   |   |   |   [8] home_planet in Earth, Europa: TRUE (n = 42, err = 4.8%)
## |   |   |   |   |   |   |   [9] home_planet in Mars: TRUE (n = 65, err = 0.0%)
## |   |   |   |   |   |   [10] age > 30: TRUE (n = 31, err = 41.9%)
## |   |   |   |   |   [11] cabin_deck in D, E, G
## |   |   |   |   |   |   [12] age <= 15: TRUE (n = 245, err = 45.3%)
## |   |   |   |   |   |   [13] age > 15: FALSE (n = 54, err = 22.2%)
## |   |   |   |   [14] lux_spent_average > 5.2
## |   |   |   |   |   [15] food_court <= 499
## |   |   |   |   |   |   [16] lux_spent_sum <= 1483
## |   |   |   |   |   |   |   [17] food_court <= 165
## |   |   |   |   |   |   |   |   [18] destination in 55 Cancri e, PSO J318.5-22
## |   |   |   |   |   |   |   |   |   [19] cabin_deck in D, F, G: FALSE (n = 349, err = 18.9%)
## |   |   |   |   |   |   |   |   |   [20] cabin_deck in E
## |   |   |   |   |   |   |   |   |   |   [21] lux_spent_sum <= 722: TRUE (n = 18, err = 0.0%)
## |   |   |   |   |   |   |   |   |   |   [22] lux_spent_sum > 722: FALSE (n = 14, err = 35.7%)
## |   |   |   |   |   |   |   |   [23] destination in TRAPPIST-1e: FALSE (n = 1062, err = 15.6%)
## |   |   |   |   |   |   |   [24] food_court > 165
## |   |   |   |   |   |   |   |   [25] shopping_mall <= 414: FALSE (n = 214, err = 26.6%)
## |   |   |   |   |   |   |   |   [26] shopping_mall > 414: TRUE (n = 18, err = 22.2%)
## |   |   |   |   |   |   [27] lux_spent_sum > 1483: FALSE (n = 632, err = 6.0%)
## |   |   |   |   |   [28] food_court > 499
## |   |   |   |   |   |   [29] lux_spent_iqr <= 451
## |   |   |   |   |   |   |   [30] cabin_side in P
## |   |   |   |   |   |   |   |   [31] lux_spent_median <= 62: FALSE (n = 162, err = 46.9%)
## |   |   |   |   |   |   |   |   [32] lux_spent_median > 62: FALSE (n = 18, err = 5.6%)
## |   |   |   |   |   |   |   [33] cabin_side in S: TRUE (n = 189, err = 34.4%)
## |   |   |   |   |   |   [34] lux_spent_iqr > 451
## |   |   |   |   |   |   |   [35] lux_spent_average <= 855
## |   |   |   |   |   |   |   |   [36] cabin_side in P: FALSE (n = 94, err = 17.0%)
## |   |   |   |   |   |   |   |   [37] cabin_side in S
## |   |   |   |   |   |   |   |   |   [38] cabin_deck in A, B, D, E, F, G: FALSE (n = 75, err = 24.0%)
## |   |   |   |   |   |   |   |   |   [39] cabin_deck in C: TRUE (n = 18, err = 11.1%)
## |   |   |   |   |   |   |   [40] lux_spent_average > 855: FALSE (n = 106, err = 2.8%)
## |   |   |   [41] shopping_mall > 627
## |   |   |   |   [42] lux_spent_iqr <= 226
## |   |   |   |   |   [43] shopping_mall <= 1540: TRUE (n = 284, err = 39.4%)
## |   |   |   |   |   [44] shopping_mall > 1540: TRUE (n = 59, err = 3.4%)
## |   |   |   |   [45] lux_spent_iqr > 226
## |   |   |   |   |   [46] shopping_mall <= 2975: FALSE (n = 276, err = 30.1%)
## |   |   |   |   |   [47] shopping_mall > 2975: TRUE (n = 25, err = 28.0%)
## |   |   [48] food_court > 2165
## |   |   |   [49] lux_spent_iqr <= 2192
## |   |   |   |   [50] lux_spent_iqr <= 1017
## |   |   |   |   |   [51] lux_spent_median <= 168: TRUE (n = 162, err = 2.5%)
## |   |   |   |   |   [52] lux_spent_median > 168: TRUE (n = 48, err = 20.8%)
## |   |   |   |   [53] lux_spent_iqr > 1017
## |   |   |   |   |   [54] food_court <= 4155: FALSE (n = 44, err = 34.1%)
## |   |   |   |   |   [55] food_court > 4155: TRUE (n = 42, err = 16.7%)
## |   |   |   [56] lux_spent_iqr > 2192
## |   |   |   |   [57] shopping_mall <= 552
## |   |   |   |   |   [58] food_court <= 4089: FALSE (n = 48, err = 2.1%)
## |   |   |   |   |   [59] food_court > 4089
## |   |   |   |   |   |   [60] lux_spent_iqr <= 4103: TRUE (n = 25, err = 48.0%)
## |   |   |   |   |   |   [61] lux_spent_iqr > 4103: FALSE (n = 35, err = 2.9%)
## |   |   |   |   [62] shopping_mall > 552: TRUE (n = 7, err = 42.9%)
## |   [63] cryo_sleep in TRUE
## |   |   [64] cabin_deck in A, B, C, D, F
## |   |   |   [65] lux_spent_average <= 118
## |   |   |   |   [66] home_planet in Earth: TRUE (n = 30, err = 23.3%)
## |   |   |   |   [67] home_planet in Europa, Mars: TRUE (n = 1116, err = 1.4%)
## |   |   |   [68] lux_spent_average > 118
## |   |   |   |   [69] spa <= 29
## |   |   |   |   |   [70] home_planet in Earth: FALSE (n = 14, err = 42.9%)
## |   |   |   |   |   [71] home_planet in Europa, Mars: TRUE (n = 48, err = 14.6%)
## |   |   |   |   [72] spa > 29: FALSE (n = 20, err = 25.0%)
## |   |   [73] cabin_deck in E, G
## |   |   |   [74] home_planet in Earth, Mars
## |   |   |   |   [75] cabin_side in P
## |   |   |   |   |   [76] destination in 55 Cancri e, PSO J318.5-22: TRUE (n = 239, err = 33.1%)
## |   |   |   |   |   [77] destination in TRAPPIST-1e: TRUE (n = 374, err = 47.6%)
## |   |   |   |   [78] cabin_side in S
## |   |   |   |   |   [79] cabin_number in 0, 1, 2, 3, 4, 5
## |   |   |   |   |   |   [80] cabin_number in 0, 2, 3: FALSE (n = 101, err = 46.5%)
## |   |   |   |   |   |   [81] cabin_number in 1, 4, 5: TRUE (n = 325, err = 30.8%)
## |   |   |   |   |   [82] cabin_number in 6, 7, 8, 9: TRUE (n = 166, err = 10.8%)
## |   |   |   [83] home_planet in Europa: TRUE (n = 60, err = 6.7%)
## 
## Number of inner nodes:    41
## Number of terminal nodes: 42
plot(tree_spaceship, type="simple")

Damn. The tree has grown into a maze. The advantage of a decision tree is supposed to be its interpretability, but this? Perhaps we need to prune it later; if this tree performed better than the other models, it would earn that extra attention. For now, let’s just use it to predict.
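
If we do decide to tame it later, a hedged sketch (not run here) would be to restrain growth up front through ctree_control(), demanding stronger evidence per split and capping the depth:

tree_pruned <- ctree(formula_topf,
                     data = train_exp_learn %>%
                       mutate(transported = as.factor(transported),
                              cryo_sleep = as.factor(cryo_sleep)),
                     control = ctree_control(mincriterion = 0.99, # stricter test per split
                                             maxdepth = 5))       # hard cap on tree depth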

train_exp_val_fortreepredict <- train_exp_val %>% 
  mutate(cryo_sleep = as.factor(cryo_sleep),
         transported = as.factor(transported))

tree_result <- predict(object = tree_spaceship,
                       newdata =  train_exp_val_fortreepredict,
                       type = "response")

Observe the confusion matrix.

tree_confmat <- confusionMatrix(data = as.factor(tree_result),
                reference = as.factor(train_exp_val$transported))
tree_confmat
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   649  157
##      TRUE    201  732
##                                           
##                Accuracy : 0.7941          
##                  95% CI : (0.7744, 0.8129)
##     No Information Rate : 0.5112          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.5876          
##                                           
##  Mcnemar's Test P-Value : 0.02305         
##                                           
##             Sensitivity : 0.7635          
##             Specificity : 0.8234          
##          Pos Pred Value : 0.8052          
##          Neg Pred Value : 0.7846          
##              Prevalence : 0.4888          
##          Detection Rate : 0.3732          
##    Detection Prevalence : 0.4635          
##       Balanced Accuracy : 0.7935          
##                                           
##        'Positive' Class : FALSE           
## 

Proceed to the ROC and AUC.

tree_roc <- ROCR::prediction(predictions = as.vector(as.numeric(tree_result)), 
                        labels = as.vector(as.numeric(train_exp_val$transported)))
perf_tree_roc <- ROCR::performance(
                  prediction.obj = tree_roc, 
                  measure = "tpr", 
                  x.measure = "fpr")
plot(perf_tree_roc, main="Decision Tree ROC Curve")
abline(0,1,lty = 8)

tree_auc <- ROCR::performance(prediction.obj = tree_roc, measure = "auc")
tree_auc@y.values
## [[1]]
## [1] 0.7934632
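
Note that tree_result holds hard class labels, so the ROC above has a single bend. A sketch of a probability-based curve (assuming the fitted factor levels are "FALSE"/"TRUE"):

tree_prob <- predict(tree_spaceship,
                     newdata = train_exp_val_fortreepredict,
                     type = "prob")[, "TRUE"]          # P(transported = TRUE) per passenger
tree_roc_prob <- ROCR::prediction(predictions = tree_prob,
                                  labels = as.numeric(train_exp_val$transported))
ROCR::performance(tree_roc_prob, measure = "auc")@y.values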

Random Forest Model

Fit the forest. This time I use ranger, because it is fast.

forest_spaceship <- ranger(formula = formula_topf,
                           data = train_exp_learn,
                           importance = "impurity",
                           min.node.size = 1,
                           seed = 123,
                           num.trees = 10000
                           )

forest_spaceship
## Ranger result
## 
## Call:
##  ranger(formula = formula_topf, data = train_exp_learn, importance = "impurity",      min.node.size = 1, seed = 123, num.trees = 10000) 
## 
## Type:                             Classification 
## Number of trees:                  10000 
## Sample size:                      6954 
## Number of independent variables:  16 
## Mtry:                             4 
## Target node size:                 1 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error:             20.72 %

Predict.

forest_val_pred <- predict(forest_spaceship, train_exp_val)

Validate with Confusion Matrix.

forest_confmat <- confusionMatrix(data = as.factor(forest_val_pred$predictions), 
                                  reference = as.factor(as.numeric(train_exp_val$transported)),
                                  positive = "1")
forest_confmat
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 667 172
##          1 183 717
##                                           
##                Accuracy : 0.7959          
##                  95% CI : (0.7761, 0.8146)
##     No Information Rate : 0.5112          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5914          
##                                           
##  Mcnemar's Test P-Value : 0.5956          
##                                           
##             Sensitivity : 0.8065          
##             Specificity : 0.7847          
##          Pos Pred Value : 0.7967          
##          Neg Pred Value : 0.7950          
##              Prevalence : 0.5112          
##          Detection Rate : 0.4123          
##    Detection Prevalence : 0.5175          
##       Balanced Accuracy : 0.7956          
##                                           
##        'Positive' Class : 1               
## 

We continue to the ROC and AUC.

forest_roc <- ROCR::prediction(predictions = forest_val_pred$predictions, 
                        labels = train_exp_val$transported)
perf_forest_roc <- ROCR::performance(
                  prediction.obj = forest_roc, 
                  measure = "tpr", 
                  x.measure = "fpr")
plot(perf_forest_roc, main="Random Forest ROC Curve")
abline(0,1,lty = 8)

forest_auc <- ROCR::performance(prediction.obj = forest_roc, measure = "auc")
forest_auc@y.values
## [[1]]
## [1] 0.795615
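
The forest curve is likewise built from hard labels. A minimal sketch of a probability-based alternative, assuming we refit ranger as a probability forest (with transported converted to a factor):

forest_prob <- ranger(formula = formula_topf,
                      data = train_exp_learn %>% mutate(transported = as.factor(transported)),
                      probability = TRUE,   # grow a probability forest
                      num.trees = 1000,
                      seed = 123)
prob_pred <- predict(forest_prob, train_exp_val)$predictions[, "TRUE"]
forest_roc_prob <- ROCR::prediction(prob_pred, as.numeric(train_exp_val$transported))
ROCR::performance(forest_roc_prob, measure = "auc")@y.values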

Evaluation

Confusion Matrix Comparison

naive_confmat$overall[1]
##  Accuracy 
## 0.6808511
tree_confmat$overall[1]
##  Accuracy 
## 0.7941346
forest_confmat$overall[1]
##  Accuracy 
## 0.7958597
naive_confmat$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.9482565            0.4011765            0.6235207 
##       Neg Pred Value            Precision               Recall 
##            0.8811370            0.6235207            0.9482565 
##                   F1           Prevalence       Detection Rate 
##            0.7523427            0.5112133            0.4847614 
## Detection Prevalence    Balanced Accuracy 
##            0.7774583            0.6747165
tree_confmat$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.7635294            0.8233971            0.8052109 
##       Neg Pred Value            Precision               Recall 
##            0.7845659            0.8052109            0.7635294 
##                   F1           Prevalence       Detection Rate 
##            0.7838164            0.4887867            0.3732030 
## Detection Prevalence    Balanced Accuracy 
##            0.4634848            0.7934632
forest_confmat$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.8065242            0.7847059            0.7966667 
##       Neg Pred Value            Precision               Recall 
##            0.7949940            0.7966667            0.8065242 
##                   F1           Prevalence       Detection Rate 
##            0.8015651            0.5112133            0.4123059 
## Detection Prevalence    Balanced Accuracy 
##            0.5175388            0.7956150
ROC & AUC Comparison

ROC

# ROC curve
plot(perf_naive_roc, main="Naive Bayes ROC Curve")

plot(perf_tree_roc, main="Decision Tree ROC Curve")

plot(perf_forest_roc, main="Random Forest ROC Curve")

AUC

# AUC value
naive_auc@y.values
## [[1]]
## [1] 0.8537365
tree_auc@y.values
## [[1]]
## [1] 0.7934632
forest_auc@y.values
## [[1]]
## [1] 0.795615

The random forest has the best accuracy here, edging out the decision tree; the naive Bayes AUC is not directly comparable, since it was computed from predicted probabilities while the tree and forest curves were built from hard class labels. Next, let’s use the random forest to predict.

Decision

For the prediction submission I will use the random forest, since it gives the best result on the validation data. But before I apply it to the test data, I want to improve the model.

Improvement

forest_spaceship
## Ranger result
## 
## Call:
##  ranger(formula = formula_topf, data = train_exp_learn, importance = "impurity",      min.node.size = 1, seed = 123, num.trees = 10000) 
## 
## Type:                             Classification 
## Number of trees:                  10000 
## Sample size:                      6954 
## Number of independent variables:  16 
## Mtry:                             4 
## Target node size:                 1 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error:             20.72 %

Let’s increase the number of trees and mtry, and set oob.error to FALSE to save computation time.

forest_spacecruise <- ranger(formula = formula_topf,
                             num.trees = 20000,
                             mtry = 10,
                             data = train_exp_learn, 
                             importance = "impurity",      
                             min.node.size = 1, 
                             seed = 123, 
                             oob.error = FALSE) 
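
If we wanted to pick mtry less arbitrarily, a small sketch (my own addition, with fewer trees per fit to keep it quick) could compare the out-of-bag errors over a grid:

oob_by_mtry <- sapply(c(4, 6, 8, 10, 12), function(m) {
  ranger(formula = formula_topf, data = train_exp_learn,
         num.trees = 500, mtry = m, seed = 123)$prediction.error # OOB error for each mtry
})
oob_by_mtry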

Prediction

Predict.

forest_submission_pred <- predict(forest_spacecruise, test_expanded)
test_expanded$Transported <- 
  ifelse(forest_submission_pred$predictions == 1, "True", "False")

test_submission <- test_expanded %>% 
  select(passenger_id, Transported) %>% 
  rename(PassengerId = passenger_id)
test_submission[1:10,]
## # A tibble: 10 x 2
##    PassengerId Transported
##    <fct>       <chr>      
##  1 0013_01     False      
##  2 0018_01     False      
##  3 0019_01     True       
##  4 0021_01     True       
##  5 0023_01     True       
##  6 0027_01     False      
##  7 0029_01     True       
##  8 0032_01     True       
##  9 0032_02     True       
## 10 0033_01     True

Save the submission to CSV.

write_csv(x = test_submission, file = "husada_random_forest_space_titanic_submission.csv")

Result

knitr::include_graphics("random_forest_result.JPG")