In the previous post, I predicted which Spaceship Titanic passengers were transported to an alternate dimension using logistic regression and KNN. This time, I intend to do the same with different tools.
library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
library(janitor)##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(DataExplorer)
library(missRanger)
library(performance)
library(see)
library(class)
library(caret)## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(gtools)
library(ROCR)##
## Attaching package: 'ROCR'
## The following object is masked from 'package:performance':
##
## performance
library(car)## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:gtools':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(broom)
library(earth) # mars## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
library(e1071) # bayes##
## Attaching package: 'e1071'
## The following object is masked from 'package:gtools':
##
## permutations
library(partykit) # decision tree## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(ranger) # fast random forest
train <- read_csv("train.csv")## Rows: 8693 Columns: 14
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (3): CryoSleep, VIP, Transported
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv("test.csv")## Rows: 4277 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (2): CryoSleep, VIP
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(train, 3)## # A tibble: 3 x 14
## PassengerId HomePlanet CryoSleep Cabin Destination Age VIP RoomService
## <chr> <chr> <lgl> <chr> <chr> <dbl> <lgl> <dbl>
## 1 0001_01 Europa FALSE B/0/P TRAPPIST-1e 39 FALSE 0
## 2 0002_01 Earth FALSE F/0/S TRAPPIST-1e 24 FALSE 109
## 3 0003_01 Europa FALSE A/0/S TRAPPIST-1e 58 TRUE 43
## # ... with 6 more variables: FoodCourt <dbl>, ShoppingMall <dbl>, Spa <dbl>,
## # VRDeck <dbl>, Name <chr>, Transported <lgl>
head(test, 3)## # A tibble: 3 x 13
## PassengerId HomePlanet CryoSleep Cabin Destination Age VIP RoomService
## <chr> <chr> <lgl> <chr> <chr> <dbl> <lgl> <dbl>
## 1 0013_01 Earth TRUE G/3/S TRAPPIST-1e 27 FALSE 0
## 2 0018_01 Earth FALSE F/4/S TRAPPIST-1e 19 FALSE 0
## 3 0019_01 Europa TRUE C/0/S 55 Cancri e 31 FALSE 0
## # ... with 5 more variables: FoodCourt <dbl>, ShoppingMall <dbl>, Spa <dbl>,
## # VRDeck <dbl>, Name <chr>
To understand and process the data as a whole, we first need to join the two sets together.
whole <- bind_rows(train, test)
Next, clean the column names.
whole <- whole %>% clean_names()
str(whole)## spec_tbl_df [12,970 x 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ passenger_id : chr [1:12970] "0001_01" "0002_01" "0003_01" "0003_02" ...
## $ home_planet : chr [1:12970] "Europa" "Earth" "Europa" "Europa" ...
## $ cryo_sleep : logi [1:12970] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ cabin : chr [1:12970] "B/0/P" "F/0/S" "A/0/S" "A/0/S" ...
## $ destination : chr [1:12970] "TRAPPIST-1e" "TRAPPIST-1e" "TRAPPIST-1e" "TRAPPIST-1e" ...
## $ age : num [1:12970] 39 24 58 33 16 44 26 28 35 14 ...
## $ vip : logi [1:12970] FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ room_service : num [1:12970] 0 109 43 0 303 0 42 0 0 0 ...
## $ food_court : num [1:12970] 0 9 3576 1283 70 ...
## $ shopping_mall: num [1:12970] 0 25 0 371 151 0 3 0 17 0 ...
## $ spa : num [1:12970] 0 549 6715 3329 565 ...
## $ vr_deck : num [1:12970] 0 44 49 193 2 0 0 NA 0 0 ...
## $ name : chr [1:12970] "Maham Ofracculy" "Juanna Vines" "Altark Susent" "Solam Susent" ...
## $ transported : logi [1:12970] FALSE TRUE FALSE FALSE TRUE TRUE ...
## - attr(*, "spec")=
## .. cols(
## .. PassengerId = col_character(),
## .. HomePlanet = col_character(),
## .. CryoSleep = col_logical(),
## .. Cabin = col_character(),
## .. Destination = col_character(),
## .. Age = col_double(),
## .. VIP = col_logical(),
## .. RoomService = col_double(),
## .. FoodCourt = col_double(),
## .. ShoppingMall = col_double(),
## .. Spa = col_double(),
## .. VRDeck = col_double(),
## .. Name = col_character(),
## .. Transported = col_logical()
## .. )
## - attr(*, "problems")=<externalptr>
Let’s check for missing values.
plot_missing(whole)
The proportion of missing values is small enough that they could simply be omitted, but to be safe, I will impute them later. For now, let's count the unique values in each column to decide whether some variables need to be converted into factors.
sapply(whole, function(x) n_distinct(x))## passenger_id home_planet cryo_sleep cabin destination
## 12970 4 3 9826 4
## age vip room_service food_court shopping_mall
## 81 3 1579 1954 1368
## spa vr_deck name transported
## 1680 1643 12630 3
str(whole)## spec_tbl_df [12,970 x 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ passenger_id : chr [1:12970] "0001_01" "0002_01" "0003_01" "0003_02" ...
## $ home_planet : chr [1:12970] "Europa" "Earth" "Europa" "Europa" ...
## $ cryo_sleep : logi [1:12970] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ cabin : chr [1:12970] "B/0/P" "F/0/S" "A/0/S" "A/0/S" ...
## $ destination : chr [1:12970] "TRAPPIST-1e" "TRAPPIST-1e" "TRAPPIST-1e" "TRAPPIST-1e" ...
## $ age : num [1:12970] 39 24 58 33 16 44 26 28 35 14 ...
## $ vip : logi [1:12970] FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ room_service : num [1:12970] 0 109 43 0 303 0 42 0 0 0 ...
## $ food_court : num [1:12970] 0 9 3576 1283 70 ...
## $ shopping_mall: num [1:12970] 0 25 0 371 151 0 3 0 17 0 ...
## $ spa : num [1:12970] 0 549 6715 3329 565 ...
## $ vr_deck : num [1:12970] 0 44 49 193 2 0 0 NA 0 0 ...
## $ name : chr [1:12970] "Maham Ofracculy" "Juanna Vines" "Altark Susent" "Solam Susent" ...
## $ transported : logi [1:12970] FALSE TRUE FALSE FALSE TRUE TRUE ...
## - attr(*, "spec")=
## .. cols(
## .. PassengerId = col_character(),
## .. HomePlanet = col_character(),
## .. CryoSleep = col_logical(),
## .. Cabin = col_character(),
## .. Destination = col_character(),
## .. Age = col_double(),
## .. VIP = col_logical(),
## .. RoomService = col_double(),
## .. FoodCourt = col_double(),
## .. ShoppingMall = col_double(),
## .. Spa = col_double(),
## .. VRDeck = col_double(),
## .. Name = col_character(),
## .. Transported = col_logical()
## .. )
## - attr(*, "problems")=<externalptr>
Since cryo_sleep and vip are already logical, we only need to convert home_planet and destination from character to factor.
whole <- whole %>%
mutate(home_planet = as.factor(home_planet),
destination = as.factor(destination)
)
Impute the missing values with missRanger, using a formula that targets every column except passenger_id and transported. For the predictors, we also have to leave out any unordered factor with more than 53 levels.
init_imp =
home_planet + cryo_sleep + cabin + destination + age + vip + room_service + food_court + shopping_mall + spa + vr_deck + name ~ home_planet + cryo_sleep + destination + vip + room_service + food_court + shopping_mall + spa + vr_deck
whole_init_imp <- missRanger(data = whole,
formula = init_imp,
pmm.k = 3, # to round imputation result with predictive mean matching
verbose = 2,
num.trees = 50, # tweaks to make things faster
sample.fraction = 0.1, # tweaks to make things faster
splitrule = "extratrees", # tweaks to make things faster
max.depth = 6, # tweaks to make things faster
min.node.size = 10000, # tweaks to make things faster
maxiter = 2)# tweaks to make things faster##
## Missing value imputation by random forests
##
## Variables to impute: home_planet, cryo_sleep, cabin, destination, age, vip, room_service, food_court, shopping_mall, spa, vr_deck, name
## Variables used to impute: home_planet, cryo_sleep, destination, vip, room_service, food_court, shopping_mall, spa, vr_deck
## rm_srv vr_dck age dstntn spa hm_pln fd_crt name vip cabin shppn_ cry_sl
## iter 1: 1.0000 1.0001 1.0001 0.3013 1.0001 0.4587 1.0001 1.0000 0.0215 1.0000 1.0000 0.3618
## iter 2: 1.0001 1.0001 1.0002 0.3013 1.0000 0.4587 1.0000 1.0000 0.0215 0.9998 1.0001 0.3618
colSums(is.na(whole_init_imp))## passenger_id home_planet cryo_sleep cabin destination
## 0 0 0 0 0
## age vip room_service food_court shopping_mall
## 0 0 0 0 0
## spa vr_deck name transported
## 0 0 0 4277
We can extract the group and the passenger's order within that group from passenger_id. Then we can extract the deck, number, and side from the cabin code. Furthermore, we can derive the average, sum, IQR, and other statistics from how each passenger spent their money. We can also split name into first and last names.
lux <- c("room_service", "food_court", "shopping_mall", "spa", "vr_deck")
whole_init_exp <- whole_init_imp %>%
mutate(passenger_id = as.factor(passenger_id), # turn into factor so it doesn't get removed later
passenger_group = str_sub(passenger_id, 1, 4),
passenger_number_within_group =
as.factor(str_sub(passenger_id, 6, 7)),
cabin_deck = as.factor(str_sub(cabin, 1, 1)),
cabin_number = as.factor(str_sub(cabin, 3, 3)),
cabin_side = as.factor(str_sub(cabin, -1))) %>%
separate(col = name,
sep = " ",
into = c("name_first","name_last")) %>%
mutate(name_last = as.factor(name_last)) %>%
rowwise() %>%
mutate(lux_spent_average = mean(c_across(cols = all_of(lux))),
lux_spent_sum = sum(c_across(cols = all_of(lux))),
lux_spent_median = median(c_across(cols = all_of(lux))),
lux_spent_var = var(c_across(cols = all_of(lux))),
lux_spent_sd = sd(c_across(cols = all_of(lux))),
lux_spent_iqr = IQR(c_across(cols = all_of(lux)))
) %>%
ungroup()
head(whole_init_exp)[,]## # A tibble: 6 x 26
## passenger_id home_planet cryo_sleep cabin destination age vip room_service
## <fct> <fct> <lgl> <chr> <fct> <dbl> <lgl> <dbl>
## 1 0001_01 Europa FALSE B/0/P TRAPPIST-1e 39 FALSE 0
## 2 0002_01 Earth FALSE F/0/S TRAPPIST-1e 24 FALSE 109
## 3 0003_01 Europa FALSE A/0/S TRAPPIST-1e 58 TRUE 43
## 4 0003_02 Europa FALSE A/0/S TRAPPIST-1e 33 FALSE 0
## 5 0004_01 Earth FALSE F/1/S TRAPPIST-1e 16 FALSE 303
## 6 0005_01 Earth FALSE F/0/P PSO J318.5~ 44 FALSE 0
## # ... with 18 more variables: food_court <dbl>, shopping_mall <dbl>, spa <dbl>,
## # vr_deck <dbl>, name_first <chr>, name_last <fct>, transported <lgl>,
## # passenger_group <chr>, passenger_number_within_group <fct>,
## # cabin_deck <fct>, cabin_number <fct>, cabin_side <fct>,
## # lux_spent_average <dbl>, lux_spent_sum <dbl>, lux_spent_median <dbl>,
## # lux_spent_var <dbl>, lux_spent_sd <dbl>, lux_spent_iqr <dbl>
Remove the character-class columns from the data frame. The feature passenger_id is kept (as a factor) for the test submission later.
whole_init_exp <- whole_init_exp %>% select(-where(is.character)) # passenger id is kept
head(whole_init_exp,2)## # A tibble: 2 x 23
## passenger_id home_planet cryo_sleep destination age vip room_service
## <fct> <fct> <lgl> <fct> <dbl> <lgl> <dbl>
## 1 0001_01 Europa FALSE TRAPPIST-1e 39 FALSE 0
## 2 0002_01 Earth FALSE TRAPPIST-1e 24 FALSE 109
## # ... with 16 more variables: food_court <dbl>, shopping_mall <dbl>, spa <dbl>,
## # vr_deck <dbl>, name_last <fct>, transported <lgl>,
## # passenger_number_within_group <fct>, cabin_deck <fct>, cabin_number <fct>,
## # cabin_side <fct>, lux_spent_average <dbl>, lux_spent_sum <dbl>,
## # lux_spent_median <dbl>, lux_spent_var <dbl>, lux_spent_sd <dbl>,
## # lux_spent_iqr <dbl>
Let’s see the expanded dataset’s summary.
plot_intro(whole_init_exp)
Let's visualize the discrete features that have fewer than 50 categories.
plot_bar(whole_init_exp,
by = "transported",
by_position = "fill",
nrow = 2,
ncol = 2)## 2 columns ignored with more than 50 categories.
## passenger_id: 12970 categories
## name_last: 2406 categories
From the quick peek above, VIP passengers who slept in cryo were much more likely to not be transported away, and the same goes for having a cabin on deck E or cabin number 0.
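To put rough numbers on what the bar charts suggest, here is a small sketch (output not shown) that counts transport rates by cryo_sleep and vip on the training rows only:
# Transport rate by cryo_sleep and vip, training rows only
whole_init_exp %>%
  filter(!is.na(transported)) %>%
  group_by(cryo_sleep, vip) %>%
  summarise(n = n(),
            transported_rate = mean(transported),
            .groups = "drop")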
Then we take a look at the continuous numbers.
plot_boxplot(whole_init_exp,
by = "transported",
nrow = 2,
ncol = 2)
Passengers who spent their money in crowded open spaces like food_court and shopping_mall were more likely to vanish. Meanwhile, spending money in more private settings like room_service, spa, and vr_deck improved their chance of not being taken away to some alternate dimension. Was there something contagious about the way the vanishing happened?
They mostly don't look normally distributed, but to be sure, let's examine them with QQ plots.
plot_qq(whole_init_exp,
by = "transported",
nrow = 2,
ncol = 2
)
Only age bears any resemblance to a normal distribution. The rest are skewed.
Now we separate the combined data back into train and test sets, then split the train data into learning and validation parts:
train_expanded <- whole_init_exp %>% filter(!is.na(transported))
test_expanded <- whole_init_exp %>% filter(is.na(transported))
RNGkind(sample.kind = "Rounding")## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(1)
idx <- sample(nrow(train_expanded),
0.8 * nrow(train_expanded))
train_exp_learn <- train_expanded[idx,-1]
train_exp_val <- train_expanded[-idx,-1]
Let's check the target balance.
prop.table(table(train_exp_learn$transported))##
## FALSE TRUE
## 0.4982744 0.5017256
Cool, it’s already balanced.
I'm going to use the earth library to look for important variables. The package implements a non-parametric regression technique called multivariate adaptive regression splines (MARS), which automatically models nonlinearities and interactions between variables.
mars_train_learn <- earth(transported ~ .,
data=train_exp_learn) # build model
ev <- evimp(mars_train_learn)
ev## nsubsets gcv rss
## lux_spent_average 23 100.0 100.0
## food_court 22 68.0 68.7
## shopping_mall 21 60.7 61.5
## lux_spent_iqr 20 53.5 54.5
## cabin_deckG 19 41.2 42.6
## cryo_sleepTRUE 18 36.1 37.7
## cabin_sideS 17 32.7 34.4
## home_planetEuropa 14 26.5 28.2
## cabin_number9 13 23.4 25.3
## age 12 21.5 23.4
## lux_spent_median 10 17.9 19.8
## cabin_deckE 9 16.1 18.0
## home_planetMars 5 9.7 11.5
## destinationTRAPPIST-1e 5 9.7 11.5
## name_lastMcbriddley 4 8.5 10.1
## cabin_deckC 2 5.1 6.4
MARS's automated feature selection above ranks the important features by generalized cross-validation (gcv) and residual sum of squares (rss).
topf <- c("lux_spent_average","food_court","lux_spent_iqr","shopping_mall","cryo_sleep","home_planet","lux_spent_sum","cabin_side","lux_spent_median","cabin_number","age","cabin_deck","destination","name_last","room_service","spa")
formula_topf <- as.formula(paste("transported ~ ", paste(topf, collapse = "+")))
formula_topf## transported ~ lux_spent_average + food_court + lux_spent_iqr +
## shopping_mall + cryo_sleep + home_planet + lux_spent_sum +
## cabin_side + lux_spent_median + cabin_number + age + cabin_deck +
## destination + name_last + room_service + spa
Naive Bayes assumes a dependence relationship between the response variable and the explanatory variables. The 'naive' part of its name refers to the assumptions that the predictors are independent of each other and weighted equally. Naive Bayes is weakened when there are zero counts in the data, so we need to handle that with Laplace smoothing.
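As a toy illustration of the zero-count problem (made-up data, not from the passenger set), compare the conditional probability tables with and without smoothing:
# Toy data: level "c" of x never occurs together with y = "yes"
toy <- data.frame(x = factor(c("a", "a", "b", "b", "c", "c")),
                  y = factor(c("yes", "yes", "yes", "no", "no", "no")))
# Without smoothing, P(x = "c" | y = "yes") is exactly 0, so any new
# observation with x = "c" gets a zero posterior for class "yes"
naiveBayes(y ~ x, data = toy, laplace = 0)$tables$x
# Laplace smoothing adds a pseudo-count to every cell, so the zero disappears
naiveBayes(y ~ x, data = toy, laplace = 1)$tables$x
In the fit below I use laplace = 5, which applies the same idea more aggressively.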
Fit the model.
naive_spaceship <- naiveBayes(formula_topf, train_exp_learn, laplace=5)
Predict the result.
naive_result <- train_exp_val %>%
mutate(pred = predict(naive_spaceship, train_exp_val, type = "class")) %>%
select(transported, pred)
naive_result[1:20,]## # A tibble: 20 x 2
## transported pred
## <lgl> <fct>
## 1 TRUE TRUE
## 2 FALSE FALSE
## 3 FALSE TRUE
## 4 TRUE TRUE
## 5 FALSE TRUE
## 6 FALSE TRUE
## 7 FALSE TRUE
## 8 FALSE FALSE
## 9 FALSE FALSE
## 10 FALSE FALSE
## 11 FALSE FALSE
## 12 TRUE TRUE
## 13 FALSE TRUE
## 14 FALSE FALSE
## 15 TRUE TRUE
## 16 TRUE TRUE
## 17 FALSE FALSE
## 18 TRUE TRUE
## 19 TRUE TRUE
## 20 TRUE TRUE
Let’s see the confusion matrix.
naive_confmat <- confusionMatrix(data = naive_result$pred, reference = as.factor(naive_result$transported), positive = "TRUE")
naive_confmat## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 341 46
## TRUE 509 843
##
## Accuracy : 0.6809
## 95% CI : (0.6584, 0.7027)
## No Information Rate : 0.5112
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3537
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9483
## Specificity : 0.4012
## Pos Pred Value : 0.6235
## Neg Pred Value : 0.8811
## Prevalence : 0.5112
## Detection Rate : 0.4848
## Detection Prevalence : 0.7775
## Balanced Accuracy : 0.6747
##
## 'Positive' Class : TRUE
##
The accuracy doesn't look so good, but the sensitivity is high. In this case of predicting calamity, greater sensitivity at the cost of precision is perhaps preferable; we can't be too careful.
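To spell out where those two numbers come from, here is the arithmetic on the counts in the confusion matrix above:
# Counts from the Naive Bayes confusion matrix (positive class = TRUE)
tp <- 843  # predicted TRUE, actually TRUE
fn <- 46   # predicted FALSE, actually TRUE
fp <- 509  # predicted TRUE, actually FALSE
tp / (tp + fn)  # sensitivity (recall): ~0.948
tp / (tp + fp)  # precision (positive predictive value): ~0.624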
Next, we have to see the ROC and AUC.
naive_result <- naive_result %>%
mutate(num_pred = predict(naive_spaceship, train_exp_val, type = "raw"),
num_trans = ifelse(transported == TRUE, 1, 0))
naive_roc <- prediction(predictions = naive_result$num_pred[,"TRUE"],
labels = naive_result[,"num_trans"])
# ROC curve
perf_naive_roc <- ROCR::performance(
prediction.obj = naive_roc,
measure = "tpr",
x.measure = "fpr")
plot(perf_naive_roc, main="Naive Bayes ROC Curve")
abline(0,1,lty = 8)
naive_auc <- ROCR::performance(prediction.obj = naive_roc, measure = "auc")
naive_auc@y.values## [[1]]
## [1] 0.8537365
A decision tree is grown from the data by recursively splitting it, choosing splits that reduce entropy and thereby maximize information gain.
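For intuition, the information gain of a single split can be computed by hand as the drop in entropy. A minimal sketch follows; it is a generic illustration, not necessarily the exact splitting criterion partykit's ctree uses internally:
# Shannon entropy of a vector of class labels
entropy <- function(y) {
  p <- table(y) / length(y)
  -sum(p * log2(p))
}
# Information gain from splitting the labels y by a logical condition
info_gain <- function(y, split) {
  n <- length(y)
  entropy(y) -
    sum(split) / n * entropy(y[split]) -
    sum(!split) / n * entropy(y[!split])
}
# Example: how much does knowing cryo_sleep tell us about transported?
info_gain(train_exp_learn$transported, train_exp_learn$cryo_sleep)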
Fit a tree. The features transported and cryo_sleep are converted to factors so the tree can be fitted.
# tree_spaceship <- ctree(formula_topf,
# data = train_exp_learn %>% mutate(transported = as.factor(transported),
# cryo_sleep = as.factor(cryo_sleep))
# )
# saveRDS(object = tree_spaceship, file = "tree_spaceship.RDS")
Whoa, the training took a while, so I saved the model to an RDS file, commented out the code above, and now read the RDS back in. This way I don't have to re-train the tree every time, because it's so time consuming.
tree_spaceship <- readRDS("tree_spaceship.RDS")
The tree:
tree_spaceship##
## Model formula:
## transported ~ lux_spent_average + food_court + lux_spent_iqr +
## shopping_mall + cryo_sleep + home_planet + lux_spent_sum +
## cabin_side + lux_spent_median + cabin_number + age + cabin_deck +
## destination + name_last + room_service + spa
##
## Fitted party:
## [1] root
## | [2] cryo_sleep in FALSE
## | | [3] food_court <= 2165
## | | | [4] shopping_mall <= 627
## | | | | [5] lux_spent_average <= 5.2
## | | | | | [6] cabin_deck in A, B, C, F
## | | | | | | [7] age <= 30
## | | | | | | | [8] home_planet in Earth, Europa: TRUE (n = 42, err = 4.8%)
## | | | | | | | [9] home_planet in Mars: TRUE (n = 65, err = 0.0%)
## | | | | | | [10] age > 30: TRUE (n = 31, err = 41.9%)
## | | | | | [11] cabin_deck in D, E, G
## | | | | | | [12] age <= 15: TRUE (n = 245, err = 45.3%)
## | | | | | | [13] age > 15: FALSE (n = 54, err = 22.2%)
## | | | | [14] lux_spent_average > 5.2
## | | | | | [15] food_court <= 499
## | | | | | | [16] lux_spent_sum <= 1483
## | | | | | | | [17] food_court <= 165
## | | | | | | | | [18] destination in 55 Cancri e, PSO J318.5-22
## | | | | | | | | | [19] cabin_deck in D, F, G: FALSE (n = 349, err = 18.9%)
## | | | | | | | | | [20] cabin_deck in E
## | | | | | | | | | | [21] lux_spent_sum <= 722: TRUE (n = 18, err = 0.0%)
## | | | | | | | | | | [22] lux_spent_sum > 722: FALSE (n = 14, err = 35.7%)
## | | | | | | | | [23] destination in TRAPPIST-1e: FALSE (n = 1062, err = 15.6%)
## | | | | | | | [24] food_court > 165
## | | | | | | | | [25] shopping_mall <= 414: FALSE (n = 214, err = 26.6%)
## | | | | | | | | [26] shopping_mall > 414: TRUE (n = 18, err = 22.2%)
## | | | | | | [27] lux_spent_sum > 1483: FALSE (n = 632, err = 6.0%)
## | | | | | [28] food_court > 499
## | | | | | | [29] lux_spent_iqr <= 451
## | | | | | | | [30] cabin_side in P
## | | | | | | | | [31] lux_spent_median <= 62: FALSE (n = 162, err = 46.9%)
## | | | | | | | | [32] lux_spent_median > 62: FALSE (n = 18, err = 5.6%)
## | | | | | | | [33] cabin_side in S: TRUE (n = 189, err = 34.4%)
## | | | | | | [34] lux_spent_iqr > 451
## | | | | | | | [35] lux_spent_average <= 855
## | | | | | | | | [36] cabin_side in P: FALSE (n = 94, err = 17.0%)
## | | | | | | | | [37] cabin_side in S
## | | | | | | | | | [38] cabin_deck in A, B, D, E, F, G: FALSE (n = 75, err = 24.0%)
## | | | | | | | | | [39] cabin_deck in C: TRUE (n = 18, err = 11.1%)
## | | | | | | | [40] lux_spent_average > 855: FALSE (n = 106, err = 2.8%)
## | | | [41] shopping_mall > 627
## | | | | [42] lux_spent_iqr <= 226
## | | | | | [43] shopping_mall <= 1540: TRUE (n = 284, err = 39.4%)
## | | | | | [44] shopping_mall > 1540: TRUE (n = 59, err = 3.4%)
## | | | | [45] lux_spent_iqr > 226
## | | | | | [46] shopping_mall <= 2975: FALSE (n = 276, err = 30.1%)
## | | | | | [47] shopping_mall > 2975: TRUE (n = 25, err = 28.0%)
## | | [48] food_court > 2165
## | | | [49] lux_spent_iqr <= 2192
## | | | | [50] lux_spent_iqr <= 1017
## | | | | | [51] lux_spent_median <= 168: TRUE (n = 162, err = 2.5%)
## | | | | | [52] lux_spent_median > 168: TRUE (n = 48, err = 20.8%)
## | | | | [53] lux_spent_iqr > 1017
## | | | | | [54] food_court <= 4155: FALSE (n = 44, err = 34.1%)
## | | | | | [55] food_court > 4155: TRUE (n = 42, err = 16.7%)
## | | | [56] lux_spent_iqr > 2192
## | | | | [57] shopping_mall <= 552
## | | | | | [58] food_court <= 4089: FALSE (n = 48, err = 2.1%)
## | | | | | [59] food_court > 4089
## | | | | | | [60] lux_spent_iqr <= 4103: TRUE (n = 25, err = 48.0%)
## | | | | | | [61] lux_spent_iqr > 4103: FALSE (n = 35, err = 2.9%)
## | | | | [62] shopping_mall > 552: TRUE (n = 7, err = 42.9%)
## | [63] cryo_sleep in TRUE
## | | [64] cabin_deck in A, B, C, D, F
## | | | [65] lux_spent_average <= 118
## | | | | [66] home_planet in Earth: TRUE (n = 30, err = 23.3%)
## | | | | [67] home_planet in Europa, Mars: TRUE (n = 1116, err = 1.4%)
## | | | [68] lux_spent_average > 118
## | | | | [69] spa <= 29
## | | | | | [70] home_planet in Earth: FALSE (n = 14, err = 42.9%)
## | | | | | [71] home_planet in Europa, Mars: TRUE (n = 48, err = 14.6%)
## | | | | [72] spa > 29: FALSE (n = 20, err = 25.0%)
## | | [73] cabin_deck in E, G
## | | | [74] home_planet in Earth, Mars
## | | | | [75] cabin_side in P
## | | | | | [76] destination in 55 Cancri e, PSO J318.5-22: TRUE (n = 239, err = 33.1%)
## | | | | | [77] destination in TRAPPIST-1e: TRUE (n = 374, err = 47.6%)
## | | | | [78] cabin_side in S
## | | | | | [79] cabin_number in 0, 1, 2, 3, 4, 5
## | | | | | | [80] cabin_number in 0, 2, 3: FALSE (n = 101, err = 46.5%)
## | | | | | | [81] cabin_number in 1, 4, 5: TRUE (n = 325, err = 30.8%)
## | | | | | [82] cabin_number in 6, 7, 8, 9: TRUE (n = 166, err = 10.8%)
## | | | [83] home_planet in Europa: TRUE (n = 60, err = 6.7%)
##
## Number of inner nodes: 41
## Number of terminal nodes: 42
plot(tree_spaceship, type="simple")
Damn. The tree has grown into a maze. The advantage of a decision tree is supposed to be interpretability, but this? Perhaps we need to prune it later (a sketch of a shallower tree follows after the AUC below). If this tree performs better than the other models, it will earn our extra attention. Let's just use it to predict.
train_exp_val_fortreepredict <- train_exp_val %>% mutate(cryo_sleep = as.factor(cryo_sleep),
transported = as.factor(transported))
tree_result <- predict(object = tree_spaceship,
newdata = train_exp_val_fortreepredict,
type = "response")
Observe the confusion matrix.
tree_confmat <- confusionMatrix(data = as.factor(tree_result),
reference = as.factor(train_exp_val$transported))
tree_confmat## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 649 157
## TRUE 201 732
##
## Accuracy : 0.7941
## 95% CI : (0.7744, 0.8129)
## No Information Rate : 0.5112
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.5876
##
## Mcnemar's Test P-Value : 0.02305
##
## Sensitivity : 0.7635
## Specificity : 0.8234
## Pos Pred Value : 0.8052
## Neg Pred Value : 0.7846
## Prevalence : 0.4888
## Detection Rate : 0.3732
## Detection Prevalence : 0.4635
## Balanced Accuracy : 0.7935
##
## 'Positive' Class : FALSE
##
Proceed to the ROC curve and AUC.
tree_roc <- ROCR::prediction(predictions = as.vector(as.numeric(tree_result)),
labels = as.vector(as.numeric(train_exp_val$transported)))
perf_tree_roc <- ROCR::performance(
prediction.obj = tree_roc,
measure = "tpr",
x.measure = "fpr")
plot(perf_tree_roc, main="Decision Tree ROC Curve")
abline(0,1,lty = 8)
tree_auc <- ROCR::performance(prediction.obj = tree_roc, measure = "auc")
tree_auc@y.values## [[1]]
## [1] 0.7934632
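As mentioned above, one way to get a more readable tree would be to restrict its growth through ctree_control. This is only a sketch (not run here); maxdepth and mincriterion are the knobs I would start with:
# A shallower tree: cap the depth and require stronger evidence for each split
tree_small <- ctree(formula_topf,
                    data = train_exp_learn %>%
                      mutate(transported = as.factor(transported),
                             cryo_sleep = as.factor(cryo_sleep)),
                    control = ctree_control(maxdepth = 4, mincriterion = 0.99))
plot(tree_small, type = "simple")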
Fit the forest. This time I use ranger, because it is FAST.
forest_spaceship <- ranger(formula = formula_topf,
data = train_exp_learn,
importance = "impurity",
min.node.size = 1,
seed = 123,
num.trees = 10000
)
forest_spaceship## Ranger result
##
## Call:
## ranger(formula = formula_topf, data = train_exp_learn, importance = "impurity", min.node.size = 1, seed = 123, num.trees = 10000)
##
## Type: Classification
## Number of trees: 10000
## Sample size: 6954
## Number of independent variables: 16
## Mtry: 4
## Target node size: 1
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 20.72 %
Predict.
forest_val_pred <- predict(forest_spaceship, train_exp_val)
Validate with the confusion matrix.
forest_confmat <- confusionMatrix(data = as.factor(forest_val_pred$predictions),
reference = as.factor(as.numeric(train_exp_val$transported)),
positive = "1")
forest_confmat## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 667 172
## 1 183 717
##
## Accuracy : 0.7959
## 95% CI : (0.7761, 0.8146)
## No Information Rate : 0.5112
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5914
##
## Mcnemar's Test P-Value : 0.5956
##
## Sensitivity : 0.8065
## Specificity : 0.7847
## Pos Pred Value : 0.7967
## Neg Pred Value : 0.7950
## Prevalence : 0.5112
## Detection Rate : 0.4123
## Detection Prevalence : 0.5175
## Balanced Accuracy : 0.7956
##
## 'Positive' Class : 1
##
We continue to the ROC curve and AUC.
forest_roc <- ROCR::prediction(predictions = forest_val_pred$predictions,
labels = train_exp_val$transported)
perf_forest_roc <- ROCR::performance(
prediction.obj = forest_roc,
measure = "tpr",
x.measure = "fpr")
plot(perf_forest_roc, main="Random Forest ROC Curve")
abline(0,1,lty = 8)
forest_auc <- ROCR::performance(prediction.obj = forest_roc, measure = "auc")
forest_auc@y.values## [[1]]
## [1] 0.795615
Now let's compare the three models side by side. Accuracy first:
naive_confmat$overall[1]## Accuracy
## 0.6808511
tree_confmat$overall[1]## Accuracy
## 0.7941346
forest_confmat$overall[1]## Accuracy
## 0.7958597
naive_confmat$byClass## Sensitivity Specificity Pos Pred Value
## 0.9482565 0.4011765 0.6235207
## Neg Pred Value Precision Recall
## 0.8811370 0.6235207 0.9482565
## F1 Prevalence Detection Rate
## 0.7523427 0.5112133 0.4847614
## Detection Prevalence Balanced Accuracy
## 0.7774583 0.6747165
tree_confmat$byClass## Sensitivity Specificity Pos Pred Value
## 0.7635294 0.8233971 0.8052109
## Neg Pred Value Precision Recall
## 0.7845659 0.8052109 0.7635294
## F1 Prevalence Detection Rate
## 0.7838164 0.4887867 0.3732030
## Detection Prevalence Balanced Accuracy
## 0.4634848 0.7934632
forest_confmat$byClass## Sensitivity Specificity Pos Pred Value
## 0.8065242 0.7847059 0.7966667
## Neg Pred Value Precision Recall
## 0.7949940 0.7966667 0.8065242
## F1 Prevalence Detection Rate
## 0.8015651 0.5112133 0.4123059
## Detection Prevalence Balanced Accuracy
## 0.5175388 0.7956150
ROC
# ROC curve
plot(perf_naive_roc, main="Naive Bayes ROC Curve")
plot(perf_tree_roc, main="Decision Tree ROC Curve")
plot(perf_forest_roc, main="Random Forest ROC Curve")
AUC
# AUC value
naive_auc@y.values## [[1]]
## [1] 0.8537365
tree_auc@y.values## [[1]]
## [1] 0.7934632
forest_auc@y.values## [[1]]
## [1] 0.795615
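To make the comparison easier to scan, the accuracies and AUC values can be gathered into one small table using the objects already created:
# Side-by-side accuracy and AUC for the three models
tibble(model    = c("Naive Bayes", "Decision Tree", "Random Forest"),
       accuracy = c(naive_confmat$overall["Accuracy"],
                    tree_confmat$overall["Accuracy"],
                    forest_confmat$overall["Accuracy"]),
       auc      = c(naive_auc@y.values[[1]],
                    tree_auc@y.values[[1]],
                    forest_auc@y.values[[1]]))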
The random forest has the best overall performance here, so I will use it for the prediction submission. But before applying it to the test data, I want to improve the model.
forest_spaceship## Ranger result
##
## Call:
## ranger(formula = formula_topf, data = train_exp_learn, importance = "impurity", min.node.size = 1, seed = 123, num.trees = 10000)
##
## Type: Classification
## Number of trees: 10000
## Sample size: 6954
## Number of independent variables: 16
## Mtry: 4
## Target node size: 1
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 20.72 %
Let's increase the number of trees and mtry, and set oob.error to FALSE to save computation time.
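Before settling on mtry = 10, a quick check of a few mtry values against the validation split could guide the choice. This is a hypothetical sketch (not run here), using a small num.trees to keep it fast:
# Validation accuracy for a handful of mtry values
for (m in c(4, 6, 8, 10, 12)) {
  fit <- ranger(formula = formula_topf,
                data = train_exp_learn,
                num.trees = 500,
                mtry = m,
                seed = 123)
  pred <- predict(fit, train_exp_val)$predictions
  cat("mtry =", m,
      "validation accuracy =", round(mean(pred == train_exp_val$transported), 4), "\n")
}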
forest_spacecruise <- ranger(formula = formula_topf,
num.trees = 20000,
mtry = 10,
data = train_exp_learn,
importance = "impurity",
min.node.size = 1,
seed = 123,
oob.error = FALSE)
Predict.
forest_submission_pred <- predict(forest_spacecruise, test_expanded)
test_expanded$Transported <-
ifelse(forest_submission_pred$predictions==1, "True", "False")
test_submission <- test_expanded %>%
select(passenger_id, Transported) %>%
rename(PassengerId = passenger_id)
test_submission[1:10,]## # A tibble: 10 x 2
## PassengerId Transported
## <fct> <chr>
## 1 0013_01 False
## 2 0018_01 False
## 3 0019_01 True
## 4 0021_01 True
## 5 0023_01 True
## 6 0027_01 False
## 7 0029_01 True
## 8 0032_01 True
## 9 0032_02 True
## 10 0033_01 True
Save submission to csv.
write_csv(x = test_submission, file = "husada_random_forest_space_titanic_submission.csv")
knitr::include_graphics("random_forest_result.JPG")