Data Preparation

Introduction

In 2912, space-faring humanity experienced a terrible calamity. On an emigration voyage to 55 Cancri e, the Spaceship Titanic collided with an anomaly hidden in a dust cloud. The accident caused half of the ship’s passengers to vanish, transported to an alternate dimension.

The task is to predict whether or not a passenger was transported to the alternate dimension.

Import Data

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readr)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(DataExplorer)
library(missRanger)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(performance)
library(see)
library(Boruta)
library(dlookr)
## 
## Attaching package: 'dlookr'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:base':
## 
##     transform
library(class)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(gtools)
library(ROCR)
## 
## Attaching package: 'ROCR'
## The following object is masked from 'package:performance':
## 
##     performance
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:gtools':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(broom)
train <- read_csv("train.csv")
## Rows: 8693 Columns: 14
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (3): CryoSleep, VIP, Transported
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv("test.csv")
## Rows: 4277 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (2): CryoSleep, VIP
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(train, 3)
## # A tibble: 3 x 14
##   PassengerId HomePlanet CryoSleep Cabin Destination   Age VIP   RoomService
##   <chr>       <chr>      <lgl>     <chr> <chr>       <dbl> <lgl>       <dbl>
## 1 0001_01     Europa     FALSE     B/0/P TRAPPIST-1e    39 FALSE           0
## 2 0002_01     Earth      FALSE     F/0/S TRAPPIST-1e    24 FALSE         109
## 3 0003_01     Europa     FALSE     A/0/S TRAPPIST-1e    58 TRUE           43
## # ... with 6 more variables: FoodCourt <dbl>, ShoppingMall <dbl>, Spa <dbl>,
## #   VRDeck <dbl>, Name <chr>, Transported <lgl>
head(test, 3)
## # A tibble: 3 x 13
##   PassengerId HomePlanet CryoSleep Cabin Destination   Age VIP   RoomService
##   <chr>       <chr>      <lgl>     <chr> <chr>       <dbl> <lgl>       <dbl>
## 1 0013_01     Earth      TRUE      G/3/S TRAPPIST-1e    27 FALSE           0
## 2 0018_01     Earth      FALSE     F/4/S TRAPPIST-1e    19 FALSE           0
## 3 0019_01     Europa     TRUE      C/0/S 55 Cancri e    31 FALSE           0
## # ... with 5 more variables: FoodCourt <dbl>, ShoppingMall <dbl>, Spa <dbl>,
## #   VRDeck <dbl>, Name <chr>

To understand and process the data as a whole, we first need to join the two sets together.

whole <- bind_rows(train, test)

Initial Cleansing

First, clean the column names.

whole <- whole %>% clean_names()
str(whole)
## spec_tbl_df [12,970 x 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ passenger_id : chr [1:12970] "0001_01" "0002_01" "0003_01" "0003_02" ...
##  $ home_planet  : chr [1:12970] "Europa" "Earth" "Europa" "Europa" ...
##  $ cryo_sleep   : logi [1:12970] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ cabin        : chr [1:12970] "B/0/P" "F/0/S" "A/0/S" "A/0/S" ...
##  $ destination  : chr [1:12970] "TRAPPIST-1e" "TRAPPIST-1e" "TRAPPIST-1e" "TRAPPIST-1e" ...
##  $ age          : num [1:12970] 39 24 58 33 16 44 26 28 35 14 ...
##  $ vip          : logi [1:12970] FALSE FALSE TRUE FALSE FALSE FALSE ...
##  $ room_service : num [1:12970] 0 109 43 0 303 0 42 0 0 0 ...
##  $ food_court   : num [1:12970] 0 9 3576 1283 70 ...
##  $ shopping_mall: num [1:12970] 0 25 0 371 151 0 3 0 17 0 ...
##  $ spa          : num [1:12970] 0 549 6715 3329 565 ...
##  $ vr_deck      : num [1:12970] 0 44 49 193 2 0 0 NA 0 0 ...
##  $ name         : chr [1:12970] "Maham Ofracculy" "Juanna Vines" "Altark Susent" "Solam Susent" ...
##  $ transported  : logi [1:12970] FALSE TRUE FALSE FALSE TRUE TRUE ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   PassengerId = col_character(),
##   ..   HomePlanet = col_character(),
##   ..   CryoSleep = col_logical(),
##   ..   Cabin = col_character(),
##   ..   Destination = col_character(),
##   ..   Age = col_double(),
##   ..   VIP = col_logical(),
##   ..   RoomService = col_double(),
##   ..   FoodCourt = col_double(),
##   ..   ShoppingMall = col_double(),
##   ..   Spa = col_double(),
##   ..   VRDeck = col_double(),
##   ..   Name = col_character(),
##   ..   Transported = col_logical()
##   .. )
##  - attr(*, "problems")=<externalptr>

Let’s check for missing values.

plot_missing(whole)

The percentage of missing values is small enough that they could simply be omitted, but to be safe I will impute them later. For now, let’s count the unique values in each column to decide which variables need to be converted into factors.

sapply(whole, function(x) n_distinct(x))
##  passenger_id   home_planet    cryo_sleep         cabin   destination 
##         12970             4             3          9826             4 
##           age           vip  room_service    food_court shopping_mall 
##            81             3          1579          1954          1368 
##           spa       vr_deck          name   transported 
##          1680          1643         12630             3

Since cryo_sleep and vip are already logical, we only need to convert home_planet and destination from character to factor.

whole <- whole %>% 
  mutate(home_planet = as.factor(home_planet),
         destination = as.factor(destination)
         )

Now let’s impute the missing data using missRanger. We build a formula that targets every column except passenger_id and transported; among the predictors, we also have to drop any unordered factor with more than 53 levels.

init_imp = 
  home_planet + cryo_sleep + cabin + destination + age + vip + room_service + food_court + shopping_mall + spa + vr_deck + name ~ home_planet + cryo_sleep + destination + vip + room_service + food_court + shopping_mall + spa + vr_deck

whole_init_imp <- missRanger(data = whole,
                             formula = init_imp,
                             pmm.k = 3, # round imputed values with predictive mean matching
                             verbose = 2, 
                             # the remaining arguments are all tweaks to speed things up
                             num.trees = 50,
                             sample.fraction = 0.1,
                             splitrule = "extratrees",
                             max.depth = 6,
                             min.node.size = 10000,
                             maxiter = 2)
## 
## Missing value imputation by random forests
## 
##   Variables to impute:       home_planet, cryo_sleep, cabin, destination, age, vip, room_service, food_court, shopping_mall, spa, vr_deck, name
##   Variables used to impute:  home_planet, cryo_sleep, destination, vip, room_service, food_court, shopping_mall, spa, vr_deck
##  rm_srv  vr_dck  age dstntn  spa hm_pln  fd_crt  name    vip cabin   shppn_  cry_sl
## iter 1:  1.0000  1.0001  1.0001  0.3013  1.0002  0.4587  1.0001  1.0000  0.0215  0.9999  1.0000  0.3618  
## iter 2:  1.0001  1.0001  1.0001  0.3013  1.0001  0.4587  1.0001  1.0000  0.0215  0.9998  1.0001  0.3618  
colSums(is.na(whole_init_imp))
##  passenger_id   home_planet    cryo_sleep         cabin   destination 
##             0             0             0             0             0 
##           age           vip  room_service    food_court shopping_mall 
##             0             0             0             0             0 
##           spa       vr_deck          name   transported 
##             0             0             0          4277

Feature Engineering

With the initial imputation done, we can extract additional features. The first four digits of passenger_id identify a travel group, while the last two digits are the passenger’s number within that group. The next column to split is cabin, which decomposes into deck, number, and side. We can also derive columns summarizing how each passenger spent money on the ship’s luxury amenities, and take a look at the passengers’ first and last names as well.

lux <- c("room_service", "food_court", "shopping_mall", "spa", "vr_deck")

whole_init_exp <- whole_init_imp %>% 
  mutate(passenger_id = as.factor(passenger_id), # keep as factor so it isn't dropped later
         passenger_group = str_sub(passenger_id, 1, 4),
         passenger_number_within_group = 
           as.factor(str_sub(passenger_id, 6, 7)),
         cabin_deck = as.factor(str_sub(cabin, 1, 1)),
         cabin_number = as.factor(str_sub(cabin, 3, 3)), # first digit of the cabin number only
         cabin_side = as.factor(str_sub(cabin, -1))) %>% 
  separate(col = name,
           sep = " ",
           into = c("name_first","name_last")) %>% 
  mutate(name_last = as.factor(name_last)) %>% 
  rowwise() %>% 
  mutate(lux_spent_average = mean(c_across(cols = all_of(lux))),
         lux_spent_sum = sum(c_across(cols = all_of(lux))),
         lux_spent_median = median(c_across(cols = all_of(lux))),
         lux_spent_var = var(c_across(cols = all_of(lux))),
         lux_spent_sd = sd(c_across(cols = all_of(lux))),
         lux_spent_iqr = IQR(c_across(cols = all_of(lux)))
         ) %>% 
  ungroup()

head(whole_init_exp)
## # A tibble: 6 x 26
##   passenger_id home_planet cryo_sleep cabin destination   age vip   room_service
##   <fct>        <fct>       <lgl>      <chr> <fct>       <dbl> <lgl>        <dbl>
## 1 0001_01      Europa      FALSE      B/0/P TRAPPIST-1e    39 FALSE            0
## 2 0002_01      Earth       FALSE      F/0/S TRAPPIST-1e    24 FALSE          109
## 3 0003_01      Europa      FALSE      A/0/S TRAPPIST-1e    58 TRUE            43
## 4 0003_02      Europa      FALSE      A/0/S TRAPPIST-1e    33 FALSE            0
## 5 0004_01      Earth       FALSE      F/1/S TRAPPIST-1e    16 FALSE          303
## 6 0005_01      Earth       FALSE      F/0/P PSO J318.5~    44 FALSE            0
## # ... with 18 more variables: food_court <dbl>, shopping_mall <dbl>, spa <dbl>,
## #   vr_deck <dbl>, name_first <chr>, name_last <fct>, transported <lgl>,
## #   passenger_group <chr>, passenger_number_within_group <fct>,
## #   cabin_deck <fct>, cabin_number <fct>, cabin_side <fct>,
## #   lux_spent_average <dbl>, lux_spent_sum <dbl>, lux_spent_median <dbl>,
## #   lux_spent_var <dbl>, lux_spent_sd <dbl>, lux_spent_iqr <dbl>

Remove the character-class columns from the data frame. The feature passenger_id is kept (as a factor) for the test submission later.

whole_init_exp <- whole_init_exp %>% select(-where(is.character)) # passenger id is kept 
head(whole_init_exp,2)
## # A tibble: 2 x 23
##   passenger_id home_planet cryo_sleep destination   age vip   room_service
##   <fct>        <fct>       <lgl>      <fct>       <dbl> <lgl>        <dbl>
## 1 0001_01      Europa      FALSE      TRAPPIST-1e    39 FALSE            0
## 2 0002_01      Earth       FALSE      TRAPPIST-1e    24 FALSE          109
## # ... with 16 more variables: food_court <dbl>, shopping_mall <dbl>, spa <dbl>,
## #   vr_deck <dbl>, name_last <fct>, transported <lgl>,
## #   passenger_number_within_group <fct>, cabin_deck <fct>, cabin_number <fct>,
## #   cabin_side <fct>, lux_spent_average <dbl>, lux_spent_sum <dbl>,
## #   lux_spent_median <dbl>, lux_spent_var <dbl>, lux_spent_sd <dbl>,
## #   lux_spent_iqr <dbl>

Analysis

Exploratory Data Analysis

Let’s see the expanded dataset’s summary.

plot_intro(whole_init_exp)

Let’s visualize the discrete features that have fewer than 50 categories.

plot_bar(whole_init_exp, 
         by = "transported", 
         by_position = "fill",
         nrow = 2,
         ncol = 2)
## 2 columns ignored with more than 50 categories.
## passenger_id: 12970 categories
## name_last: 2406 categories

From the quick peek above, passengers who slept in cryo were much more likely to be transported away, while VIP status made little difference. Having a cabin on deck E or with cabin number 0 leaned toward not being transported.

Next, let’s take a look at the continuous features.

plot_boxplot(whole_init_exp,
             by = "transported", 
             nrow = 2, 
             ncol = 2)

Passengers who spent money in crowded, open spaces like the food_court and shopping_mall were more likely to vanish, while spending in more private settings like room_service, the spa, and the vr_deck improved a passenger’s chance of not being taken away to the alternate dimension. Was there something contagious about the way the vanishing happened?

Most of these features do not look normally distributed, but to be sure, let’s examine their QQ plots.

plot_qq(whole_init_exp, 
        by = "transported",
        nrow = 2,
        ncol = 2
        )

Only age bears any resemblance to a normal distribution; the rest are heavily skewed.

Cross-Validation

We have to split the train data into two parts:

  • Train - Learn : we use this part to train the model
  • Train - Validation : we then evaluate the model on this part
train_expanded <- whole_init_exp %>% filter(!is.na(transported))
test_expanded <- whole_init_exp %>% filter(is.na(transported))
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(1)
idx <- sample(nrow(train_expanded),
              0.8 * nrow(train_expanded))
train_exp_learn <- train_expanded[idx,-1]
train_exp_val <- train_expanded[-idx,-1]

Let’s check the target balance.

prop.table(table(train_exp_learn$transported))
## 
##     FALSE      TRUE 
## 0.4982744 0.5017256

Cool, it’s already balanced.

Modeling and Validation

Feature Selection

There are too many rows for my hardware to fit a model that includes all the features, so here I’m using Boruta to evaluate which features are important and should be included in the actual model.

boruta_output <- Boruta(transported ~ ., data=na.omit(train_exp_learn), doTrace=1)
## After 12 iterations, +19 secs:
##  confirmed 19 attributes: age, cabin_deck, cabin_number, cabin_side, cryo_sleep and 14 more;
##  rejected 1 attribute: vip;
##  still have 1 attribute left.

Boruta creates shadow features from randomized (shuffled) copies of the original features, fits a random forest classifier on the combined set, and computes feature importance measures. It then compares each original feature against the best of the shadows; original features that lose to the shadows are rejected.
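
As a rough illustration of the shadow construction (a sketch only; Boruta itself repeats this over many random forest runs and applies a formal statistical test), shuffling each predictor column produces the shadows:

# Sketch: build shadow features by permuting each column of a predictor
# data frame 'df', which breaks any real association with the target.
make_shadows <- function(df) {
  shadows <- as.data.frame(lapply(df, sample))
  names(shadows) <- paste0("shadow_", names(df))
  cbind(df, shadows) # real + shadow features are then fed to the random forest
}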

plotImpHistory(boruta_output)

plot(boruta_output, cex.axis=.8, las=2, xlab="", main="Variable Importance")

The plot above shows 19 confirmed important features. The Boruta results agree with the earlier observations that how public or private a passenger’s behavior was matters, since food_court, spa, and cryo_sleep are in the top five of the confirmed list.

Let’s pull the top 19 important features.

topf <- boruta_output$ImpHistory %>% 
  as.data.frame %>% 
  summarise_all(sum) %>% 
  pivot_longer(cols = 1:24,
               names_to = "features",
               values_to = "impsum") %>%
  arrange(desc(impsum)) %>%
  top_n(19) %>%
  pull(features)
## Selecting by impsum
topf
##  [1] "food_court"                    "home_planet"                  
##  [3] "cabin_deck"                    "cryo_sleep"                   
##  [5] "vr_deck"                       "spa"                          
##  [7] "room_service"                  "lux_spent_sum"                
##  [9] "lux_spent_average"             "lux_spent_sd"                 
## [11] "lux_spent_var"                 "shopping_mall"                
## [13] "age"                           "cabin_number"                 
## [15] "lux_spent_iqr"                 "cabin_side"                   
## [17] "destination"                   "lux_spent_median"             
## [19] "passenger_number_within_group"

Logistic Regression Model

For logistic regression we need a formula, so let’s turn topf into one.

formula_topf <- as.formula(paste("transported ~ ", paste(topf, collapse = "+"))) 
formula_topf
## transported ~ food_court + home_planet + cabin_deck + cryo_sleep + 
##     vr_deck + spa + room_service + lux_spent_sum + lux_spent_average + 
##     lux_spent_sd + lux_spent_var + shopping_mall + age + cabin_number + 
##     lux_spent_iqr + cabin_side + destination + lux_spent_median + 
##     passenger_number_within_group
dim(train_exp_learn)
## [1] 6954   22
logistic_model_null <- glm(transported ~ 1,
                  data = train_exp_learn,
                  family = binomial)
logistic_model_topf <- glm(formula_topf,
                  data = train_exp_learn,
                  family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
logistic_model_stepboth <-
  step(object = logistic_model_null, direction = "both", 
       scope = list(lower = logistic_model_null, 
                    upper = logistic_model_topf), trace = 0)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (the same warning repeats for every candidate model fitted during the stepwise search)
logistic_model_stepboth
## 
## Call:  glm(formula = transported ~ cryo_sleep + spa + cabin_deck + vr_deck + 
##     room_service + lux_spent_sd + cabin_side + cabin_number + 
##     home_planet + destination + age + lux_spent_median + passenger_number_within_group, 
##     family = binomial, data = train_exp_learn)
## 
## Coefficients:
##                     (Intercept)                   cryo_sleepTRUE  
##                      -0.6495333                        1.4522761  
##                             spa                      cabin_deckB  
##                      -0.0024830                        1.3620632  
##                     cabin_deckC                      cabin_deckD  
##                       2.6511879                        0.5933610  
##                     cabin_deckE                      cabin_deckF  
##                       0.1607195                        0.5084976  
##                     cabin_deckG                      cabin_deckT  
##                       0.1061326                       -0.4406528  
##                         vr_deck                     room_service  
##                      -0.0023353                       -0.0019734  
##                    lux_spent_sd                      cabin_sideS  
##                       0.0016113                        0.5935557  
##                   cabin_number1                    cabin_number2  
##                       0.2215891                       -0.0929036  
##                   cabin_number3                    cabin_number4  
##                      -0.1247500                       -0.2811728  
##                   cabin_number5                    cabin_number6  
##                       0.2790346                        0.2325812  
##                   cabin_number7                    cabin_number8  
##                       0.5171484                        0.6521658  
##                   cabin_number9                home_planetEuropa  
##                       1.0956275                        1.2489504  
##                 home_planetMars         destinationPSO J318.5-22  
##                       0.5717192                       -0.4169489  
##          destinationTRAPPIST-1e                              age  
##                      -0.4858853                       -0.0093622  
##                lux_spent_median  passenger_number_within_group02  
##                       0.0006304                        0.1801495  
## passenger_number_within_group03  passenger_number_within_group04  
##                       0.4206803                        0.1890521  
## passenger_number_within_group05  passenger_number_within_group06  
##                      -0.1915441                        0.0654063  
## passenger_number_within_group07  passenger_number_within_group08  
##                      -0.3746677                       -0.6698812  
## 
## Degrees of Freedom: 6953 Total (i.e. Null);  6918 Residual
## Null Deviance:       9640 
## Residual Deviance: 5948  AIC: 6020
coefficients(logistic_model_stepboth) %>% sort(decreasing = T)
##                     cabin_deckC                  cryo_sleepTRUE 
##                    2.6511878983                    1.4522760704 
##                     cabin_deckB               home_planetEuropa 
##                    1.3620632217                    1.2489503646 
##                   cabin_number9                   cabin_number8 
##                    1.0956275223                    0.6521658126 
##                     cabin_sideS                     cabin_deckD 
##                    0.5935556910                    0.5933609589 
##                 home_planetMars                   cabin_number7 
##                    0.5717191621                    0.5171484437 
##                     cabin_deckF passenger_number_within_group03 
##                    0.5084976393                    0.4206802595 
##                   cabin_number5                   cabin_number6 
##                    0.2790345632                    0.2325812416 
##                   cabin_number1 passenger_number_within_group04 
##                    0.2215890627                    0.1890520848 
## passenger_number_within_group02                     cabin_deckE 
##                    0.1801494949                    0.1607195356 
##                     cabin_deckG passenger_number_within_group06 
##                    0.1061326081                    0.0654063140 
##                    lux_spent_sd                lux_spent_median 
##                    0.0016113212                    0.0006303839 
##                    room_service                         vr_deck 
##                   -0.0019734229                   -0.0023353082 
##                             spa                             age 
##                   -0.0024829941                   -0.0093621898 
##                   cabin_number2                   cabin_number3 
##                   -0.0929036391                   -0.1247499942 
## passenger_number_within_group05                   cabin_number4 
##                   -0.1915440853                   -0.2811728467 
## passenger_number_within_group07        destinationPSO J318.5-22 
##                   -0.3746676521                   -0.4169489433 
##                     cabin_deckT          destinationTRAPPIST-1e 
##                   -0.4406527777                   -0.4858852565 
##                     (Intercept) passenger_number_within_group08 
##                   -0.6495332598                   -0.6698811713

Let’s take a look at the highest coefficient overall, Cabin Deck C, and the lowest among the deck coefficients, Cabin Deck T.

inv.logit(2.661411466) # C
## [1] 0.9347109
inv.logit(-0.697263538) # T
## [1] 0.3324192

A passenger assigned to Deck C has roughly a 93% chance of being transported to the alternate dimension, while a passenger assigned to Deck T has only about a 33% chance (holding every other predictor at its baseline level).
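
For reference, inv.logit() from gtools is just the logistic function, so the Deck C figure above can be reproduced by hand:

# inv.logit(x) = 1 / (1 + exp(-x)); reproducing the Deck C probability manually
1 / (1 + exp(-2.661411466))
## [1] 0.9347109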

Logistic Regression Validation

Now we can use train_exp_val for validation.

res_val_logres_1 <- predict(logistic_model_stepboth, 
                             train_exp_val, 
                             type = "response")
transported_pred <- ifelse(res_val_logres_1 > 0.5, TRUE, FALSE) 

Now create the confusion matrix.

logres_confmat <- confusionMatrix(data = as.factor(transported_pred), 
                reference = as.factor(train_exp_val$transported),
                positive = "TRUE")

logres_confmat
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   649  164
##      TRUE    201  725
##                                          
##                Accuracy : 0.7901         
##                  95% CI : (0.7702, 0.809)
##     No Information Rate : 0.5112         
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.5796         
##                                          
##  Mcnemar's Test P-Value : 0.05952        
##                                          
##             Sensitivity : 0.8155         
##             Specificity : 0.7635         
##          Pos Pred Value : 0.7829         
##          Neg Pred Value : 0.7983         
##              Prevalence : 0.5112         
##          Detection Rate : 0.4169         
##    Detection Prevalence : 0.5325         
##       Balanced Accuracy : 0.7895         
##                                          
##        'Positive' Class : TRUE           
## 

K-Nearest Neighbors Model

KNN can only work with numerical features, and they need to be scaled as well. We apply the transformation to the learn set first, then scale the validation set using the learn set’s centers and scales.

knn_train_exp_learn <- train_exp_learn %>% 
  mutate(cryo_sleep = as.numeric(cryo_sleep),
         vip = as.numeric(vip)) %>% 
  select(where(is.numeric)) %>% scale(.) 
head(knn_train_exp_learn,5)
##      cryo_sleep        age        vip room_service food_court shopping_mall
## [1,] -0.7479719 -0.2106571 -0.1529581    0.4798134 -0.1639743    -0.2362479
## [2,] -0.7479719 -2.0150746 -0.1529581   -0.3374959 -0.2876490    -0.2849046
## [3,] -0.7479719  0.2057469 -0.1529581    6.6244392 -0.2876490     0.4433242
## [4,]  1.3367563 -0.8352632 -0.1529581   -0.3374959 -0.2876490    -0.2849046
## [5,]  1.3367563 -1.7374719 -0.1529581   -0.3374959 -0.2876490    -0.2849046
##              spa    vr_deck lux_spent_average lux_spent_sum lux_spent_median
## [1,] -0.27106492 -0.2624567        -0.2432412    -0.2432412       -0.1415593
## [2,] -0.27106492 -0.2633274        -0.5244943    -0.5244943       -0.2592790
## [3,] -0.09508641 -0.2633274         1.4137927     1.4137927        0.4862791
## [4,] -0.27106492 -0.2633274        -0.5244943    -0.5244943       -0.2592790
## [5,] -0.27106492 -0.2633274        -0.5244943    -0.5244943       -0.2592790
##      lux_spent_var lux_spent_sd lux_spent_iqr
## [1,]    -0.1843014   -0.2707499    -0.1044475
## [2,]    -0.1944667   -0.5294325    -0.3661898
## [3,]     0.5663459    1.7085028     0.2491102
## [4,]    -0.1944667   -0.5294325    -0.3661898
## [5,]    -0.1944667   -0.5294325    -0.3661898
knn_train_exp_val <- train_exp_val %>% 
  mutate(cryo_sleep = as.numeric(cryo_sleep),
         vip = as.numeric(vip)) %>% 
  select(where(is.numeric)) 

knn_train_exp_val <- scale(x = knn_train_exp_val,
                           center = attr(knn_train_exp_learn,"scaled:center"),
                           scale = attr(knn_train_exp_learn,"scaled:scale"))
head(knn_train_exp_val,5)
##      cryo_sleep        age        vip room_service food_court shopping_mall
## [1,] -0.7479719 -0.3494584 -0.1529581   -0.1761069 -0.2818517    -0.2443574
## [2,] -0.7479719  2.0101645  6.5367961   -0.2738287  2.0157919    -0.2849046
## [3,] -0.7479719  1.3161577 -0.1529581    0.7270790 -0.2870048    -0.1794818
## [4,]  1.3367563  1.1079557 -0.1529581   -0.3374959 -0.2876490    -0.2849046
## [5,]  1.3367563  1.3855584 -0.1529581   -0.3374959 -0.2876490    -0.2849046
##             spa    vr_deck lux_spent_average lux_spent_sum lux_spent_median
## [1,]  0.2374204 -0.2250162        -0.2573946    -0.2573946      -0.08662347
## [2,]  5.9483861 -0.2206627         3.2435705     3.2435705      -0.06700352
## [3,] -0.2710649 -0.2424304        -0.2309024    -0.2309024      -0.16510327
## [4,] -0.2710649 -0.2633274        -0.5244943    -0.5244943      -0.25927904
## [5,] -0.2710649 -0.2633274        -0.5244943    -0.5244943      -0.25927904
##      lux_spent_var lux_spent_sd lux_spent_iqr
## [1,]    -0.1849762   -0.2794830    -0.2510780
## [2,]     1.4661160    2.7768422     4.4753577
## [3,]    -0.1765968   -0.1864524    -0.2784856
## [4,]    -0.1944667   -0.5294325    -0.3661898
## [5,]    -0.1944667   -0.5294325    -0.3661898

Extract the response variable from the predictors.

knn_train_learn_target <- train_exp_learn %>% select(transported) %>% pull() %>% as.numeric()
knn_train_val_target <- train_exp_val %>% select(transported) %>% pull() %>% as.numeric() # for reference

K-Nearest Neighbors Validation

We pick K with the usual rule of thumb K ≈ sqrt(n); conveniently, the result (83) is odd, which avoids ties in a binary vote.

nrow(knn_train_exp_learn)
## [1] 6954
sqrt(nrow(knn_train_exp_learn)) %>% round()
## [1] 83

Now let’s see how it performs on the validation set.

res_val_knn_1 <- knn(train = knn_train_exp_learn,
                 test = knn_train_exp_val,
                   cl = knn_train_learn_target,
                   k = 83)
res_val_knn_1[1:20]
##  [1] 0 0 0 1 1 1 1 0 1 0 0 1 0 0 1 0 0 1 1 1
## Levels: 0 1

Create the confusion matrix.

knn_confmat <- confusionMatrix(data = res_val_knn_1, reference = as.factor(knn_train_val_target), positive = "1")

knn_confmat
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 629 149
##          1 221 740
##                                           
##                Accuracy : 0.7872          
##                  95% CI : (0.7672, 0.8063)
##     No Information Rate : 0.5112          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5735          
##                                           
##  Mcnemar's Test P-Value : 0.0002233       
##                                           
##             Sensitivity : 0.8324          
##             Specificity : 0.7400          
##          Pos Pred Value : 0.7700          
##          Neg Pred Value : 0.8085          
##              Prevalence : 0.5112          
##          Detection Rate : 0.4255          
##    Detection Prevalence : 0.5526          
##       Balanced Accuracy : 0.7862          
##                                           
##        'Positive' Class : 1               
## 

Evaluation

Confusion Matrix Comparison

logres_confmat$overall[1]
##  Accuracy 
## 0.7901093
knn_confmat$overall[1]
## Accuracy 
## 0.787234
logres_confmat$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.8155231            0.7635294            0.7829374 
##       Neg Pred Value            Precision               Recall 
##            0.7982780            0.7829374            0.8155231 
##                   F1           Prevalence       Detection Rate 
##            0.7988981            0.5112133            0.4169063 
## Detection Prevalence    Balanced Accuracy 
##            0.5324899            0.7895262
knn_confmat$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.8323960            0.7400000            0.7700312 
##       Neg Pred Value            Precision               Recall 
##            0.8084833            0.7700312            0.8323960 
##                   F1           Prevalence       Detection Rate 
##            0.8000000            0.5112133            0.4255319 
## Detection Prevalence    Balanced Accuracy 
##            0.5526164            0.7861980

Logistic regression gives the better accuracy and precision (positive predictive value), but KNN is better at recall (sensitivity).
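
To see those numbers side by side, the key metrics can also be pulled into one small table (a convenience sketch using the objects above):

# Side-by-side comparison of the two models' key metrics
rbind(
  logistic = c(logres_confmat$overall["Accuracy"],
               logres_confmat$byClass[c("Sensitivity", "Pos Pred Value")]),
  knn      = c(knn_confmat$overall["Accuracy"],
               knn_confmat$byClass[c("Sensitivity", "Pos Pred Value")])
)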

ROC & AUC Comparison

ROC

# prediction objects
logress_roc_pred <- prediction(predictions = as.numeric(transported_pred), 
                               labels = as.numeric(train_exp_val$transported))
knn_roc_pred <- prediction(predictions = as.numeric(res_val_knn_1), 
                               labels = as.numeric(knn_train_val_target))
# ROC curve
plot(performance(prediction.obj = logress_roc_pred, measure = "tpr", x.measure = "fpr"),
     main = "Logistic Regression ROC Curve")

plot(performance(prediction.obj = knn_roc_pred, measure = "tpr", x.measure = "fpr"),
     main = "K-Nearest Neighbors ROC Curve")

AUC

# AUC value
logress_auc <- performance(prediction.obj = logress_roc_pred, measure = "auc")
logress_auc@y.values
## [[1]]
## [1] 0.7895262
knn_auc <- performance(prediction.obj = knn_roc_pred, measure = "auc")
knn_auc@y.values
## [[1]]
## [1] 0.786198
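
Note that both prediction objects were built from hard class labels rather than scores, which is why each AUC exactly equals the model’s balanced accuracy: the “curve” has a single operating point. For the logistic model, feeding the predicted probabilities instead would give a proper ROC curve:

# Sketch: ROC from the logistic model's predicted probabilities
logress_roc_pred_prob <- prediction(predictions = res_val_logres_1, 
                                    labels = as.numeric(train_exp_val$transported))
plot(performance(prediction.obj = logress_roc_pred_prob, 
                 measure = "tpr", x.measure = "fpr"),
     main = "Logistic Regression ROC Curve (from probabilities)")
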
Decision

For the prediction submission I will use logistic regression, since it gives the better result on the validation data. But before running it on the test data, I want to improve the model.

Assumption Check and Improvement

Linearity

There should be a linear relationship between the dependent and independent variables. As a rough check, I test each predictor’s correlation with the target; a predictor passes if the alternative hypothesis (true correlation is not equal to 0) is supported.

cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$cryo_sleep))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$cryo_sleep)
## t = 42.914, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4388455 0.4760134
## sample estimates:
##       cor 
## 0.4576293
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$spa))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$spa)
## t = -18.344, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2371786 -0.1923395
## sample estimates:
##        cor 
## -0.2148723
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$cabin_deck))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$cabin_deck)
## t = -10.299, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.14567794 -0.09937576
## sample estimates:
##        cor 
## -0.1225936
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$vr_deck))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$vr_deck)
## t = -17.265, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2251959 -0.1801193
## sample estimates:
##       cor 
## -0.202765
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$room_service))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$room_service)
## t = -20.343, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2590906 -0.2147220
## sample estimates:
##        cor 
## -0.2370299
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$lux_spent_sd))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$lux_spent_sd)
## t = -14.233, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1910192 -0.1453412
## sample estimates:
##        cor 
## -0.1682705
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$cabin_side))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$cabin_side)
## t = 8.0924, df = 6952, p-value = 6.853e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.07326383 0.11983368
## sample estimates:
##        cor 
## 0.09660163
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$cabin_number))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$cabin_number)
## t = 3.9564, df = 6952, p-value = 7.683e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02392073 0.07082347
## sample estimates:
##        cor 
## 0.04739822
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$home_planet))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$home_planet)
## t = 10.399, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1005491 0.1468378
## sample estimates:
##       cor 
## 0.1237608
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$destination))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$destination)
## t = -8.2971, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.12224132 -0.07569371
## sample estimates:
##         cor 
## -0.09902169
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$age))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$age)
## t = -6.0834, df = 6952, p-value = 1.239e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.09610785 -0.04934834
## sample estimates:
##         cor 
## -0.07276808
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$passenger_number_within_group))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$passenger_number_within_group)
## t = 5.5778, df = 6952, p-value = 2.527e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.04331197 0.09011094
## sample estimates:
##        cor 
## 0.06674817
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$lux_spent_median))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$lux_spent_median)
## t = -12.345, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1693851 -0.1233847
## sample estimates:
##       cor 
## -0.146464

All of the tests support the alternative hypothesis, so every predictor passes.
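
Incidentally, the same battery of tests can be run more compactly by looping over the predictor names (a sketch that collects just the p-values):

# Run every correlation test in one pass and collect the p-values
preds <- c("cryo_sleep", "spa", "cabin_deck", "vr_deck", "room_service",
           "lux_spent_sd", "cabin_side", "cabin_number", "home_planet",
           "destination", "age", "passenger_number_within_group",
           "lux_spent_median")
sapply(preds, function(p) {
  cor.test(as.numeric(train_exp_learn$transported),
           as.numeric(train_exp_learn[[p]]))$p.value
})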

Multicollinearity

There should be no value above 10. (For predictors with more than one degree of freedom, vif() reports the generalized GVIF along with the adjusted GVIF^(1/(2*Df)), which is the column comparable across predictors.)

vif(logistic_model_stepboth)
##                                   GVIF Df GVIF^(1/(2*Df))
## cryo_sleep                    1.610049  1        1.268877
## spa                           1.628082  1        1.275963
## cabin_deck                    8.748344  7        1.167563
## vr_deck                       1.811430  1        1.345894
## room_service                  1.515327  1        1.230986
## lux_spent_sd                  2.494686  1        1.579457
## cabin_side                    1.027255  1        1.013536
## cabin_number                  1.315425  9        1.015348
## home_planet                   5.827345  2        1.553702
## destination                   1.107361  2        1.025823
## age                           1.148340  1        1.071606
## lux_spent_median              1.476110  1        1.214953
## passenger_number_within_group 1.118837  7        1.008053

All fine; no value is above 10.

Influential Values

Not every extreme data point affects model quality; one that does is called an influential value. Below is an illustration of how far each data point strays in terms of Cook’s distance.

plot(logistic_model_stepboth, which = 4, id.n = 3)

Here are the passengers with the top three Cook’s distance values, though a large Cook’s distance alone does not make them influential.

logistic_model_stepboth.data <- augment(logistic_model_stepboth) %>% mutate(index=1:n()) 
logistic_model_stepboth.data %>% arrange(desc(.cooksd)) %>% slice(1:3)
## # A tibble: 3 x 21
##   transported cryo_sleep   spa cabin_deck vr_deck room_service lux_spent_sd
##   <lgl>       <lgl>      <dbl> <fct>        <dbl>        <dbl>        <dbl>
## 1 TRUE        FALSE         26 T                3            0        1399.
## 2 FALSE       FALSE         14 T               60          415         566.
## 3 FALSE       TRUE           0 E                0            0           0 
## # ... with 14 more variables: cabin_side <fct>, cabin_number <fct>,
## #   home_planet <fct>, destination <fct>, age <dbl>, lux_spent_median <dbl>,
## #   passenger_number_within_group <fct>, .fitted <dbl>, .resid <dbl>,
## #   .std.resid <dbl>, .hat <dbl>, .sigma <dbl>, .cooksd <dbl>, index <int>
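
As a cross-check, the same top values can be read straight from the fitted model:

# The three largest Cook's distances, directly from the glm object
head(sort(cooks.distance(logistic_model_stepboth), decreasing = TRUE), 3)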

Let’s visualize the standardized residuals to see whether any of them exceed 3.

ggplot(logistic_model_stepboth.data, aes(index, .std.resid)) + 
  geom_point(aes(color = transported), alpha = .5) 

An influential data point has a .std.resid above 3.

logistic_model_stepboth.data %>% filter(.std.resid > 3)
## # A tibble: 1 x 21
##   transported cryo_sleep   spa cabin_deck vr_deck room_service lux_spent_sd
##   <lgl>       <lgl>      <dbl> <fct>        <dbl>        <dbl>        <dbl>
## 1 TRUE        FALSE       3131 C              498         2320        1985.
## # ... with 14 more variables: cabin_side <fct>, cabin_number <fct>,
## #   home_planet <fct>, destination <fct>, age <dbl>, lux_spent_median <dbl>,
## #   passenger_number_within_group <fct>, .fitted <dbl>, .resid <dbl>,
## #   .std.resid <dbl>, .hat <dbl>, .sigma <dbl>, .cooksd <dbl>, index <int>

There’s a passenger from Europa assigned to Deck C who disrupts the model as an influential data point. We have to remove this passenger to improve our model; we can recover the row index in train_exp_learn by matching on the passenger’s attributes.

train_exp_learn_indexed <- train_exp_learn %>% 
  mutate(index = 1:n())

train_exp_learn_indexed %>%
  filter(
    cabin_number==1,
    home_planet=="Europa",
    destination=="55 Cancri e",
    age==40,
    passenger_number_within_group=="04"
  )
## # A tibble: 1 x 23
##   home_planet cryo_sleep destination   age vip   room_service food_court
##   <fct>       <lgl>      <fct>       <dbl> <lgl>        <dbl>      <dbl>
## 1 Europa      FALSE      55 Cancri e    40 FALSE         2320          0
## # ... with 16 more variables: shopping_mall <dbl>, spa <dbl>, vr_deck <dbl>,
## #   name_last <fct>, transported <lgl>, passenger_number_within_group <fct>,
## #   cabin_deck <fct>, cabin_number <fct>, cabin_side <fct>,
## #   lux_spent_average <dbl>, lux_spent_sum <dbl>, lux_spent_median <dbl>,
## #   lux_spent_var <dbl>, lux_spent_sd <dbl>, lux_spent_iqr <dbl>, index <int>
train_exp_learn_without_infv <- train_exp_learn_indexed[train_exp_learn_indexed$index!=4391,] # 4391 is the index found above

Prediction

Re-train the model on the data with the influential point removed.

logistic_model_stepboth2 <- glm(formula = transported ~ cryo_sleep + spa + cabin_deck + vr_deck + 
    room_service + lux_spent_sd + cabin_side + cabin_number + 
    home_planet + destination + age + passenger_number_within_group + 
    lux_spent_median, family = binomial, data = train_exp_learn_without_infv)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
test_expanded$pred <- predict(logistic_model_stepboth2, test_expanded,type = "response")
test_expanded$Transported <- ifelse(test_expanded$pred > 0.5, "True", "False")
test_submission <- test_expanded %>% 
  select(passenger_id, Transported) %>% 
  rename(PassengerId = passenger_id)
test_submission[1:10,]
## # A tibble: 10 x 2
##    PassengerId Transported
##    <fct>       <chr>      
##  1 0013_01     True       
##  2 0018_01     False      
##  3 0019_01     True       
##  4 0021_01     True       
##  5 0023_01     True       
##  6 0027_01     True       
##  7 0029_01     True       
##  8 0032_01     True       
##  9 0032_02     True       
## 10 0033_01     True

Save the submission to CSV.

write_csv(x = test_submission, file = "husada_logistic_space_titanic_submission.csv")

Result

knitr::include_graphics("logistic_result.JPG")