In 2912, space-faring humanity experienced a terrible calamity. On an emigration voyage to 55 Cancri e, the Spaceship Titanic collided with an anomaly hidden in a dust cloud. The accident caused half of the ship’s passengers to vanish, transported to an alternate dimension.
Our task is to predict whether or not a passenger was transported.
library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
library(janitor)##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(DataExplorer)
library(missRanger)
library(GGally)## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(performance)
library(see)
library(Boruta)
library(dlookr)##
## Attaching package: 'dlookr'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:base':
##
## transform
library(class)
library(caret)## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(gtools)
library(ROCR)##
## Attaching package: 'ROCR'
## The following object is masked from 'package:performance':
##
## performance
library(car)## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:gtools':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(broom)
train <- read_csv("train.csv")## Rows: 8693 Columns: 14
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (3): CryoSleep, VIP, Transported
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv("test.csv")## Rows: 4277 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): PassengerId, HomePlanet, Cabin, Destination, Name
## dbl (6): Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
## lgl (2): CryoSleep, VIP
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(train, 3)## # A tibble: 3 x 14
## PassengerId HomePlanet CryoSleep Cabin Destination Age VIP RoomService
## <chr> <chr> <lgl> <chr> <chr> <dbl> <lgl> <dbl>
## 1 0001_01 Europa FALSE B/0/P TRAPPIST-1e 39 FALSE 0
## 2 0002_01 Earth FALSE F/0/S TRAPPIST-1e 24 FALSE 109
## 3 0003_01 Europa FALSE A/0/S TRAPPIST-1e 58 TRUE 43
## # ... with 6 more variables: FoodCourt <dbl>, ShoppingMall <dbl>, Spa <dbl>,
## # VRDeck <dbl>, Name <chr>, Transported <lgl>
head(test, 3)## # A tibble: 3 x 13
## PassengerId HomePlanet CryoSleep Cabin Destination Age VIP RoomService
## <chr> <chr> <lgl> <chr> <chr> <dbl> <lgl> <dbl>
## 1 0013_01 Earth TRUE G/3/S TRAPPIST-1e 27 FALSE 0
## 2 0018_01 Earth FALSE F/4/S TRAPPIST-1e 19 FALSE 0
## 3 0019_01 Europa TRUE C/0/S 55 Cancri e 31 FALSE 0
## # ... with 5 more variables: FoodCourt <dbl>, ShoppingMall <dbl>, Spa <dbl>,
## # VRDeck <dbl>, Name <chr>
To understand and process the data as a whole, we first need to join the two sets together.
whole <- bind_rows(train, test)
First, clean the column names.
whole <- whole %>% clean_names()
str(whole)## spec_tbl_df [12,970 x 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ passenger_id : chr [1:12970] "0001_01" "0002_01" "0003_01" "0003_02" ...
## $ home_planet : chr [1:12970] "Europa" "Earth" "Europa" "Europa" ...
## $ cryo_sleep : logi [1:12970] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ cabin : chr [1:12970] "B/0/P" "F/0/S" "A/0/S" "A/0/S" ...
## $ destination : chr [1:12970] "TRAPPIST-1e" "TRAPPIST-1e" "TRAPPIST-1e" "TRAPPIST-1e" ...
## $ age : num [1:12970] 39 24 58 33 16 44 26 28 35 14 ...
## $ vip : logi [1:12970] FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ room_service : num [1:12970] 0 109 43 0 303 0 42 0 0 0 ...
## $ food_court : num [1:12970] 0 9 3576 1283 70 ...
## $ shopping_mall: num [1:12970] 0 25 0 371 151 0 3 0 17 0 ...
## $ spa : num [1:12970] 0 549 6715 3329 565 ...
## $ vr_deck : num [1:12970] 0 44 49 193 2 0 0 NA 0 0 ...
## $ name : chr [1:12970] "Maham Ofracculy" "Juanna Vines" "Altark Susent" "Solam Susent" ...
## $ transported : logi [1:12970] FALSE TRUE FALSE FALSE TRUE TRUE ...
## - attr(*, "spec")=
## .. cols(
## .. PassengerId = col_character(),
## .. HomePlanet = col_character(),
## .. CryoSleep = col_logical(),
## .. Cabin = col_character(),
## .. Destination = col_character(),
## .. Age = col_double(),
## .. VIP = col_logical(),
## .. RoomService = col_double(),
## .. FoodCourt = col_double(),
## .. ShoppingMall = col_double(),
## .. Spa = col_double(),
## .. VRDeck = col_double(),
## .. Name = col_character(),
## .. Transported = col_logical()
## .. )
## - attr(*, "problems")=<externalptr>
Let’s check for missing values.
plot_missing(whole)
The percentage of missing values is small enough that they could simply be omitted, but to be safe I will impute them later. For now, let’s count how many unique values each column has, to decide whether some variables need to be converted into factors.
sapply(whole, function(x) n_distinct(x))## passenger_id home_planet cryo_sleep cabin destination
## 12970 4 3 9826 4
## age vip room_service food_court shopping_mall
## 81 3 1579 1954 1368
## spa vr_deck name transported
## 1680 1643 12630 3
str(whole)## spec_tbl_df [12,970 x 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ passenger_id : chr [1:12970] "0001_01" "0002_01" "0003_01" "0003_02" ...
## $ home_planet : chr [1:12970] "Europa" "Earth" "Europa" "Europa" ...
## $ cryo_sleep : logi [1:12970] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ cabin : chr [1:12970] "B/0/P" "F/0/S" "A/0/S" "A/0/S" ...
## $ destination : chr [1:12970] "TRAPPIST-1e" "TRAPPIST-1e" "TRAPPIST-1e" "TRAPPIST-1e" ...
## $ age : num [1:12970] 39 24 58 33 16 44 26 28 35 14 ...
## $ vip : logi [1:12970] FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ room_service : num [1:12970] 0 109 43 0 303 0 42 0 0 0 ...
## $ food_court : num [1:12970] 0 9 3576 1283 70 ...
## $ shopping_mall: num [1:12970] 0 25 0 371 151 0 3 0 17 0 ...
## $ spa : num [1:12970] 0 549 6715 3329 565 ...
## $ vr_deck : num [1:12970] 0 44 49 193 2 0 0 NA 0 0 ...
## $ name : chr [1:12970] "Maham Ofracculy" "Juanna Vines" "Altark Susent" "Solam Susent" ...
## $ transported : logi [1:12970] FALSE TRUE FALSE FALSE TRUE TRUE ...
## - attr(*, "spec")=
## .. cols(
## .. PassengerId = col_character(),
## .. HomePlanet = col_character(),
## .. CryoSleep = col_logical(),
## .. Cabin = col_character(),
## .. Destination = col_character(),
## .. Age = col_double(),
## .. VIP = col_logical(),
## .. RoomService = col_double(),
## .. FoodCourt = col_double(),
## .. ShoppingMall = col_double(),
## .. Spa = col_double(),
## .. VRDeck = col_double(),
## .. Name = col_character(),
## .. Transported = col_logical()
## .. )
## - attr(*, "problems")=<externalptr>
Since cryo_sleep and vip are already logical class, we only need to convert home_planet and destination from character to factor.
whole <- whole %>%
mutate(home_planet = as.factor(home_planet),
destination = as.factor(destination)
       )
Now let’s impute the missing data using missRanger. We create a formula that targets every column except passenger_id and transported. For the predictors, we also have to leave out any unordered factor with more than 53 levels.
init_imp =
  home_planet + cryo_sleep + cabin + destination + age + vip + room_service + food_court + shopping_mall + spa + vr_deck + name ~ home_planet + cryo_sleep + destination + vip + room_service + food_court + shopping_mall + spa + vr_deck
whole_init_imp <- missRanger(data = whole,
formula = init_imp,
pmm.k = 3, # to round imputation result with predictive mean matching
verbose = 2,
num.trees = 50, # tweaks to make things faster
sample.fraction = 0.1, # tweaks to make things faster
splitrule = "extratrees", # tweaks to make things faster
max.depth = 6, # tweaks to make things faster
min.node.size = 10000, # tweaks to make things faster
                             maxiter = 2) # tweaks to make things faster
##
## Missing value imputation by random forests
##
## Variables to impute: home_planet, cryo_sleep, cabin, destination, age, vip, room_service, food_court, shopping_mall, spa, vr_deck, name
## Variables used to impute: home_planet, cryo_sleep, destination, vip, room_service, food_court, shopping_mall, spa, vr_deck
## rm_srv vr_dck age dstntn spa hm_pln fd_crt name vip cabin shppn_ cry_sl
## iter 1: 1.0000 1.0001 1.0001 0.3013 1.0002 0.4587 1.0001 1.0000 0.0215 0.9999 1.0000 0.3618
## iter 2: 1.0001 1.0001 1.0001 0.3013 1.0001 0.4587 1.0001 1.0000 0.0215 0.9998 1.0001 0.3618
colSums(is.na(whole_init_imp))## passenger_id home_planet cryo_sleep cabin destination
## 0 0 0 0 0
## age vip room_service food_court shopping_mall
## 0 0 0 0 0
## spa vr_deck name transported
## 0 0 0 4277
With the initial imputation done, we can proceed to extract more features. The first four digits of passenger_id represent the group, while the last two digits are the passenger’s number within that group. The next column to mine is cabin, which can be split into deck, number, and side. We can also derive several summary statistics of how each passenger spent money on the ship’s luxury amenities. Perhaps we should look at the passengers’ first and last names as well.
lux <- c("room_service", "food_court", "shopping_mall", "spa", "vr_deck")
whole_init_exp <- whole_init_imp %>%
mutate(passenger_id = as.factor(passenger_id), # turned into a factor so it doesn't get removed later
passenger_group = str_sub(passenger_id, 1, 4),
passenger_number_within_group =
as.factor(str_sub(passenger_id, 6, 7)),
cabin_deck = as.factor(str_sub(cabin, 1, 1)),
cabin_number = as.factor(str_sub(cabin, 3, 3)), # note: captures only the first digit of multi-digit cabin numbers
cabin_side = as.factor(str_sub(cabin, -1))) %>%
separate(col = name,
sep = " ",
into = c("name_first","name_last"),) %>%
mutate(name_last = as.factor(name_last)) %>%
rowwise() %>%
mutate(lux_spent_average = mean(c_across(cols = all_of(lux))),
lux_spent_sum = sum(c_across(cols = all_of(lux))),
lux_spent_median = median(c_across(cols = all_of(lux))),
lux_spent_var = var(c_across(cols = all_of(lux))),
lux_spent_sd = sd(c_across(cols = all_of(lux))),
lux_spent_iqr = IQR(c_across(cols = all_of(lux)))
) %>%
ungroup()
head(whole_init_exp)## # A tibble: 6 x 26
## passenger_id home_planet cryo_sleep cabin destination age vip room_service
## <fct> <fct> <lgl> <chr> <fct> <dbl> <lgl> <dbl>
## 1 0001_01 Europa FALSE B/0/P TRAPPIST-1e 39 FALSE 0
## 2 0002_01 Earth FALSE F/0/S TRAPPIST-1e 24 FALSE 109
## 3 0003_01 Europa FALSE A/0/S TRAPPIST-1e 58 TRUE 43
## 4 0003_02 Europa FALSE A/0/S TRAPPIST-1e 33 FALSE 0
## 5 0004_01 Earth FALSE F/1/S TRAPPIST-1e 16 FALSE 303
## 6 0005_01 Earth FALSE F/0/P PSO J318.5~ 44 FALSE 0
## # ... with 18 more variables: food_court <dbl>, shopping_mall <dbl>, spa <dbl>,
## # vr_deck <dbl>, name_first <chr>, name_last <fct>, transported <lgl>,
## # passenger_group <chr>, passenger_number_within_group <fct>,
## # cabin_deck <fct>, cabin_number <fct>, cabin_side <fct>,
## # lux_spent_average <dbl>, lux_spent_sum <dbl>, lux_spent_median <dbl>,
## # lux_spent_var <dbl>, lux_spent_sd <dbl>, lux_spent_iqr <dbl>
Remove the character columns from the data frame. The feature passenger_id, now a factor, is kept for the test submission later.
whole_init_exp <- whole_init_exp %>% select(-where(is.character)) # passenger id is kept
head(whole_init_exp,2)## # A tibble: 2 x 23
## passenger_id home_planet cryo_sleep destination age vip room_service
## <fct> <fct> <lgl> <fct> <dbl> <lgl> <dbl>
## 1 0001_01 Europa FALSE TRAPPIST-1e 39 FALSE 0
## 2 0002_01 Earth FALSE TRAPPIST-1e 24 FALSE 109
## # ... with 16 more variables: food_court <dbl>, shopping_mall <dbl>, spa <dbl>,
## # vr_deck <dbl>, name_last <fct>, transported <lgl>,
## # passenger_number_within_group <fct>, cabin_deck <fct>, cabin_number <fct>,
## # cabin_side <fct>, lux_spent_average <dbl>, lux_spent_sum <dbl>,
## # lux_spent_median <dbl>, lux_spent_var <dbl>, lux_spent_sd <dbl>,
## # lux_spent_iqr <dbl>
Let’s see the expanded dataset’s summary.
plot_intro(whole_init_exp)
Let’s visualize the discrete features that have fewer than 50 categories.
plot_bar(whole_init_exp,
by = "transported",
by_position = "fill",
nrow = 2,
ncol = 2)## 2 columns ignored with more than 50 categories.
## passenger_id: 12970 categories
## name_last: 2406 categories
From the quick peek above, VIP passengers who slept in cryo were much less likely to be transported away; the same goes for passengers with a cabin on deck E or cabin number 0.
Then we take a look at the continuous features.
plot_boxplot(whole_init_exp,
by = "transported",
nrow = 2,
ncol = 2)
Passengers who spent money in crowded open spaces like food_court and shopping_mall were more likely to vanish, while spending money in more private settings like room_service, spa, and vr_deck improved their chance of not being taken away to some alternate dimension. Was there something contagious about the way the vanishing happened?
They’re mostly not normally distributed, but to be sure, let’s look at QQ plots.
plot_qq(whole_init_exp,
by = "transported",
nrow = 2,
ncol = 2
)
Only age bears any resemblance to a normal distribution; the rest are skewed.
We split the combined data back into train and test sets, then split the train data into two parts:
train_expanded <- whole_init_exp %>% filter(!is.na(transported))
test_expanded <- whole_init_exp %>% filter(is.na(transported))
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(1)
idx <- sample(nrow(train_expanded),
0.8 * nrow(train_expanded))
train_exp_learn <- train_expanded[idx,-1]
train_exp_val <- train_expanded[-idx,-1]
Let’s check the target balance.
prop.table(table(train_exp_learn$transported))##
## FALSE TRUE
## 0.4982744 0.5017256
Cool, it’s already balanced.
The dataset is too large for my hardware to fit a model that includes all the features. So here I’m using Boruta to evaluate which features are important and should be included in the actual model.
boruta_output <- Boruta(transported ~ ., data=na.omit(train_exp_learn), doTrace=1)## After 12 iterations, +19 secs:
## confirmed 19 attributes: age, cabin_deck, cabin_number, cabin_side, cryo_sleep and 14 more;
## rejected 1 attribute: vip;
## still have 1 attribute left.
Boruta creates shadow features from randomized copies of the original features, fits a random forest classifier on the combined set, and measures feature importance. Each original feature is compared against the best of the shadows; original features that lose to the shadows are rejected.
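To make the shadow idea concrete, below is a minimal hand-rolled sketch of the principle (an illustration only, not Boruta’s actual implementation; the toy data and the use of ranger are assumptions):
# Shadow-feature principle: shuffle copies of the predictors, fit a forest,
# and keep the originals whose importance beats the best shadow.
library(ranger) # installed as a dependency of missRanger

set.seed(1)
toy <- data.frame(x1 = rnorm(300), x2 = rnorm(300))
toy$y <- factor(toy$x1 > 0) # only x1 is informative

shadows <- as.data.frame(lapply(toy[c("x1", "x2")], sample))
names(shadows) <- paste0("shadow_", names(shadows))

fit <- ranger(y ~ ., data = cbind(toy, shadows), importance = "impurity")
imp <- fit$variable.importance
imp[c("x1", "x2")] > max(imp[grep("^shadow_", names(imp))]) # x1 should pass, x2 should not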
plotImpHistory(boruta_output)
plot(boruta_output, cex.axis=.8, las=2, xlab="", main="Variable Importance")
The plot above shows 19 confirmed important features. The Boruta results agree with the earlier observation that how public or private a passenger’s behavior was matters, since food_court, spa, and cryo_sleep are in the top five of the confirmed list.
Let’s pull the top 19 important features.
topf <- boruta_output$ImpHistory %>%
as.data.frame %>%
summarise_all(sum) %>%
pivot_longer(cols = 1:24,
names_to = "features",
values_to = "impsum") %>%
arrange(desc(impsum)) %>%
top_n(19) %>%
pull(features)## Selecting by impsum
topf## [1] "food_court" "home_planet"
## [3] "cabin_deck" "cryo_sleep"
## [5] "vr_deck" "spa"
## [7] "room_service" "lux_spent_sum"
## [9] "lux_spent_average" "lux_spent_sd"
## [11] "lux_spent_var" "shopping_mall"
## [13] "age" "cabin_number"
## [15] "lux_spent_iqr" "cabin_side"
## [17] "destination" "lux_spent_median"
## [19] "passenger_number_within_group"
For logistic regression we need a formula, so let’s turn topf into one.
formula_topf <- as.formula(paste("transported ~ ", paste(topf, collapse = "+")))
formula_topf## transported ~ food_court + home_planet + cabin_deck + cryo_sleep +
## vr_deck + spa + room_service + lux_spent_sum + lux_spent_average +
## lux_spent_sd + lux_spent_var + shopping_mall + age + cabin_number +
## lux_spent_iqr + cabin_side + destination + lux_spent_median +
## passenger_number_within_group
dim(train_exp_learn)## [1] 6954 22
logistic_model_null <- glm(transported ~ 1,
data = train_exp_learn,
family = binomial)
logistic_model_topf <- glm(formula_topf,
                           data = train_exp_learn,
                           family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
logistic_model_stepboth <-
  step(object = logistic_model_null, direction = "both",
       scope = list(lower = logistic_model_null,
                    upper = logistic_model_topf), trace = 0)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (this warning repeats for each candidate model evaluated during the stepwise search)
logistic_model_stepboth##
## Call: glm(formula = transported ~ cryo_sleep + spa + cabin_deck + vr_deck +
## room_service + lux_spent_sd + cabin_side + cabin_number +
## home_planet + destination + age + lux_spent_median + passenger_number_within_group,
## family = binomial, data = train_exp_learn)
##
## Coefficients:
## (Intercept) cryo_sleepTRUE
## -0.6495333 1.4522761
## spa cabin_deckB
## -0.0024830 1.3620632
## cabin_deckC cabin_deckD
## 2.6511879 0.5933610
## cabin_deckE cabin_deckF
## 0.1607195 0.5084976
## cabin_deckG cabin_deckT
## 0.1061326 -0.4406528
## vr_deck room_service
## -0.0023353 -0.0019734
## lux_spent_sd cabin_sideS
## 0.0016113 0.5935557
## cabin_number1 cabin_number2
## 0.2215891 -0.0929036
## cabin_number3 cabin_number4
## -0.1247500 -0.2811728
## cabin_number5 cabin_number6
## 0.2790346 0.2325812
## cabin_number7 cabin_number8
## 0.5171484 0.6521658
## cabin_number9 home_planetEuropa
## 1.0956275 1.2489504
## home_planetMars destinationPSO J318.5-22
## 0.5717192 -0.4169489
## destinationTRAPPIST-1e age
## -0.4858853 -0.0093622
## lux_spent_median passenger_number_within_group02
## 0.0006304 0.1801495
## passenger_number_within_group03 passenger_number_within_group04
## 0.4206803 0.1890521
## passenger_number_within_group05 passenger_number_within_group06
## -0.1915441 0.0654063
## passenger_number_within_group07 passenger_number_within_group08
## -0.3746677 -0.6698812
##
## Degrees of Freedom: 6953 Total (i.e. Null); 6918 Residual
## Null Deviance: 9640
## Residual Deviance: 5948 AIC: 6020
coefficients(logistic_model_stepboth) %>% sort(decreasing = T)## cabin_deckC cryo_sleepTRUE
## 2.6511878983 1.4522760704
## cabin_deckB home_planetEuropa
## 1.3620632217 1.2489503646
## cabin_number9 cabin_number8
## 1.0956275223 0.6521658126
## cabin_sideS cabin_deckD
## 0.5935556910 0.5933609589
## home_planetMars cabin_number7
## 0.5717191621 0.5171484437
## cabin_deckF passenger_number_within_group03
## 0.5084976393 0.4206802595
## cabin_number5 cabin_number6
## 0.2790345632 0.2325812416
## cabin_number1 passenger_number_within_group04
## 0.2215890627 0.1890520848
## passenger_number_within_group02 cabin_deckE
## 0.1801494949 0.1607195356
## cabin_deckG passenger_number_within_group06
## 0.1061326081 0.0654063140
## lux_spent_sd lux_spent_median
## 0.0016113212 0.0006303839
## room_service vr_deck
## -0.0019734229 -0.0023353082
## spa age
## -0.0024829941 -0.0093621898
## cabin_number2 cabin_number3
## -0.0929036391 -0.1247499942
## passenger_number_within_group05 cabin_number4
## -0.1915440853 -0.2811728467
## passenger_number_within_group07 destinationPSO J318.5-22
## -0.3746676521 -0.4169489433
## cabin_deckT destinationTRAPPIST-1e
## -0.4406527777 -0.4858852565
## (Intercept) passenger_number_within_group08
## -0.6495332598 -0.6698811713
Let’s take a look at the highest coefficient, Cabin Deck C, and the lowest deck coefficient, Cabin Deck T.
inv.logit(2.661411466) # C## [1] 0.9347109
inv.logit(-0.697263538) # T## [1] 0.3324192
A passenger assigned to Deck C has a 93% implied chance of being transported to the alternate dimension, while a passenger assigned to Deck T has a 33% chance.
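Note that the inverse logit of a lone coefficient assumes every other term, including the intercept, contributes zero. Adding the intercept gives a slightly more honest baseline; a sketch using the coefficients printed above:
# gtools::inv.logit(x) is 1 / (1 + exp(-x)).
# Intercept plus deck coefficient: implied probability for a passenger on
# that deck when all remaining predictors are zero.
inv.logit(-0.6495333 + 2.6511879) # Deck C: ~0.88
inv.logit(-0.6495333 - 0.4406528) # Deck T: ~0.25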
Now we can use train_exp_val for validation.
res_val_logres_1 <- predict(logistic_model_stepboth,
                            train_exp_val,
                            type = "response")
transported_pred <- ifelse(res_val_logres_1 > 0.5, TRUE, FALSE)
Now create the confusion matrix.
logres_confmat <- confusionMatrix(data = as.factor(transported_pred),
reference = as.factor(train_exp_val$transported),
positive = "TRUE")
logres_confmat## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 649 164
## TRUE 201 725
##
## Accuracy : 0.7901
## 95% CI : (0.7702, 0.809)
## No Information Rate : 0.5112
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.5796
##
## Mcnemar's Test P-Value : 0.05952
##
## Sensitivity : 0.8155
## Specificity : 0.7635
## Pos Pred Value : 0.7829
## Neg Pred Value : 0.7983
## Prevalence : 0.5112
## Detection Rate : 0.4169
## Detection Prevalence : 0.5325
## Balanced Accuracy : 0.7895
##
## 'Positive' Class : TRUE
##
For KNN we can only proceed with numeric features, and they need to be scaled as well. We transform the learning and validation sets, scaling the validation set with the learning set’s centers and scales.
knn_train_exp_learn <- train_exp_learn %>%
mutate(cryo_sleep = as.numeric(cryo_sleep),
vip = as.numeric(vip)) %>%
select(where(is.numeric)) %>% scale(.)
head(knn_train_exp_learn,5)## cryo_sleep age vip room_service food_court shopping_mall
## [1,] -0.7479719 -0.2106571 -0.1529581 0.4798134 -0.1639743 -0.2362479
## [2,] -0.7479719 -2.0150746 -0.1529581 -0.3374959 -0.2876490 -0.2849046
## [3,] -0.7479719 0.2057469 -0.1529581 6.6244392 -0.2876490 0.4433242
## [4,] 1.3367563 -0.8352632 -0.1529581 -0.3374959 -0.2876490 -0.2849046
## [5,] 1.3367563 -1.7374719 -0.1529581 -0.3374959 -0.2876490 -0.2849046
## spa vr_deck lux_spent_average lux_spent_sum lux_spent_median
## [1,] -0.27106492 -0.2624567 -0.2432412 -0.2432412 -0.1415593
## [2,] -0.27106492 -0.2633274 -0.5244943 -0.5244943 -0.2592790
## [3,] -0.09508641 -0.2633274 1.4137927 1.4137927 0.4862791
## [4,] -0.27106492 -0.2633274 -0.5244943 -0.5244943 -0.2592790
## [5,] -0.27106492 -0.2633274 -0.5244943 -0.5244943 -0.2592790
## lux_spent_var lux_spent_sd lux_spent_iqr
## [1,] -0.1843014 -0.2707499 -0.1044475
## [2,] -0.1944667 -0.5294325 -0.3661898
## [3,] 0.5663459 1.7085028 0.2491102
## [4,] -0.1944667 -0.5294325 -0.3661898
## [5,] -0.1944667 -0.5294325 -0.3661898
knn_train_exp_val <- train_exp_val %>%
mutate(cryo_sleep = as.numeric(cryo_sleep),
vip = as.numeric(vip)) %>%
select(where(is.numeric))
knn_train_exp_val <- scale(x = knn_train_exp_val,
center = attr(knn_train_exp_learn,"scaled:center"),
scale = attr(knn_train_exp_learn,"scaled:scale"))
head(knn_train_exp_val,5)## cryo_sleep age vip room_service food_court shopping_mall
## [1,] -0.7479719 -0.3494584 -0.1529581 -0.1761069 -0.2818517 -0.2443574
## [2,] -0.7479719 2.0101645 6.5367961 -0.2738287 2.0157919 -0.2849046
## [3,] -0.7479719 1.3161577 -0.1529581 0.7270790 -0.2870048 -0.1794818
## [4,] 1.3367563 1.1079557 -0.1529581 -0.3374959 -0.2876490 -0.2849046
## [5,] 1.3367563 1.3855584 -0.1529581 -0.3374959 -0.2876490 -0.2849046
## spa vr_deck lux_spent_average lux_spent_sum lux_spent_median
## [1,] 0.2374204 -0.2250162 -0.2573946 -0.2573946 -0.08662347
## [2,] 5.9483861 -0.2206627 3.2435705 3.2435705 -0.06700352
## [3,] -0.2710649 -0.2424304 -0.2309024 -0.2309024 -0.16510327
## [4,] -0.2710649 -0.2633274 -0.5244943 -0.5244943 -0.25927904
## [5,] -0.2710649 -0.2633274 -0.5244943 -0.5244943 -0.25927904
## lux_spent_var lux_spent_sd lux_spent_iqr
## [1,] -0.1849762 -0.2794830 -0.2510780
## [2,] 1.4661160 2.7768422 4.4753577
## [3,] -0.1765968 -0.1864524 -0.2784856
## [4,] -0.1944667 -0.5294325 -0.3661898
## [5,] -0.1944667 -0.5294325 -0.3661898
Extract the response variable from the predictors.
knn_train_learn_target <- train_exp_learn %>% select(transported) %>% pull() %>% as.numeric()
knn_train_val_target <- train_exp_val %>% select(transported) %>% pull() %>% as.numeric() # for reference
We pick the number of neighbors K; a common heuristic is the square root of the number of rows.
nrow(knn_train_exp_learn)## [1] 6954
sqrt(nrow(knn_train_exp_learn)) %>% round()## [1] 83
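The square-root rule is only a heuristic; when time allows, K can also be tuned against the validation set. A minimal sketch using the objects above (the candidate grid is an arbitrary choice):
# Try a few K values around the heuristic and compare validation accuracy.
ks <- c(21, 41, 61, 83, 101)
acc <- sapply(ks, function(k) {
  pred <- knn(train = knn_train_exp_learn,
              test = knn_train_exp_val,
              cl = knn_train_learn_target,
              k = k)
  mean(pred == knn_train_val_target) # factor levels "0"/"1" match the numeric targets
})
setNames(acc, ks)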
Now let’s run the validation with K = 83.
res_val_knn_1 <- knn(train = knn_train_exp_learn,
test = knn_train_exp_val,
cl = knn_train_learn_target,
k = 83)
res_val_knn_1[1:20]## [1] 0 0 0 1 1 1 1 0 1 0 0 1 0 0 1 0 0 1 1 1
## Levels: 0 1
Create the confusion matrix.
knn_confmat <- confusionMatrix(data = res_val_knn_1, reference = as.factor(knn_train_val_target), positive = "1")
knn_confmat## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 629 149
## 1 221 740
##
## Accuracy : 0.7872
## 95% CI : (0.7672, 0.8063)
## No Information Rate : 0.5112
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5735
##
## Mcnemar's Test P-Value : 0.0002233
##
## Sensitivity : 0.8324
## Specificity : 0.7400
## Pos Pred Value : 0.7700
## Neg Pred Value : 0.8085
## Prevalence : 0.5112
## Detection Rate : 0.4255
## Detection Prevalence : 0.5526
## Balanced Accuracy : 0.7862
##
## 'Positive' Class : 1
##
logres_confmat$overall[1]## Accuracy
## 0.7901093
knn_confmat$overall[1]## Accuracy
## 0.787234
logres_confmat$byClass## Sensitivity Specificity Pos Pred Value
## 0.8155231 0.7635294 0.7829374
## Neg Pred Value Precision Recall
## 0.7982780 0.7829374 0.8155231
## F1 Prevalence Detection Rate
## 0.7988981 0.5112133 0.4169063
## Detection Prevalence Balanced Accuracy
## 0.5324899 0.7895262
knn_confmat$byClass## Sensitivity Specificity Pos Pred Value
## 0.8323960 0.7400000 0.7700312
## Neg Pred Value Precision Recall
## 0.8084833 0.7700312 0.8323960
## F1 Prevalence Detection Rate
## 0.8000000 0.5112133 0.4255319
## Detection Prevalence Balanced Accuracy
## 0.5526164 0.7861980
Logistic regression gives the better result on accuracy and precision (pos pred value), but KNN is better at recall (sensitivity).
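To compare the two models at a glance, the headline metrics can be pulled into one small table; a sketch using the confusion-matrix objects above:
# Side-by-side validation metrics from the two confusionMatrix objects.
data.frame(
  model = c("logistic regression", "knn"),
  accuracy = c(logres_confmat$overall["Accuracy"], knn_confmat$overall["Accuracy"]),
  recall = c(logres_confmat$byClass["Sensitivity"], knn_confmat$byClass["Sensitivity"]),
  precision = c(logres_confmat$byClass["Pos Pred Value"], knn_confmat$byClass["Pos Pred Value"])
)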
ROC
# prediction objects
logress_roc_pred <- prediction(predictions = as.numeric(transported_pred),
labels = as.numeric(train_exp_val$transported))
knn_roc_pred <- prediction(predictions = as.numeric(res_val_knn_1),
                           labels = as.numeric(knn_train_val_target))
# ROC curve
plot(performance(prediction.obj = logress_roc_pred, measure = "tpr", x.measure = "fpr"),
     main = "Logistic Regression ROC Curve")
plot(performance(prediction.obj = knn_roc_pred, measure = "tpr", x.measure = "fpr"),
     main = "K-Nearest Neighbors ROC Curve")
AUC
# AUC value
logress_auc <- performance(prediction.obj = logress_roc_pred, measure = "auc")
logress_auc@y.values## [[1]]
## [1] 0.7895262
knn_auc <- performance(prediction.obj = knn_roc_pred, measure = "auc")
knn_auc@y.values## [[1]]
## [1] 0.786198
For the prediction submission I will use logistic regression, since it gives the better result on the validation data. But before applying it to the test data, I want to check the model’s assumptions and improve it.
Assumption one: there should be a linear relationship between the response and the predictors (on the log-odds scale). As a rough check, we run correlation tests; a predictor passes if the alternative hypothesis (true correlation not equal to zero) is supported.
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$cryo_sleep))##
## Pearson's product-moment correlation
##
## data: as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$cryo_sleep)
## t = 42.914, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4388455 0.4760134
## sample estimates:
## cor
## 0.4576293
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$spa))##
## Pearson's product-moment correlation
##
## data: as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$spa)
## t = -18.344, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2371786 -0.1923395
## sample estimates:
## cor
## -0.2148723
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$cabin_deck))##
## Pearson's product-moment correlation
##
## data: as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$cabin_deck)
## t = -10.299, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.14567794 -0.09937576
## sample estimates:
## cor
## -0.1225936
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$vr_deck))##
## Pearson's product-moment correlation
##
## data: as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$vr_deck)
## t = -17.265, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2251959 -0.1801193
## sample estimates:
## cor
## -0.202765
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$room_service))##
## Pearson's product-moment correlation
##
## data: as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$room_service)
## t = -20.343, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2590906 -0.2147220
## sample estimates:
## cor
## -0.2370299
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$lux_spent_sd))##
## Pearson's product-moment correlation
##
## data: as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$lux_spent_sd)
## t = -14.233, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1910192 -0.1453412
## sample estimates:
## cor
## -0.1682705
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$cabin_side))##
## Pearson's product-moment correlation
##
## data: as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$cabin_side)
## t = 8.0924, df = 6952, p-value = 6.853e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.07326383 0.11983368
## sample estimates:
## cor
## 0.09660163
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$cabin_number))##
## Pearson's product-moment correlation
##
## data: as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$cabin_number)
## t = 3.9564, df = 6952, p-value = 7.683e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02392073 0.07082347
## sample estimates:
## cor
## 0.04739822
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$home_planet))##
## Pearson's product-moment correlation
##
## data: as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$home_planet)
## t = 10.399, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1005491 0.1468378
## sample estimates:
## cor
## 0.1237608
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$destination))##
## Pearson's product-moment correlation
##
## data: as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$destination)
## t = -8.2971, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.12224132 -0.07569371
## sample estimates:
## cor
## -0.09902169
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$age))##
## Pearson's product-moment correlation
##
## data: as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$age)
## t = -6.0834, df = 6952, p-value = 1.239e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.09610785 -0.04934834
## sample estimates:
## cor
## -0.07276808
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$passenger_number_within_group))##
## Pearson's product-moment correlation
##
## data: as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$passenger_number_within_group)
## t = 5.5778, df = 6952, p-value = 2.527e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.04331197 0.09011094
## sample estimates:
## cor
## 0.06674817
cor.test(as.numeric(train_exp_learn$transported), as.numeric(train_exp_learn$lux_spent_median))##
## Pearson's product-moment correlation
##
## data: as.numeric(train_exp_learn$transported) and as.numeric(train_exp_learn$lux_spent_median)
## t = -12.345, df = 6952, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1693851 -0.1233847
## sample estimates:
## cor
## -0.146464
All predictors pass.
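The one-at-a-time calls above can also be condensed into a single pass; a sketch collecting the correlation estimate and p-value for each of the model’s predictors:
# Run the same correlation tests in one loop.
preds <- c("cryo_sleep", "spa", "cabin_deck", "vr_deck", "room_service",
           "lux_spent_sd", "cabin_side", "cabin_number", "home_planet",
           "destination", "age", "passenger_number_within_group", "lux_spent_median")
sapply(preds, function(p) {
  ct <- cor.test(as.numeric(train_exp_learn$transported),
                 as.numeric(train_exp_learn[[p]]))
  c(cor = unname(ct$estimate), p.value = ct$p.value)
})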
Assumption two: no multicollinearity among the predictors. There should be no VIF value above 10.
vif(logistic_model_stepboth)## GVIF Df GVIF^(1/(2*Df))
## cryo_sleep 1.610049 1 1.268877
## spa 1.628082 1 1.275963
## cabin_deck 8.748344 7 1.167563
## vr_deck 1.811430 1 1.345894
## room_service 1.515327 1 1.230986
## lux_spent_sd 2.494686 1 1.579457
## cabin_side 1.027255 1 1.013536
## cabin_number 1.315425 9 1.015348
## home_planet 5.827345 2 1.553702
## destination 1.107361 2 1.025823
## age 1.148340 1 1.071606
## lux_spent_median 1.476110 1 1.214953
## passenger_number_within_group 1.118837 7 1.008053
All fine. No value above 10.
Not all extreme data points affect model quality; those that do are called influential values. Below is an illustration of how far each data point strays in terms of Cook’s distance.
plot(logistic_model_stepboth, which = 4, id.n = 3)
Here are the passengers with the top 3 Cook’s distance values, though they’re not necessarily influential.
logistic_model_stepboth.data <- augment(logistic_model_stepboth) %>% mutate(index=1:n())
logistic_model_stepboth.data %>% arrange(desc(.cooksd)) %>% slice(1:3)## # A tibble: 3 x 21
## transported cryo_sleep spa cabin_deck vr_deck room_service lux_spent_sd
## <lgl> <lgl> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 TRUE FALSE 26 T 3 0 1399.
## 2 FALSE FALSE 14 T 60 415 566.
## 3 FALSE TRUE 0 E 0 0 0
## # ... with 14 more variables: cabin_side <fct>, cabin_number <fct>,
## # home_planet <fct>, destination <fct>, age <dbl>, lux_spent_median <dbl>,
## # passenger_number_within_group <fct>, .fitted <dbl>, .resid <dbl>,
## # .std.resid <dbl>, .hat <dbl>, .sigma <dbl>, .cooksd <dbl>, index <int>
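For reference, the same top 3 can be pulled without broom, straight from the fitted model; a sketch:
# cooks.distance() returns one value per observation used in the fit.
head(sort(cooks.distance(logistic_model_stepboth), decreasing = TRUE), 3)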
Let’s visualize to see whether any standardized residuals (.std.resid) rise above 3.
ggplot(logistic_model_stepboth.data, aes(index, .std.resid)) +
  geom_point(aes(color = transported), alpha = .5)
An influential data point has .std.resid above 3.
logistic_model_stepboth.data %>% filter(.std.resid > 3)## # A tibble: 1 x 21
## transported cryo_sleep spa cabin_deck vr_deck room_service lux_spent_sd
## <lgl> <lgl> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 TRUE FALSE 3131 C 498 2320 1985.
## # ... with 14 more variables: cabin_side <fct>, cabin_number <fct>,
## # home_planet <fct>, destination <fct>, age <dbl>, lux_spent_median <dbl>,
## # passenger_number_within_group <fct>, .fitted <dbl>, .resid <dbl>,
## # .std.resid <dbl>, .hat <dbl>, .sigma <dbl>, .cooksd <dbl>, index <int>
There’s a passenger from Europa assigned to Deck C who disrupts the model as an influential data point. We have to remove this passenger to improve our model.
train_exp_learn_indexed <- train_exp_learn %>%
mutate(index = 1:n())
train_exp_learn_indexed %>%
filter(
cabin_number==1,
home_planet=="Europa",
destination=="55 Cancri e",
age==40,
passenger_number_within_group=="04"
)## # A tibble: 1 x 23
## home_planet cryo_sleep destination age vip room_service food_court
## <fct> <lgl> <fct> <dbl> <lgl> <dbl> <dbl>
## 1 Europa FALSE 55 Cancri e 40 FALSE 2320 0
## # ... with 16 more variables: shopping_mall <dbl>, spa <dbl>, vr_deck <dbl>,
## # name_last <fct>, transported <lgl>, passenger_number_within_group <fct>,
## # cabin_deck <fct>, cabin_number <fct>, cabin_side <fct>,
## # lux_spent_average <dbl>, lux_spent_sum <dbl>, lux_spent_median <dbl>,
## # lux_spent_var <dbl>, lux_spent_sd <dbl>, lux_spent_iqr <dbl>, index <int>
train_exp_learn_without_infv <- train_exp_learn_indexed[train_exp_learn_indexed$index != 4391,]
Re-train the model with the improved data.
logistic_model_stepboth2 <- glm(formula = transported ~ cryo_sleep + spa + cabin_deck + vr_deck +
room_service + lux_spent_sd + cabin_side + cabin_number +
home_planet + destination + age + passenger_number_within_group +
lux_spent_median, family = binomial, data = train_exp_learn_without_infv)## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
test_expanded$pred <- predict(logistic_model_stepboth2, test_expanded,type = "response")
test_expanded$Transported <- ifelse(test_expanded$pred > 0.5, "True", "False")
test_submission <- test_expanded %>%
select(passenger_id, Transported) %>%
  rename(PassengerId = passenger_id)
test_submission[1:10,]## # A tibble: 10 x 2
## PassengerId Transported
## <fct> <chr>
## 1 0013_01 True
## 2 0018_01 False
## 3 0019_01 True
## 4 0021_01 True
## 5 0023_01 True
## 6 0027_01 True
## 7 0029_01 True
## 8 0032_01 True
## 9 0032_02 True
## 10 0033_01 True
Save the submission to CSV.
write_csv(x = test_submission, file = "husada_logistic_space_titanic_submission.csv")
knitr::include_graphics("logistic_result.JPG")