library(readr)
library(janitor)
library(tidyverse)
library(tidymodels)
library(skimr)
test <- read_csv("test.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## PassengerId = col_double(),
## Pclass = col_double(),
## Name = col_character(),
## Sex = col_character(),
## Age = col_double(),
## SibSp = col_double(),
## Parch = col_double(),
## Ticket = col_character(),
## Fare = col_double(),
## Cabin = col_character(),
## Embarked = col_character()
## )
train <- read_csv("train.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## PassengerId = col_double(),
## Survived = col_double(),
## Pclass = col_double(),
## Name = col_character(),
## Sex = col_character(),
## Age = col_double(),
## SibSp = col_double(),
## Parch = col_double(),
## Ticket = col_character(),
## Fare = col_double(),
## Cabin = col_character(),
## Embarked = col_character()
## )
full_dat <- (bind_rows(train, test)) %>%
clean_names() %>%
mutate(index = case_when(is.na(survived)=="FALSE"~"train",
TRUE ~ "test"),
survived = factor(survived,
levels = c("0", "1"),
labels = c("0", "1")),
pclass = factor(pclass,
levels = c("1", "2", "3"),
labels = c("1st", "2nd", "3rd")),
embarked = as.factor(embarked),
sex = as.factor(sex))
preprocess_fulldat <- recipe(~., data = full_dat) %>%
step_knnimpute(embarked) %>%
step_meanimpute(fare, age) %>%
prep()
fulldat2<- (bake(preprocess_fulldat, new_data = NULL))
train2 <- fulldat2[fulldat2$index=="train",]
test2 <- fulldat2[fulldat2$index=="test",]
explore survival status by other factors
skim(train2)
| Name | train2 |
| Number of rows | 891 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| factor | 8 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| survived | 0 | 1.00 | FALSE | 2 | 0: 549, 1: 342 |
| pclass | 0 | 1.00 | FALSE | 3 | 3rd: 491, 1st: 216, 2nd: 184 |
| name | 0 | 1.00 | FALSE | 891 | Abb: 1, Abb: 1, Abb: 1, Abe: 1 |
| sex | 0 | 1.00 | FALSE | 2 | mal: 577, fem: 314 |
| ticket | 0 | 1.00 | FALSE | 681 | 160: 7, 347: 7, CA.: 7, 310: 6 |
| cabin | 687 | 0.23 | FALSE | 147 | B96: 4, C23: 4, G6: 4, C22: 3 |
| embarked | 0 | 1.00 | FALSE | 3 | S: 644, C: 170, Q: 77 |
| index | 0 | 1.00 | FALSE | 1 | tra: 891, tes: 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| passenger_id | 0 | 1 | 446.00 | 257.35 | 1.00 | 223.50 | 446.00 | 668.5 | 891.00 | ▇▇▇▇▇ |
| age | 0 | 1 | 29.74 | 13.00 | 0.42 | 22.00 | 29.88 | 35.0 | 80.00 | ▂▇▃▁▁ |
| sib_sp | 0 | 1 | 0.52 | 1.10 | 0.00 | 0.00 | 0.00 | 1.0 | 8.00 | ▇▁▁▁▁ |
| parch | 0 | 1 | 0.38 | 0.81 | 0.00 | 0.00 | 0.00 | 0.0 | 6.00 | ▇▁▁▁▁ |
| fare | 0 | 1 | 32.20 | 49.69 | 0.00 | 7.91 | 14.45 | 31.0 | 512.33 | ▇▁▁▁▁ |
train2 %>% ggplot(aes(survived, fare, fill = sex)) +
geom_boxplot() +
scale_y_log10()
train2 %>% ggplot(aes(survived, age, fill = sex)) +
geom_boxplot()
train2 %>% ggplot(aes(survived, age, fill= survived)) +
geom_boxplot() +
facet_wrap(~embarked)
train2 %>%
group_by(survived, sex) %>%
summarise(mean_age = mean(age, na.rm = T)) %>%
pivot_wider(names_from = sex,
values_from = mean_age)
## # A tibble: 2 x 3
## # Groups: survived [2]
## survived female male
## <fct> <dbl> <dbl>
## 1 0 26.1 31.2
## 2 1 29.0 27.7
train2 %>%
group_by(survived, sex) %>%
summarise(mean_fare = mean(fare, na.rm = T)) %>%
pivot_wider(names_from = sex,
values_from = mean_fare)
## # A tibble: 2 x 3
## # Groups: survived [2]
## survived female male
## <fct> <dbl> <dbl>
## 1 0 23.0 22.0
## 2 1 51.9 40.8
train2 %>%
group_by(survived, sex) %>%
summarise(mean_parch = mean(parch, na.rm = T)) %>%
pivot_wider(names_from = sex,
values_from = mean_parch)
## # A tibble: 2 x 3
## # Groups: survived [2]
## survived female male
## <fct> <dbl> <dbl>
## 1 0 1.04 0.207
## 2 1 0.515 0.358
#STEP 2: BUILDING MODEL
set.seed(111)
ttn_split <- initial_split(train2, strata = survived)
ttn_training <- training(ttn_split)
ttn_tesing <- testing(ttn_split)
set.seed(123)
ttn_resampling <- bootstraps(ttn_training, strata = survived)
##2.3 buid preprocessing and function
library(themis) # to downsample
## Registered S3 methods overwritten by 'themis':
## method from
## bake.step_downsample recipes
## bake.step_upsample recipes
## prep.step_downsample recipes
## prep.step_upsample recipes
## tidy.step_downsample recipes
## tidy.step_upsample recipes
## tunable.step_downsample recipes
## tunable.step_upsample recipes
##
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
##
## step_downsample, step_upsample
ttn_rcp <- recipe(survived ~ ., data = ttn_training) %>%
update_role(passenger_id, new_role = "id") %>%
step_downsample(survived) %>% # we need to upsample since died and survived class are imbalance but we want to retain obs as much as possible
step_rm(cabin, ticket, name) # convert NA value to "unknown"
ttn_rcp
## Data Recipe
##
## Inputs:
##
## role #variables
## id 1
## outcome 1
## predictor 11
##
## Operations:
##
## Down-sampling based on survived
## Delete terms cabin, ticket, name
library(ranger)
rf_model <- rand_forest (trees = tune(),
mtry = tune(),
min_n = tune()) %>%
set_engine("ranger") %>%
set_mode("classification")
rf_model
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = tune()
## min_n = tune()
##
## Computational engine: ranger
ttn_wf <- workflow ()%>%
add_recipe(ttn_rcp) %>%
add_model(rf_model)
ttn_wf
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: rand_forest()
##
## -- Preprocessor ----------------------------------------------------------------
## 2 Recipe Steps
##
## * step_downsample()
## * step_rm()
##
## -- Model -----------------------------------------------------------------------
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = tune()
## min_n = tune()
##
## Computational engine: ranger
grid_srch <- grid_regular(min_n(range = c(1,20)),
mtry(range = c(1,8)),# Maximum is the number of variable
trees(range = c(50,1000)),
levels = 5)
tuned_model <- tune_grid(ttn_wf,
resamples = ttn_resampling,
grid = grid_srch,
metrics = metric_set(accuracy, roc_auc, sensitivity, specificity),
control = control_resamples(save_pred = T))
## Warning: package 'rlang' was built under R version 4.0.4
## Warning: package 'vctrs' was built under R version 4.0.4
show_best(tuned_model, "roc_auc")
## # A tibble: 5 x 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2 762 5 roc_auc binary 0.845 25 0.00353 Preprocessor1_Model0~
## 2 2 1000 10 roc_auc binary 0.845 25 0.00365 Preprocessor1_Model1~
## 3 2 762 1 roc_auc binary 0.845 25 0.00342 Preprocessor1_Model0~
## 4 2 525 1 roc_auc binary 0.845 25 0.00350 Preprocessor1_Model0~
## 5 2 1000 5 roc_auc binary 0.845 25 0.00359 Preprocessor1_Model1~
show_best(tuned_model, "accuracy")
## # A tibble: 5 x 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2 1000 1 accuracy binary 0.788 25 0.00375 Preprocessor1_Model~
## 2 2 525 1 accuracy binary 0.788 25 0.00364 Preprocessor1_Model~
## 3 4 762 20 accuracy binary 0.788 25 0.00407 Preprocessor1_Model~
## 4 4 287 15 accuracy binary 0.788 25 0.00386 Preprocessor1_Model~
## 5 4 1000 15 accuracy binary 0.787 25 0.00392 Preprocessor1_Model~
finalized_model <- finalize_model(rf_model, select_best(tuned_model, "accuracy"))
finalized_model
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = 2
## trees = 1000
## min_n = 1
##
## Computational engine: ranger
tunned_wf <- workflow() %>%
add_recipe(ttn_rcp) %>%
add_model(finalized_model)
final_fit <-tunned_wf %>% last_fit(ttn_split,
metrics = metric_set(accuracy, roc_auc))
final_fit %>% conf_mat_resampled()
## # A tibble: 4 x 3
## Prediction Truth Freq
## <fct> <fct> <dbl>
## 1 0 0 118
## 2 0 1 13
## 3 1 0 19
## 4 1 1 72
##3.1 Pull the workfows and fit it on testing data
saved_model <- final_fit$.workflow[[1]]
suvival_prediction <- predict(saved_model, test2)
write_csv(suvival_prediction, "submission.csv")