From the Kaggle site, https://www.kaggle.com/c/titanic, we can download the train and test set file. The test set missing on the column for Survived.
train <- read.csv(file = 'train.csv', header = TRUE, na.strings = '')
test <- read.csv(file = 'test.csv', header = TRUE, na.strings = '')
## fill in for Survived then bind two dataset
test$Survived <- NA
total_dataset <- rbind(train, test)
## quick look at the dataset
summary(total_dataset)
## PassengerId Survived Pclass Name
## Min. : 1 Min. :0.0000 Min. :1.000 Length:1309
## 1st Qu.: 328 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median : 655 Median :0.0000 Median :3.000 Mode :character
## Mean : 655 Mean :0.3838 Mean :2.295
## 3rd Qu.: 982 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :1309 Max. :1.0000 Max. :3.000
## NA's :418
## Sex Age SibSp Parch
## Length:1309 Min. : 0.17 Min. :0.0000 Min. :0.000
## Class :character 1st Qu.:21.00 1st Qu.:0.0000 1st Qu.:0.000
## Mode :character Median :28.00 Median :0.0000 Median :0.000
## Mean :29.88 Mean :0.4989 Mean :0.385
## 3rd Qu.:39.00 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :80.00 Max. :8.0000 Max. :9.000
## NA's :263
## Ticket Fare Cabin Embarked
## Length:1309 Min. : 0.000 Length:1309 Length:1309
## Class :character 1st Qu.: 7.896 Class :character Class :character
## Mode :character Median : 14.454 Mode :character Mode :character
## Mean : 33.295
## 3rd Qu.: 31.275
## Max. :512.329
## NA's :1
str(total_dataset)
## 'data.frame': 1309 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr NA "C85" NA "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
library(tidyverse)
library(mice)
library(caret)
First we check the variable Name, we want to extract the title information from it, as it may indicate gender and age or some additional information maybe relevent for the prediction.
## create empty column
total_dataset$Title <- NA
## parse out the title
total_dataset$Title <- str_sub(total_dataset$Name, str_locate(total_dataset$Name, ",")[, 1] + 2, str_locate(total_dataset$Name, "\\.")[,1]-1)
## plot to see the frequency
total_dataset %>%
group_by(Title) %>%
summarise(n = n()) %>%
ggplot() +
geom_bar(aes(x= reorder(Title, desc(n)), y= n), stat = 'identity')
## `summarise()` ungrouping output (override with `.groups` argument)
We see there are only Mr, Miss, Mrs and Master are the most common, while other titles have negelibale frequency, therefore we can group the low frequency ones into the standard title.
## master as unmarried man
total_dataset$Title[total_dataset$Title %in% c('Mme', 'Lady', 'Ms', 'the Countess', 'Dona')] <- 'Mrs'
total_dataset$Title[total_dataset$Title %in% c('Capt', 'Don', 'Col', 'Dr', 'Jonkheer', 'Major', 'Rev', 'Sir')] <- 'Mr'
total_dataset$Title[total_dataset$Title %in% c('Mlle')] <- 'Miss'
## after grouping
table(total_dataset$Title)
##
## Master Miss Mr Mrs
## 61 262 783 203
We change all other character columns into factors
total_dataset$Survived <- as.factor(total_dataset$Survived)
total_dataset$Pclass <- as.factor(total_dataset$Pclass)
total_dataset$Sex <- as.factor(total_dataset$Sex)
total_dataset$Embarked <- as.factor(total_dataset$Embarked)
total_dataset$Title <- as.factor(total_dataset$Title)
For the numercial variables SibSp and Parch, we will create a new variable to group those family informations into a single factor.
## only consider total family size
total_dataset <- mutate(total_dataset, FamSize = SibSp + Parch)
The dataset has large missing values in Age (263), and a few in Fare(1), Embarked(2). We will use the mice package to impute the missing value.
set.seed(123)
total_dataset_clean <- total_dataset %>%
select(-c(PassengerId, Survived, Name, SibSp, Parch)) %>%
mice(print = FALSE)
## Warning: Number of logged events: 2
By default mice impute 5 sets of computation, after compare different entry with NA’s, chosse a sensible imputation.
dataset <- complete(total_dataset_clean, 1)
Select final columns. Ticket and Cabin can be considered, but there are too many missing values and their represents similiar information with Fare, so we will drop them for this analysis.
dataset <- dataset %>%
select(-c(Ticket, Cabin)) %>%
mutate(Survived = total_dataset$Survived)
Because we will use xgboost, so we will do one hot encoding for the categorical variables.
## dummy variable to pass the process
dummy_var <- dummyVars(~Pclass + Sex + Title,
data = dataset,
levelsOnly = TRUE)
dataset_dummy <- predict(dummy_var, dataset)
## drop the original variable
dataset2 <- dataset %>%
mutate(dataset_dummy) %>%
select(-c(Pclass, Sex, Title))
As the dataset was split already, we will use the row number to keep the training and testing set.
train_dataset <- dataset2[1:891,]
test_dataset <- dataset2[892:1309,]
Setup the train control for the caret model
## 10 fold cv, 3 times
train_control <- trainControl(method = 'repeatedcv',
number = 10,
repeats = 3,
search = 'grid')
We will try varies model, then compare their performance.
## logistics
model_lg <- train(Survived ~ .,
data = train_dataset,
method = 'glm',
family = binomial(),
tuneLength = 5,
trControl = train_control,
set.seed(1234))
## random forest (slow)
model_rf <- train(Survived ~ .,
data = train_dataset,
method = 'rf',
tuneLength = 5,
trControl = train_control,
set.seed(1234))
## Navie bayes
model_nb <- train(Survived ~ .,
data = train_dataset,
method = 'naive_bayes',
tuneLength = 5,
trControl = train_control,
set.seed(1234))
## knn
model_knn <- train(Survived ~ .,
data = train_dataset,
method = 'knn',
tuneLength = 5,
preProcess = c('center', 'scale'),
trControl = train_control)
## gradient boosting
model_gbm <- train(Survived ~ .,
data = train_dataset,
method = 'gbm',
tuneLength = 5,
trControl = train_control,
verbose = FALSE,
set.seed(1234))
## xgboost (slow)
model_xgb <- train(Survived ~ .,
data = train_dataset,
method = 'xgbTree',
tuneLength = 5,
trControl = train_control,
set.seed(1234))
We can use resamples from caret to compare models, then we can decide which model to use or group.
## compare models
results <- resamples(list(log=model_lg, rf=model_rf, nb=model_nb, knn = model_knn, xgb=model_xgb, gbm=model_gbm))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: log, rf, nb, knn, xgb, gbm
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## log 0.7528090 0.8112870 0.8314607 0.8353979 0.8647855 0.9222222 0
## rf 0.7191011 0.8128788 0.8323970 0.8379873 0.8651685 0.9000000 0
## nb 0.7191011 0.7733657 0.7966037 0.7945907 0.8089888 0.8651685 0
## knn 0.7528090 0.7977528 0.8146067 0.8159689 0.8426966 0.8750000 0
## xgb 0.7888889 0.8314607 0.8426966 0.8462723 0.8634831 0.9325843 0
## gbm 0.7865169 0.8186925 0.8426966 0.8425270 0.8662921 0.9438202 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## log 0.4764706 0.5928866 0.6348348 0.6480386 0.7074114 0.8337731 0
## rf 0.3948871 0.5887276 0.6348348 0.6457644 0.7095171 0.7826323 0
## nb 0.4336981 0.5177730 0.5717516 0.5685024 0.5965776 0.7176097 0
## knn 0.4822845 0.5592411 0.5921934 0.6041449 0.6649313 0.7290034 0
## xgb 0.5488127 0.6369323 0.6644218 0.6693879 0.7050655 0.8539387 0
## gbm 0.5348006 0.6025020 0.6591904 0.6602655 0.7129447 0.8789774 0
modelCor(results)
## log rf nb knn xgb gbm
## log 1.00000000 -0.04519476 -0.05458422 0.38706243 0.2793724 -0.06636379
## rf -0.04519476 1.00000000 -0.05383537 -0.05964235 0.1647165 -0.13359733
## nb -0.05458422 -0.05383537 1.00000000 -0.03405458 -0.1031276 -0.05869008
## knn 0.38706243 -0.05964235 -0.03405458 1.00000000 0.4181593 -0.07832586
## xgb 0.27937245 0.16471651 -0.10312762 0.41815934 1.0000000 -0.22633993
## gbm -0.06636379 -0.13359733 -0.05869008 -0.07832586 -0.2263399 1.00000000
Accuracy is similar between all models, except navie bayes being below 80%, while the corraltion matrix shows the models all differ, so it is useful to bag or stack them.
## plot view
bwplot(results)
We will create a new data frame to store the predictions and taking averageing for their predication.
pred_table_voting <- NULL
## model predictions
pred_table_voting$gbm <- predict(model_gbm, newdata = train_dataset, type = 'prob')
pred_table_voting$lg <- predict(model_lg, newdata = train_dataset, type = 'prob')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
pred_table_voting$rf <- predict(model_rf, newdata = train_dataset, type = 'prob')
pred_table_voting$nb <- predict(model_nb, newdata = train_dataset, type = 'prob')
pred_table_voting$knn <- predict(model_knn, newdata = train_dataset, type = 'prob')
pred_table_voting$xgb <- predict(model_xgb, newdata = train_dataset, type = 'prob')
## original value
pred_table_voting$Survived <- train_dataset$Survived
pred_table_voting <- as.data.frame(pred_table_voting)
## nb seems bad, so we kept it out
pred_table_voting <- pred_table_voting %>%
mutate(Survived_prob = (gbm.1+lg.1+rf.1+knn.1+xgb.1)/5) %>%
mutate(Survived_pred = if_else(Survived_prob>=0.5, 1, 0))
Check the groupped performance
## confustion matrix
cm_final <- confusionMatrix(as.factor(pred_table_voting$Survived_pred), as.factor(train_dataset$Survived), positive = '1')
cm_final
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 520 71
## 1 29 271
##
## Accuracy : 0.8878
## 95% CI : (0.8652, 0.9077)
## No Information Rate : 0.6162
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7571
##
## Mcnemar's Test P-Value : 4.132e-05
##
## Sensitivity : 0.7924
## Specificity : 0.9472
## Pos Pred Value : 0.9033
## Neg Pred Value : 0.8799
## Prevalence : 0.3838
## Detection Rate : 0.3042
## Detection Prevalence : 0.3367
## Balanced Accuracy : 0.8698
##
## 'Positive' Class : 1
##
The weighted model has an improved accuracy than individual models, so we will apply them onto the test set.
## create prob predication
pred_table_voting2 <- NULL
pred_table_voting2$gbm <- predict(model_gbm, newdata = test_dataset, type = 'prob')
pred_table_voting2$lg <- predict(model_lg, newdata = test_dataset, type = 'prob')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
pred_table_voting2$rf <- predict(model_rf, newdata = test_dataset, type = 'prob')
pred_table_voting2$knn <- predict(model_knn, newdata = test_dataset, type = 'prob')
pred_table_voting2$xgb <- predict(model_xgb, newdata = test_dataset, type = 'prob')
## convert to a dataframe
pred_table_voting2 <- as.data.frame(pred_table_voting2)
## find average
pred_table_voting2 <- pred_table_voting2 %>%
mutate(dummy_sum = (gbm.1+lg.1+rf.1+knn.1+xgb.1)/5) %>%
mutate(Survived = if_else(dummy_sum > 0.5, 1, 0)) %>%
mutate(PassengerId = test[ ,1]) %>%
select(PassengerId, Survived)
write.csv(pred_table_voting2, 'Output.final.csv', row.names = FALSE)
Final score of 0.77033.