Apply machine learning techniques to predict what types of people were likely to survive the Titanic disaster.
Models used : logistic regression, decision tree, random forest
Achieved 80% accuracy in prediction and ranked in the top 11% among 5,601 teams as of the end of Oct-2016.
Load Packages
library(mice)
library(randomForest)
library(caTools)
library(rpart)
library(rpart.plot)
library(dplyr)
library(ggplot2)
Import Data
train <- read.csv("train.csv", stringsAsFactors = FALSE)
test <- read.csv("test.csv", stringsAsFactors = FALSE)
Combine the training and test datasets for feature engineering (Survived is NA for the test rows)
full <- bind_rows(train, test)
Look at the full dataset
str(full)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1309 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
Survived - Survival (0 = No, 1 = Yes)
Pclass - Passenger Class
Name - Passenger Name
Sex - Sex
Age - Age
SibSp - Number of Siblings/Spouses Aboard
Parch - Number of Parents/Children Aboard
Ticket - Ticket Number
Fare - Fare
Cabin - Cabin
Embarked - Port of Embarkation
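Before engineering new features, it can help to see where values are missing. A minimal sketch (assumption: empty strings count as missing for the character columns; Survived is NA for the 418 test rows by construction):
sapply(full, function(x) sum(is.na(x) | x == ""))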
Create new variable : Title
full$Title <- gsub('(.*, )|(\\..*)', '', full$Name)   # strip everything up to ", " and from the first "." onward, leaving only the title
officer <- c("Capt", "Col", "Major", "Dr", "Rev")
Royality <- c("Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady")
full$Title[full$Title %in% officer] <- "Officer"
full$Title[full$Title %in% Royality] <- "Royality"
full$Title[full$Title == "Mlle"] <- "Miss"
full$Title[full$Title == "Mme"] <- "Mrs"
full$Title[full$Title == "Ms"] <- "Mrs"
table(full$Title)
##
## Master Miss Mr Mrs Officer Royality
## 61 262 757 200 23 6
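As a quick sanity check on the new variable (a sketch mirroring the plots used for the other engineered variables), survival counts by Title on the training rows:
ggplot(full[1:891,], aes(x = Title, fill = factor(Survived))) +
  geom_bar(position = 'dodge')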
Create new variable : Family Size
full$FamilySize <- full$SibSp + full$Parch + 1
ggplot(full[1:891,], aes(x = FamilySize, fill = factor(Survived))) +
geom_bar(stat = 'count', position = 'dodge') +
scale_x_continuous(breaks = c(1:11))
Create new variable : Family Size Category
full <- full %>%
  mutate(FamilySizeCat = ifelse(FamilySize == 1, "Single",
                         ifelse(FamilySize >= 5, "Large", "Small")))
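A quick look at survival by the new category (a sketch, training rows only):
full[1:891,] %>%
  group_by(FamilySizeCat) %>%
  summarise(SurvivalRate = mean(Survived))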
Create new variable : Single Fare (the recorded Fare appears to cover everyone on the same ticket, so divide it by the number of passengers sharing the ticket)
fare.table <- full %>%
  group_by(Ticket) %>%
  summarise(Fare = mean(Fare, na.rm = TRUE),
            TicketCount = n(),
            SingleFare = Fare/TicketCount)
full <- left_join(full, fare.table, by = "Ticket")
full$Fare.y <- NULL   # drop the per-ticket mean fare brought in by the join; Fare.x keeps the original fare
ggplot(full[1:891,], aes(x = SingleFare, fill = factor(Survived))) +
geom_histogram(position = 'dodge') +
xlim(0, 50)
Create new variables : Cabin and Deck
full$Cabin[full$Cabin == ""] <- "U000"   # placeholder for unknown cabins
full$Deck <- substr(full$Cabin, 1, 1)    # deck is the first letter of the cabin
full$Deck[full$Deck == "T"] <- "A"       # the stray "T" cabin is grouped with deck A
table(full$Deck, full$Pclass)
##
## 1 2 3
## A 23 0 0
## B 65 0 0
## C 94 0 0
## D 40 6 0
## E 34 4 3
## F 0 13 8
## G 0 0 5
## U 67 254 693
ggplot(full[1:891,], aes(x = Deck, fill = factor(Survived))) +
geom_bar(position = 'dodge')
Impute Fare.x
full %>%
filter(Pclass == 3 & Cabin == "U000") %>%
group_by(Pclass, Cabin, Embarked) %>%
summarise(mean = mean(SingleFare, na.rm = TRUE))
## Source: local data frame [3 x 4]
## Groups: Pclass, Cabin [?]
##
## Pclass Cabin Embarked mean
## (int) (chr) (chr) (dbl)
## 1 3 U000 C 6.829740
## 2 3 U000 Q 7.575783
## 3 3 U000 S 7.392706
full$Fare.x[is.na(full$Fare.x)] <- 7.392706        # the one missing fare: 3rd class, no cabin, embarked at S
full$SingleFare[is.na(full$SingleFare)] <- 7.392706
Impute Embarked
full %>%
filter(Pclass == 1) %>%
group_by(Embarked) %>%
summarise(mean = mean(SingleFare, na.rm = TRUE))
## Source: local data frame [4 x 2]
##
## Embarked mean
## (chr) (dbl)
## 1 40.00000
## 2 C 38.32432
## 3 Q 30.00000
## 4 S 30.39188
full$Embarked[full$Embarked == ""] <- "C"   # the rows with missing Embarked have a mean SingleFare of 40, closest to port C
Before we impute Age, take a look at the factors that might affect the age distribution
ggplot(full, aes(x = as.factor(Pclass), y = Age)) + geom_boxplot()
ggplot(full, aes(x = as.factor(Sex), y = Age)) + geom_boxplot()
ggplot(full, aes(x = as.factor(SibSp), y = Age)) + geom_boxplot()
ggplot(full, aes(x = as.factor(Parch), y = Age)) + geom_boxplot()
ggplot(full, aes(x = as.factor(Deck), y = Age)) + geom_boxplot()
ggplot(full, aes(x = Fare.x, y = Age)) +
geom_point(aes(color = factor(Survived))) +
geom_smooth(method = "lm", se=FALSE, color="black", aes(group=1))
ggplot(full, aes(x = as.factor(TicketCount), y = Age)) + geom_boxplot()
Run imputation with selected variables
full.impute.1 <- full %>% select(Pclass, Sex, Age, SibSp, Parch, Title, FamilySize)
full.impute.1 <- complete(mice(full.impute.1))   # complete() with defaults returns the first of mice's five imputed datasets
##
## iter imp variable
## 1 1 Age
## 1 2 Age
## 1 3 Age
## 1 4 Age
## 1 5 Age
## 2 1 Age
## 2 2 Age
## 2 3 Age
## 2 4 Age
## 2 5 Age
## 3 1 Age
## 3 2 Age
## 3 3 Age
## 3 4 Age
## 3 5 Age
## 4 1 Age
## 4 2 Age
## 4 3 Age
## 4 4 Age
## 4 5 Age
## 5 1 Age
## 5 2 Age
## 5 3 Age
## 5 4 Age
## 5 5 Age
full.impute.1$Survived <- full$Survived
Run another imputation with all variables
full.impute.2 <- full %>% select(Pclass, Sex, Age, SibSp, Parch, Fare.x, Embarked, Title, FamilySize, SingleFare, Deck)
full.impute.2 <- complete(mice(full.impute.2))
##
## iter imp variable
## 1 1 Age
## 1 2 Age
## 1 3 Age
## 1 4 Age
## 1 5 Age
## 2 1 Age
## 2 2 Age
## 2 3 Age
## 2 4 Age
## 2 5 Age
## 3 1 Age
## 3 2 Age
## 3 3 Age
## 3 4 Age
## 3 5 Age
## 4 1 Age
## 4 2 Age
## 4 3 Age
## 4 4 Age
## 4 5 Age
## 5 1 Age
## 5 2 Age
## 5 3 Age
## 5 4 Age
## 5 5 Age
full.impute.2$Survived <- full$Survived
Look at the result after imputation
ggplot(full[1:891,], aes(x = Age, fill = factor(Survived))) + geom_histogram() + ylim(0,120)
ggplot(full.impute.1[1:891,], aes(x = Age, fill = factor(Survived))) + geom_histogram()
ggplot(full.impute.2[1:891,], aes(x = Age, fill = factor(Survived))) + geom_histogram()
ggplot(full[1:891,], aes(x = Age, fill = factor(Survived))) + geom_histogram(aes(y =..density..)) + ylim(0,0.1)
ggplot(full.impute.1[1:891,], aes(x = Age, fill = factor(Survived))) + geom_histogram(aes(y =..density..)) + ylim(0,0.1)
ggplot(full.impute.2[1:891,], aes(x = Age, fill = factor(Survived))) + geom_histogram(aes(y =..density..)) + ylim(0,0.1)
Comparing the 1st and 2nd imputations, the 1st one looks closer to the original age distribution, so we will use it for running the models.
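To back up this visual judgment, the observed and imputed age distributions can also be compared numerically (a sketch):
summary(full$Age)            # observed ages, still containing NAs at this point
summary(full.impute.1$Age)   # imputation with selected variables
summary(full.impute.2$Age)   # imputation with all variables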
Run logistic regression with most variables
full$Age <- full.impute.1$Age
train.impute <- full[1:891,]
test.impute <- full[892:1309,]
lg_fit1 <- glm(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare.x + Embarked + FamilySize + SingleFare + Deck + Title, data = train.impute, family = "binomial")
summary(lg_fit1)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp + Parch +
## Fare.x + Embarked + FamilySize + SingleFare + Deck + Title,
## family = "binomial", data = train.impute)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3815 -0.5562 -0.3686 0.5499 2.4743
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 19.863520 504.071639 0.039 0.9686
## Pclass -0.872740 0.220964 -3.950 7.83e-05 ***
## Sexmale -14.784280 504.070448 -0.029 0.9766
## Age -0.027625 0.008529 -3.239 0.0012 **
## SibSp -0.549119 0.128476 -4.274 1.92e-05 ***
## Parch -0.338400 0.140148 -2.415 0.0158 *
## Fare.x 0.002588 0.004444 0.583 0.5602
## EmbarkedQ -0.284929 0.395314 -0.721 0.4711
## EmbarkedS -0.418922 0.254871 -1.644 0.1002
## FamilySize NA NA NA NA
## SingleFare 0.007233 0.019722 0.367 0.7138
## DeckB 0.115772 0.755810 0.153 0.8783
## DeckC -0.272818 0.702655 -0.388 0.6978
## DeckD 0.767585 0.767021 1.001 0.3170
## DeckE 1.089875 0.763804 1.427 0.1536
## DeckF 0.504067 1.083470 0.465 0.6418
## DeckG -0.870956 1.244861 -0.700 0.4842
## DeckU -0.340483 0.673794 -0.505 0.6133
## TitleMiss -15.345195 504.070708 -0.030 0.9757
## TitleMr -3.550740 0.535861 -6.626 3.44e-11 ***
## TitleMrs -14.504139 504.070772 -0.029 0.9770
## TitleOfficer -3.310967 0.830858 -3.985 6.75e-05 ***
## TitleRoyality -3.400136 1.375808 -2.471 0.0135 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.66 on 890 degrees of freedom
## Residual deviance: 712.49 on 869 degrees of freedom
## AIC: 756.49
##
## Number of Fisher Scoring iterations: 13
Note that the FamilySize coefficient is NA because FamilySize = SibSp + Parch + 1, so it is perfectly collinear with SibSp and Parch.
predict_fit1 <- predict(lg_fit1, newdata = train.impute, type = "response")
table(train.impute$Survived, predict_fit1 > 0.5)
##
## FALSE TRUE
## 0 484 65
## 1 78 264
(484+264)/891
## [1] 0.8395062
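The same accuracy can also be computed directly from the predictions, avoiding hand-copied counts (a sketch; the same pattern applies to the later models):
mean((predict_fit1 > 0.5) == train.impute$Survived)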
Select only variables with p-value < 0.05
lg_fit2 <- glm(Survived ~ Pclass + Age + SibSp + Embarked + Parch + Title, data = train.impute, family = "binomial")
summary(lg_fit2)
##
## Call:
## glm(formula = Survived ~ Pclass + Age + SibSp + Embarked + Parch +
## Title, family = "binomial", data = train.impute)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3848 -0.5801 -0.3705 0.5370 2.4849
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.896863 0.689258 8.555 < 2e-16 ***
## Pclass -1.242591 0.142074 -8.746 < 2e-16 ***
## Age -0.025929 0.008222 -3.154 0.00161 **
## SibSp -0.525550 0.122538 -4.289 1.80e-05 ***
## EmbarkedQ -0.277634 0.389159 -0.713 0.47559
## EmbarkedS -0.470539 0.238882 -1.970 0.04887 *
## Parch -0.309174 0.130983 -2.360 0.01826 *
## TitleMiss -0.574150 0.492522 -1.166 0.24372
## TitleMr -3.565390 0.526791 -6.768 1.30e-11 ***
## TitleMrs 0.175215 0.540416 0.324 0.74577
## TitleOfficer -3.383071 0.780935 -4.332 1.48e-05 ***
## TitleRoyality -2.763221 1.072860 -2.576 0.01001 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.66 on 890 degrees of freedom
## Residual deviance: 731.05 on 879 degrees of freedom
## AIC: 755.05
##
## Number of Fisher Scoring iterations: 5
predict_fit2 <- predict(lg_fit2, newdata = train.impute, type = "response")
table(train.impute$Survived, predict_fit2 > 0.5)
##
## FALSE TRUE
## 0 486 63
## 1 81 261
(486+261)/891
## [1] 0.8383838
The 1st model, with the larger number of variables, seems to have better accuracy on the training set. It will be interesting to see whether it still predicts better when applied to the test set, or whether it is overfitting.
Run decision tree
dt.fit1 <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare.x + Embarked + FamilySize + SingleFare + Deck + Title, data = train.impute, method = "class")
rpart.plot(dt.fit1)
predict_df_fit1 <- predict(dt.fit1, newdata = train.impute)
table(train.impute$Survived, predict_df_fit1[,2] > 0.5)
##
## FALSE TRUE
## 0 521 28
## 1 104 238
(521+238)/891
## [1] 0.8518519
The training accuracy from the decision tree seems too good to be true, especially compared with the logistic regression and the random forest model we will see next. It could be overfitting. But the tree does give us a clear logical path showing what types of people were likely to survive.
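One way to check the tree for overfitting is rpart's built-in cross-validation; a minimal sketch (picking the cp with the lowest cross-validated error is just one common pruning rule):
printcp(dt.fit1)   # the xerror column is the cross-validated error at each complexity parameter (cp)
best.cp <- dt.fit1$cptable[which.min(dt.fit1$cptable[, "xerror"]), "CP"]
dt.fit1.pruned <- prune(dt.fit1, cp = best.cp)
rpart.plot(dt.fit1.pruned)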
Run random forest
train.impute$Pclass <- as.factor(train.impute$Pclass)
train.impute$SibSp <- as.factor(train.impute$SibSp)
train.impute$Parch <- as.factor(train.impute$Parch)
train.impute$Embarked <- as.factor(train.impute$Embarked)
train.impute$Title <- as.factor(train.impute$Title)
train.impute$Sex <- as.factor(train.impute$Sex)
train.impute$FamilySize <- as.factor(train.impute$FamilySize)
train.impute$Deck <- as.factor(train.impute$Deck)
train.impute.rf <- train.impute %>%
select(Survived, Pclass, Age, SibSp, Parch, Embarked, Title, Sex, FamilySize, Deck, Fare.x, SingleFare)
rf_fit1 <- randomForest(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare.x + Embarked + FamilySize + SingleFare + Deck + Title, data = train.impute.rf)   # Survived is numeric, so this runs in regression mode
predict.rf.fit1 <- predict(rf_fit1)   # with no newdata, predict() returns out-of-bag (OOB) predictions
table(train.impute$Survived, predict.rf.fit1 > 0.5)
##
## FALSE TRUE
## 0 496 53
## 1 91 251
(496+251)/891
## [1] 0.8383838
varImpPlot(rf_fit1,type=2)
Choose the top 5 most important variables and run random forest again
rf_fit2 <- randomForest(Survived ~ Sex + Age + Fare.x + SingleFare + Title, data = train.impute.rf)
predict.rf.fit2 <- predict(rf_fit2)   # OOB predictions again
table(train.impute$Survived, predict.rf.fit2 > 0.5)
##
## FALSE TRUE
## 0 487 62
## 1 91 251
(487+251)/891
## [1] 0.8282828
It is a little surprising that the random forest results are not as good as either the logistic regression or the decision tree. But again, this is only against the training data, and the random forest numbers above are out-of-bag estimates, which are a more honest measure than in-sample accuracy.
Since I do not have the ground truth for the test set, it is difficult to say which model gives the best result. One thing that can be done is to split the original training set into training and validation sets and evaluate the models there, as sketched below.
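A minimal sketch using sample.split() from the caTools package loaded above; the 70/30 ratio, the seed, and re-using the lg_fit2 formula are illustrative choices, and rare Title levels may need special handling if they all fall into the validation part:
set.seed(123)
split <- sample.split(train.impute$Survived, SplitRatio = 0.7)
train.sub <- train.impute[split, ]
valid.sub <- train.impute[!split, ]
lg_valid <- glm(Survived ~ Pclass + Age + SibSp + Embarked + Parch + Title,
                data = train.sub, family = "binomial")
pred.valid <- predict(lg_valid, newdata = valid.sub, type = "response")
mean((pred.valid > 0.5) == valid.sub$Survived)   # out-of-sample accuracy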