Titanic was a British passenger liner that sank in 1912 after colliding with an iceberg. Only 31% of passengers survived in this disaster. The goal of this project is to complete the analysis of what sorts of people were likely to survive.
For more information see Titanic survival prediction challenge presented by Kaggle.
Two cvs files downloaded from Kaggle http://www.kaggle.com/c/titanic-gettingStarted/data:
Data description:
Load csv files:
library(reshape)
library(caret)
d <- read.csv("c:\\RDATA\\titanic\\train.csv")
t <- read.csv("c:\\RDATA\\titanic\\test.csv")
d.nrow<-seq(1, nrow(d)) # save the number of rows in the train dataset
d<-rbind(d, cbind(t, Survived=rep(NA, nrow(t)))) # Merge data to simplify preprocessing
Check missing values (empty or NA):
d.miss <- melt(apply(d[, -2], 2, function(x) sum(is.na(x) | x=="")))
cbind(row.names(d.miss)[d.miss$value>0], d.miss[d.miss$value>0,])
## [,1] [,2]
## [1,] "Age" "263"
## [2,] "Fare" "1"
## [3,] "Cabin" "1014"
## [4,] "Embarked" "2"
“Cabin” has missed about 80% values. We will not use this variable.
Update missing Embarked value with the most common value:
table(d$Embarked)
##
## C Q S
## 2 270 123 914
d$Embarked[which(is.na(d$Embarked) | d$Embarked=="")] <- 'S'
Some Fare values contains sum for tickets were purchased in groups. Introduce a new variable “Price” that will be Fare per person.
d$Fare[which(is.na(d$Fare))] <- 0 # Update missing Fare value with 0.
# calculate Ticket Price (Fare per person)
ticket.count <- aggregate(d$Ticket, by=list(d$Ticket), function(x) sum( !is.na(x) ))
d$Price<-apply(d, 1, function(x) as.numeric(x["Fare"]) / ticket.count[which(ticket.count[, 1] == x["Ticket"]), 2])
Price related to passenger class. Missig price values (price=0) we can update with median price per passenger class:
pclass.price<-aggregate(d$Price, by = list(d$Pclass), FUN = function(x) median(x, na.rm = T))
d[which(d$Price==0), "Price"] <- apply(d[which(d$Price==0), ] , 1, function(x) pclass.price[pclass.price[, 1]==x["Pclass"], 2])
Introduce a new variable “TicketCount” which is the number of passengers that have the same ticket number.
d$TicketCount<-apply(d, 1, function(x) ticket.count[which(ticket.count[, 1] == x["Ticket"]), 2])
Extract title of each persons name to a new variable “Title”
d$Title<-regmatches(as.character(d$Name),regexpr("\\,[A-z ]{1,20}\\.", as.character(d$Name)))
d$Title<-unlist(lapply(d$Title,FUN=function(x) substr(x, 3, nchar(x)-1)))
table(d$Title)
##
## Capt Col Don Dona Dr
## 1 4 1 1 8
## Jonkheer Lady Major Master Miss
## 1 1 2 61 260
## Mlle Mme Mr Mrs Ms
## 2 1 757 197 2
## Rev Sir the Countess
## 8 1 1
Merge 17 different title groups to the most common 4 groups.
d$Title[which(d$Title %in% c("Mme", "Mlle"))] <- "Miss"
d$Title[which(d$Title %in% c("Lady", "Ms", "the Countess", "Dona"))] <- "Mrs"
d$Title[which(d$Title=="Dr" & d$Sex=="female")] <- "Mrs"
d$Title[which(d$Title=="Dr" & d$Sex=="male")] <- "Mr"
d$Title[which(d$Title %in% c("Capt", "Col", "Don", "Jonkheer", "Major", "Rev", "Sir"))] <- "Mr"
d$Title<-as.factor(d$Title) #convert to factor variable
Update unknown age with median age for each group of title:
title.age<-aggregate(d$Age,by = list(d$Title), FUN = function(x) median(x, na.rm = T))
d[is.na(d$Age), "Age"] <- apply(d[is.na(d$Age), ] , 1, function(x) title.age[title.age[, 1]==x["Title"], 2])
We merged train and test data at the begining of preprocess. Now we will split it back to “t” and “d” Data frame variables.
Data frame “t” has no “Survival” values and will be used to predict “Survival” and submit on Kaggle.
Data frame “d” that contains train data we also split to test prediction models.
t <- d[-d.nrow, ] # test data. It has no "Survival" values.
d <- d[d.nrow, ] #Train data
set.seed(1234)
inTrain<-createDataPartition(d$Survived, p = 0.8)[[1]]
fit.8 <- glm(Survived ~ Pclass+Sex+Age+SibSp+Parch+Embarked+Title+Price+TicketCount, data=d[inTrain,], family=binomial(("logit")))
summary(fit.8)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp + Parch +
## Embarked + Title + Price + TicketCount, family = binomial(("logit")),
## data = d[inTrain, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.511 -0.550 -0.358 0.528 2.604
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.1631 0.9274 5.57 2.6e-08 ***
## Pclass -1.0814 0.2242 -4.82 1.4e-06 ***
## Sexmale -0.5634 0.6150 -0.92 0.35962
## Age -0.0257 0.0107 -2.41 0.01607 *
## SibSp -0.5786 0.1576 -3.67 0.00024 ***
## Parch -0.3902 0.1728 -2.26 0.02396 *
## EmbarkedQ 0.0366 0.4490 0.08 0.93501
## EmbarkedS -0.1674 0.2806 -0.60 0.55069
## TitleMiss -0.9006 0.3955 -2.28 0.02278 *
## TitleMr -3.3396 0.5921 -5.64 1.7e-08 ***
## TitleMrs NA NA NA NA
## Price 0.0159 0.0142 1.12 0.26077
## TicketCount 0.1325 0.1002 1.32 0.18574
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 945.03 on 712 degrees of freedom
## Residual deviance: 568.36 on 701 degrees of freedom
## AIC: 592.4
##
## Number of Fisher Scoring iterations: 5
confusionMatrix(d[-inTrain,"Survived"], as.numeric(as.numeric(predict(fit.8, d[-inTrain,])>0.5)))$overall[1]
## Warning: prediction from a rank-deficient fit may be misleading
## Accuracy
## 0.8258
# Code to prepeare test data for sumbiting on Kaggle. This code commented and for refernce only.
# fit.8 <- glm(Survived ~ Pclass+Sex+Age+SibSp+Parch+Embarked+Title+Price, data=d, family=binomial(("logit")))
# predict.8<-predict(fit.8, newdata=t, type="response")
# t$Survived <- as.numeric(as.numeric(predict.8)>0.5)
# write.csv(t[,c("PassengerId", "Survived")],"titanic\\result_8.csv", row.names=F)
Kaggel score: 0.78469
fit.5 <- glm(Survived ~ Pclass+Age+SibSp+Parch+Title, data=d[inTrain,], family=binomial("logit"))
summary(fit.5)
##
## Call:
## glm(formula = Survived ~ Pclass + Age + SibSp + Parch + Title,
## family = binomial("logit"), data = d[inTrain, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.446 -0.559 -0.364 0.500 2.673
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.4003 0.7138 7.57 3.8e-14 ***
## Pclass -1.2950 0.1491 -8.69 < 2e-16 ***
## Age -0.0269 0.0105 -2.57 0.01027 *
## SibSp -0.4726 0.1324 -3.57 0.00036 ***
## Parch -0.2681 0.1464 -1.83 0.06701 .
## TitleMiss -0.3150 0.5461 -0.58 0.56405
## TitleMr -3.3076 0.5949 -5.56 2.7e-08 ***
## TitleMrs 0.5163 0.6176 0.84 0.40309
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 945.03 on 712 degrees of freedom
## Residual deviance: 573.07 on 705 degrees of freedom
## AIC: 589.1
##
## Number of Fisher Scoring iterations: 5
confusionMatrix(d[-inTrain,"Survived"], as.numeric(as.numeric(predict(fit.5, d[-inTrain,])>0.5)))$overall[1]
## Accuracy
## 0.8539
# train model using all train data and test on kaggle (test result 0.79426)
# fit.5 <- glm(Survived ~ Pclass+Age+SibSp+Parch+Title, data=d, family=binomial(("logit")))
# predict.5<-predict(fit.5, newdata=t, type="response")
# t$Survived <- as.numeric(as.numeric(predict.5)>0.5)
# write.csv(t[,c("PassengerId", "Survived")],"titanic\\result_5.csv", row.names=F)
Kaggel score: 0.79426
fit.6.grp <- glm(Survived ~ Pclass+Age+SibSp+Parch+Title+I(TicketCount>2), data=d[inTrain,], family=binomial)
summary(fit.6.grp)
##
## Call:
## glm(formula = Survived ~ Pclass + Age + SibSp + Parch + Title +
## I(TicketCount > 2), family = binomial, data = d[inTrain,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.491 -0.553 -0.366 0.502 2.516
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.1345 0.7317 7.02 2.3e-12 ***
## Pclass -1.2558 0.1509 -8.32 < 2e-16 ***
## Age -0.0241 0.0106 -2.27 0.02335 *
## SibSp -0.5409 0.1406 -3.85 0.00012 ***
## Parch -0.4229 0.1720 -2.46 0.01397 *
## TitleMiss -0.2674 0.5511 -0.49 0.62756
## TitleMr -3.2646 0.6000 -5.44 5.3e-08 ***
## TitleMrs 0.5870 0.6214 0.94 0.34484
## I(TicketCount > 2)TRUE 0.5813 0.3296 1.76 0.07773 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 945.03 on 712 degrees of freedom
## Residual deviance: 569.98 on 704 degrees of freedom
## AIC: 588
##
## Number of Fisher Scoring iterations: 5
confusionMatrix(d[-inTrain,"Survived"], as.numeric(as.numeric(predict(fit.6.grp, d[-inTrain,])>0.5)))$overall[1]
## Accuracy
## 0.8483
# Code to prepeare test data for sumbiting on Kaggle. This code commented and for refernce only.
# fit.6.grp <- glm(Survived ~ Pclass+Age+SibSp+Parch+Title+I(TicketCount>2), data=d, family=binomial)
# predict.6.grp<-predict(fit.6.grp, newdata=t, type="response")
# t$Survived <- as.numeric(as.numeric(predict.6.grp)>0.5)
# write.csv(t[,c("PassengerId", "Survived")],"titanic\\result_6_grp.csv", row.names=F)
Kaggel score: 0.79904