What was the Titanic?

RMS Titanic was a British passenger liner that sank in the North Atlantic Ocean in the early hours of 15 April 1912, after colliding with an iceberg during her maiden voyage from Southampton to New York City. There were an estimated 2,224 passengers and crew aboard, and more than 1,500 died, making it one of the deadliest commercial peacetime maritime disasters in modern history. RMS Titanic was the largest ship afloat at the time she entered service and was the second of three Olympic-class ocean liners operated by the White Star Line. She was built by the Harland and Wolff shipyard in Belfast. Thomas Andrews, chief naval architect of the shipyard at the time, died in the disaster.

Loading the data.

# Read the Kaggle Titanic test and training CSV files from the local folder.
test <- read.csv(file = "F:/Data science/practice/titanic/test.csv")
train <- read.csv(file = "F:/Data science/practice/titanic/train.csv")

Loading the libraries.

library(dplyr)
library(tidyr)
library(ggplot2)
library(caret)
library(rpart)
library(rpart.plot)

Combining the datasets so that the same feature engineering is applied to both, for a better model.

# Tag every row with the file it came from, and give the test frame a
# placeholder Survived column so both frames have the same columns.
train[["set"]] <- "train"
test[["Survived"]] <- NA
test[["set"]] <- "test"

# Stack the training rows on top of the test rows.
full <- rbind(train, test)

Looking at the combined dataset.

# Compactly print each column's type and leading values of the combined frame.
str(full)

## 'data.frame':    1309 obs. of  13 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
##  $ set        : chr  "train" "train" "train" "train" ...

Checking the number of unique values in each column (to spot repeated entries).

# Number of distinct values (NA counts as one value) in every column.
lapply(full, function(column) length(unique(column)))

## $PassengerId
## [1] 1309
## 
## $Survived
## [1] 3
## 
## $Pclass
## [1] 3
## 
## $Name
## [1] 1307
## 
## $Sex
## [1] 2
## 
## $Age
## [1] 99
## 
## $SibSp
## [1] 7
## 
## $Parch
## [1] 8
## 
## $Ticket
## [1] 929
## 
## $Fare
## [1] 282
## 
## $Cabin
## [1] 187
## 
## $Embarked
## [1] 4
## 
## $set
## [1] 2

Checking for missing values.

# Share of NA entries per column, in long form: one row per column with
# `feature` (column name) and `missing_pt` (proportion between 0 and 1).
# Uses base R instead of the deprecated dplyr::funs() / superseded gather().
missing_share <- colMeans(is.na(full))
missing_values <- data.frame(
  feature = names(missing_share),
  missing_pt = unname(missing_share),
  stringsAsFactors = FALSE
)

Creating a visualization for missing values.

# Horizontal bar chart of the missing-value share per feature, largest first.
# xlab()/ylab() name the x and y aesthetics, and coord_flip() moves each label
# together with its axis, so x must be labelled as the feature and y as the
# measure (the labels were previously swapped).
g <- ggplot(
  data = missing_values,
  aes(x = reorder(feature, -missing_pt), y = missing_pt)
) +
  geom_bar(stat = "identity", fill = "blue") +
  coord_flip() +
  xlab("Feature") +
  ylab("Missing proportion")
g

Data Manipulation

For Age.

# Replace missing ages with the mean of the observed ages.
observed_mean_age <- mean(full$Age, na.rm = TRUE)
full$Age[is.na(full$Age)] <- observed_mean_age

# Bucket the (imputed) age into four left-closed bands:
# [.., 13), [13, 18), [18, 60), [60, ..).
full$Age_Group <- as.character(cut(
  full$Age,
  breaks = c(-Inf, 13, 18, 60, Inf),
  labels = c("Age.012", "Age1317", "Age1859", "Age60v"),
  right = FALSE
))

For Titles.

Extracting titles from individual name.

# Keep only the captured group of each name: the text that sits between a
# ", " and the following "." (e.g. "Mr" in "Abbing, Mr. Anthony").
passenger_names <- full$Name
full$title <- gsub("^.*, (.*?)\\..*$", "\\1", passenger_names)

# Frequency of every extracted title.
table(full$title)

## 
##         Capt          Col          Don         Dona           Dr 
##            1            4            1            1            8 
##     Jonkheer         Lady        Major       Master         Miss 
##            1            1            2           61          260 
##         Mlle          Mme           Mr          Mrs           Ms 
##            2            1          757          197            2 
##          Rev          Sir the Countess 
##            8            1            1

Combining the rare titles into the most common ones, to reduce variability.

# Rare titles folded into "Miss".
miss_titles <- c("Dona", "Mme", "Ms", "Mlle", "Lady")
full$title[full$title %in% miss_titles] <- "Miss"

# Rare titles folded into a single "Officer" bucket.
officer_titles <- c(
  "Capt", "Col", "Major", "Dr", "Rev",
  "Don", "Sir", "the Countess", "Jonkheer"
)
full$title[full$title %in% officer_titles] <- "Officer"

Looking at the titles after combining.

# Frequency of each title value currently stored in full$title.
table(full$title)

## 
##  Master    Miss      Mr     Mrs Officer 
##      61     267     757     197      27

For family groups.

# Family size = siblings/spouses + parents/children + the passenger.
family_size <- full$SibSp + full$Parch + 1

# Bucket on the numeric size: 1 -> "Single", 2-4 -> "Small", 5+ -> "Big".
# Doing this before any label is written avoids turning the column into
# character, which made the old range tests string comparisons.
full$family_group <- factor(
  ifelse(
    family_size == 1, "Single",
    ifelse(family_size < 5, "Small", "Big")
  )
)

Looking at family groups after operations.

# Distinct family_group values (and factor levels) after bucketing.
unique(full$family_group)

## [1] Small  Single Big   
## Levels: Big Single Small

Tickets.

# Number of passengers sharing each ticket, looked up per row from a single
# frequency table instead of a nested loop over every unique ticket.
ticket_counts <- table(full$Ticket)
ticket.unique <- as.numeric(ticket_counts[as.character(full$Ticket)])
full$ticket.unique <- ticket.unique

# Bucket the party size: 1 -> "Single", 2-4 -> "Small", 5+ -> "Big".
full$ticket.size <- ifelse(
  full$ticket.unique == 1, "Single",
  ifelse(full$ticket.unique < 5, "Small", "Big")
)

Looking at tickets after operations.

# Distinct ticket.size labels after bucketing.
unique(full$ticket.size)

## [1] "Single" "Small"  "Big"

Survival variable.

# Recode the outcome: 1 -> "Yes", 0 -> "No"; anything else (including the NA
# placeholders of the test rows) becomes NA.
full$Survived <- ifelse(
  full$Survived == 1, "Yes",
  ifelse(full$Survived == 0, "No", NA_character_)
)

Crude summary

# Crude summary: count and share of each outcome among the training rows.
crude_summary <- full %>%
  filter(set == "train") %>%
  count(Survived) %>%
  mutate(freq = n / sum(n))

# Overall (unadjusted) survival rate.
crude_survrate <- crude_summary$freq[crude_summary$Survived == "Yes"]

# Print the summary itself; table() on it would cross-tabulate its three
# columns into a meaningless 3-way array.
print(as.data.frame(crude_summary))

##   Survived   n      freq
## 1       No 549 0.6161616
## 2      Yes 342 0.3838384

Prediction

# Predictors used by the model.
predictors <- c(
  "Pclass", "title", "Sex", "Embarked", "family_group", "ticket.size"
)

# Select the labelled rows by their origin tag instead of a hard-coded row
# range. NOTE: `sample` masks base::sample() as a variable name; it is kept
# because later statements refer to it.
sample <- full[full$set == "train", predictors]

# Outcome as a 0/1 factor, taken from the untouched training frame.
response <- as.factor(train$Survived)
sample$Survived <- response

Splitting the labelled data into training (60%) and validation (40%) subsets.

# Seed the RNG before the random split so the partition is reproducible.
set.seed(123)

# Stratified 60/40 split of the labelled rows on the outcome.
i <- createDataPartition(sample$Survived, times = 1, p = 0.6, list = FALSE)
train_val <- sample[i, ]
test_val <- sample[-i, ]

# Class balance in percent. The scaling must happen after prop.table();
# scaling the counts first cancels out and leaves plain proportions.
round(prop.table(table(train$Survived)) * 100, digits = 1)

## 
##    0    1 
## 61.6 38.4 

round(prop.table(table(train_val$Survived)) * 100, digits = 1)

## 
##    0    1 
## 61.6 38.4 

round(prop.table(table(test_val$Survived)) * 100, digits = 1)

## 
##    0    1 
## 61.7 38.3 

Using a decision tree for prediction.

# Fit a classification tree on the training part of the split and plot it.
set.seed(123)
model <- rpart(Survived ~ ., data = train_val, method = "class")
rpart.plot(model, extra = 3, fallen.leaves = TRUE)

# Predict on the training split only to measure the in-sample accuracy.
# predict() takes `newdata`; a `data` argument is silently ignored.
pred <- predict(model, newdata = train_val, type = "class")
confusionMatrix(pred, train_val$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 294  54
##          1  36 152
##                                           
##                Accuracy : 0.8321          
##                  95% CI : (0.7977, 0.8628)
##     No Information Rate : 0.6157          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.6393          
##  Mcnemar's Test P-Value : 0.07314         
##                                           
##             Sensitivity : 0.8909          
##             Specificity : 0.7379          
##          Pos Pred Value : 0.8448          
##          Neg Pred Value : 0.8085          
##              Prevalence : 0.6157          
##          Detection Rate : 0.5485          
##    Detection Prevalence : 0.6493          
##       Balanced Accuracy : 0.8144          
##                                           
##        'Positive' Class : 0               
##

Accuracy is 83.21% on the training data.

Predicting on the held-out validation data now.

# Class predictions for the held-out rows, compared with their true labels.
predT <- predict(model, newdata = test_val, type = "class")
confusionMatrix(data = predT, reference = test_val$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 198  44
##          1  21  92
##                                           
##                Accuracy : 0.8169          
##                  95% CI : (0.7727, 0.8558)
##     No Information Rate : 0.6169          
##     P-Value [Acc > NIR] : 2.529e-16       
##                                           
##                   Kappa : 0.5998          
##  Mcnemar's Test P-Value : 0.006357        
##                                           
##             Sensitivity : 0.9041          
##             Specificity : 0.6765          
##          Pos Pred Value : 0.8182          
##          Neg Pred Value : 0.8142          
##              Prevalence : 0.6169          
##          Detection Rate : 0.5577          
##    Detection Prevalence : 0.6817          
##       Balanced Accuracy : 0.7903          
##                                           
##        'Positive' Class : 0               
##

Accuracy is 81.69% on the held-out validation data.

On actual test data.

# Rows that came from the unlabelled test file, with all engineered features.
t <- full %>%
  filter(set == "test")

# Column names available for prediction.
colnames(t)

##  [1] "PassengerId"   "Survived"      "Pclass"        "Name"         
##  [5] "Sex"           "Age"           "SibSp"         "Parch"        
##  [9] "Ticket"        "Fare"          "Cabin"         "Embarked"     
## [13] "set"           "Age_Group"     "title"         "family_group" 
## [17] "ticket.unique" "ticket.size"

# Predicted class (0/1) for every unlabelled test passenger.
val <- predict(model, newdata = t, type = "class")

# Two-column submission frame. The id header is spelled "PassengerId",
# exactly as the column is named in the source data.
submission <- data.frame(
  PassengerId = t$PassengerId,
  Survived = val,
  row.names = NULL
)
names(submission)

## [1] "PassengerId" "Survived"

Converting to a csv file

# Write the submission without a row-name column. FALSE is spelled out
# because the shorthand F is an ordinary variable that can be reassigned.
write.csv(submission, "submission.csv", row.names = FALSE)

Titanic Survival Prediction

Shashwat Khare

February 19, 2019