Kaggle Competition-Titanic Dataset

Read the data

Here we read the training set and the test set provided and summarize it in order to understand how the data is presented and whether there are missing values for certain observations.

There are 177 NA values in the training set. Deleting so many values can lead to substantial loss of statisitcal relevance of the results because we have only 891 values. Therefore we update the NA (missing values) for age by the median calculated using the training set data. Similarly we update NAs for Embarked and Fare column.

##Read the data
trainset.titanic<-read.csv("/Users/uzma/Downloads/train.csv",header=TRUE)
summary(trainset.titanic)

##   PassengerId       Survived          Pclass     
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :446.0   Median :0.0000   Median :3.000  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309  
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000  
##                                                  
##                                     Name         Sex           Age       
##  Abbing, Mr. Anthony                  :  1   female:314   Min.   : 0.42  
##  Abbott, Mr. Rossmore Edward          :  1   male  :577   1st Qu.:20.12  
##  Abbott, Mrs. Stanton (Rosa Hunt)     :  1                Median :28.00  
##  Abelson, Mr. Samuel                  :  1                Mean   :29.70  
##  Abelson, Mrs. Samuel (Hannah Wizosky):  1                3rd Qu.:38.00  
##  Adahl, Mr. Mauritz Nils Martin       :  1                Max.   :80.00  
##  (Other)                              :885                NA's   :177    
##      SibSp           Parch             Ticket         Fare       
##  Min.   :0.000   Min.   :0.0000   1601    :  7   Min.   :  0.00  
##  1st Qu.:0.000   1st Qu.:0.0000   347082  :  7   1st Qu.:  7.91  
##  Median :0.000   Median :0.0000   CA. 2343:  7   Median : 14.45  
##  Mean   :0.523   Mean   :0.3816   3101295 :  6   Mean   : 32.20  
##  3rd Qu.:1.000   3rd Qu.:0.0000   347088  :  6   3rd Qu.: 31.00  
##  Max.   :8.000   Max.   :6.0000   CA 2144 :  6   Max.   :512.33  
##                                   (Other) :852                   
##          Cabin     Embarked
##             :687    :  2   
##  B96 B98    :  4   C:168   
##  C23 C25 C27:  4   Q: 77   
##  G6         :  4   S:644   
##  C22 C26    :  3           
##  D          :  3           
##  (Other)    :186

median(trainset.titanic$Age,na.rm=TRUE)

## [1] 28

testset.titanic<-read.csv("/Users/uzma/Downloads/test.csv",header=TRUE)
summary(testset.titanic)

##   PassengerId         Pclass     
##  Min.   : 892.0   Min.   :1.000  
##  1st Qu.: 996.2   1st Qu.:1.000  
##  Median :1100.5   Median :3.000  
##  Mean   :1100.5   Mean   :2.266  
##  3rd Qu.:1204.8   3rd Qu.:3.000  
##  Max.   :1309.0   Max.   :3.000  
##                                  
##                                         Name         Sex     
##  Abbott, Master. Eugene Joseph            :  1   female:152  
##  Abelseth, Miss. Karen Marie              :  1   male  :266  
##  Abelseth, Mr. Olaus Jorgensen            :  1               
##  Abrahamsson, Mr. Abraham August Johannes :  1               
##  Abrahim, Mrs. Joseph (Sophie Halaut Easu):  1               
##  Aks, Master. Philip Frank                :  1               
##  (Other)                                  :412               
##       Age            SibSp            Parch             Ticket   
##  Min.   : 0.17   Min.   :0.0000   Min.   :0.0000   PC 17608:  5  
##  1st Qu.:21.00   1st Qu.:0.0000   1st Qu.:0.0000   113503  :  4  
##  Median :27.00   Median :0.0000   Median :0.0000   CA. 2343:  4  
##  Mean   :30.27   Mean   :0.4474   Mean   :0.3923   16966   :  3  
##  3rd Qu.:39.00   3rd Qu.:1.0000   3rd Qu.:0.0000   220845  :  3  
##  Max.   :76.00   Max.   :8.0000   Max.   :9.0000   347077  :  3  
##  NA's   :86                                        (Other) :396  
##       Fare                     Cabin     Embarked
##  Min.   :  0.000                  :327   C:102   
##  1st Qu.:  7.896   B57 B59 B63 B66:  3   Q: 46   
##  Median : 14.454   A34            :  2   S:270   
##  Mean   : 35.627   B45            :  2           
##  3rd Qu.: 31.500   C101           :  2           
##  Max.   :512.329   C116           :  2           
##  NA's   :1         (Other)        : 80

median(testset.titanic$Age,na.rm=TRUE)

## [1] 27

##Combine the dataset
trainset.titanic$YesTrain<- TRUE
testset.titanic$YesTrain<- FALSE

##Test set has no survived column
testset.titanic$Survived<-NA

## Combine both sets and differentiate them via a column
finaltitanic<-rbind(trainset.titanic,testset.titanic)


#Embarked NAs
finaltitanic[finaltitanic$Embarked=='',"Embarked"]<-'S'
## Age NAs
finaltitanic[is.na(finaltitanic$Age),"Age"]<-median(finaltitanic$Age,na.rm=TRUE)
##Fare NAs
finaltitanic[is.na(finaltitanic$Fare),"Fare"]<-median(finaltitanic$Fare,na.rm=TRUE)

##Categorical Casting
finaltitanic$Pclass<-as.factor(finaltitanic$Pclass)
finaltitanic$Sex<-as.factor(finaltitanic$Sex)
finaltitanic$Embarked<-as.factor(finaltitanic$Embarked)

##Separate the test set and train set again
trainset.titanic<-finaltitanic[finaltitanic$YesTrain==TRUE,]
testset.titanic<-finaltitanic[finaltitanic$YesTrain==FALSE,]
trainset.titanic$Survived<-as.factor(trainset.titanic$Survived)

Factors and final model

In this section we demonstrate the results/predictions using three different algorithms- Logistic regression, Neural networks and randomforests.

Logistic Regression

## using Logistic Regression because this is a Yes/No problem
equation<-"Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked"
survival<-as.formula(equation)
logistic.fit <- glm(formula = survival,family=binomial(link='logit'),data=trainset.titanic)
##Model Summary
summary(logistic.fit)

## 
## Call:
## glm(formula = survival, family = binomial(link = "logit"), data = trainset.titanic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6199  -0.6089  -0.4176   0.6187   2.4514  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.064159   0.472813   8.596  < 2e-16 ***
## Pclass2     -0.919468   0.297326  -3.092  0.00199 ** 
## Pclass3     -2.150048   0.297720  -7.222 5.13e-13 ***
## Sexmale     -2.719444   0.200977 -13.531  < 2e-16 ***
## Age         -0.038517   0.007855  -4.903 9.43e-07 ***
## SibSp       -0.321794   0.109193  -2.947  0.00321 ** 
## Parch       -0.093329   0.118856  -0.785  0.43232    
## Fare         0.002339   0.002469   0.947  0.34346    
## EmbarkedQ   -0.056267   0.381471  -0.148  0.88274    
## EmbarkedS   -0.434226   0.239530  -1.813  0.06986 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1186.66  on 890  degrees of freedom
## Residual deviance:  785.04  on 881  degrees of freedom
## AIC: 805.04
## 
## Number of Fisher Scoring iterations: 5

## Final prediction
library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

Survived <- predict(logistic.fit, newdata=testset.titanic, type="response")
PassengerId<-testset.titanic$PassengerId
Prediction<-cbind.data.frame(PassengerId,Survived)
for (i in 1:nrow(Prediction)){
  if(Prediction$Survived[i]>0.5){
    Prediction$Survived[i]<-1
  }
  else{Prediction$Survived[i]<-0}
}
 ##Display results for logistic regression
head(Prediction)

##     PassengerId Survived
## 892         892        0
## 893         893        0
## 894         894        0
## 895         895        0
## 896         896        1
## 897         897        0

tail(Prediction)

##      PassengerId Survived
## 1304        1304        1
## 1305        1305        0
## 1306        1306        1
## 1307        1307        0
## 1308        1308        0
## 1309        1309        0

Neural Networks

##Using neural networks 
library(nnet)
nnet.fit <- nnet(formula=survival, data=trainset.titanic, size=2)

## # weights:  25
## initial  value 801.946834 
## iter  10 value 523.410576
## iter  20 value 432.098991
## iter  30 value 378.541868
## iter  40 value 376.565814
## iter  50 value 375.743951
## iter  60 value 375.432528
## iter  70 value 375.123245
## iter  80 value 375.065323
## iter  90 value 374.919503
## iter 100 value 374.903315
## final  value 374.903315 
## stopped after 100 iterations

Survived <- predict(nnet.fit, newdata=testset.titanic)
##Model Summary
summary(nnet.fit)

## a 10-2-1 network with 25 weights
## options were - entropy fitting 
##   b->h1  i1->h1  i2->h1  i3->h1  i4->h1  i5->h1  i6->h1  i7->h1  i8->h1 
##   -2.76   -9.43    3.28   23.40   -0.01    0.04    0.73    0.01   -1.16 
##  i9->h1 i10->h1 
##   -1.05    0.31 
##   b->h2  i1->h2  i2->h2  i3->h2  i4->h2  i5->h2  i6->h2  i7->h2  i8->h2 
##  -38.24   18.09   -3.27   20.34    7.03   54.78   -4.22   -2.56   -4.57 
##  i9->h2 i10->h2 
##   -2.47  -31.76 
##   b->o  h1->o  h2->o 
##   4.93  -4.30  -2.34

PassengerId<-testset.titanic$PassengerId
Prediction<-cbind.data.frame(PassengerId,Survived)
for (i in 1:nrow(Prediction)){
  if(Prediction$Survived[i]>0.5){
    Prediction$Survived[i]<-1
  }
  else{Prediction$Survived[i]<-0}
}
##Display results for neural network prediction
head(Prediction)

##     PassengerId Survived
## 892         892        0
## 893         893        0
## 894         894        0
## 895         895        0
## 896         896        0
## 897         897        0

tail(Prediction)

##      PassengerId Survived
## 1304        1304        0
## 1305        1305        0
## 1306        1306        1
## 1307        1307        0
## 1308        1308        0
## 1309        1309        0

Random Forests

##Using random forest
#install.packages('randomForest')
library(randomForest)

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

random.fit <- randomForest(formula=survival,
                      data=trainset.titanic, 
                      importance=TRUE, 
                      ntree=2000)
Survived <- predict(random.fit, newdata=testset.titanic)
Prediction<-cbind.data.frame(PassengerId,Survived)
##Display results for  random forest prediction
head(Prediction)

##     PassengerId Survived
## 892         892        0
## 893         893        0
## 894         894        0
## 895         895        0
## 896         896        0
## 897         897        0

tail(Prediction)

##      PassengerId Survived
## 1304        1304        0
## 1305        1305        0
## 1306        1306        1
## 1307        1307        0
## 1308        1308        0
## 1309        1309        1