Here we read the provided training and test sets and summarize them in order to understand how the data is presented and whether any observations have missing values.
There are 177 NA values in the Age column of the training set. Deleting so many observations could cause a substantial loss of statistical relevance in the results, because we have only 891 observations. Therefore we replace the NAs (missing values) for Age with the median calculated using the training set data. Similarly, we fill in the missing values for the Embarked and Fare columns.
## Load the training data (the first row of the CSV holds the column names)
trainset.titanic <- read.csv(
  file = "/Users/uzma/Downloads/train.csv",
  header = TRUE
)
## Per-column overview: ranges, level counts and the number of NAs
summary(object = trainset.titanic)
## PassengerId Survived Pclass
## Min. : 1.0 Min. :0.0000 Min. :1.000
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000
## Median :446.0 Median :0.0000 Median :3.000
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Name Sex Age
## Abbing, Mr. Anthony : 1 female:314 Min. : 0.42
## Abbott, Mr. Rossmore Edward : 1 male :577 1st Qu.:20.12
## Abbott, Mrs. Stanton (Rosa Hunt) : 1 Median :28.00
## Abelson, Mr. Samuel : 1 Mean :29.70
## Abelson, Mrs. Samuel (Hannah Wizosky): 1 3rd Qu.:38.00
## Adahl, Mr. Mauritz Nils Martin : 1 Max. :80.00
## (Other) :885 NA's :177
## SibSp Parch Ticket Fare
## Min. :0.000 Min. :0.0000 1601 : 7 Min. : 0.00
## 1st Qu.:0.000 1st Qu.:0.0000 347082 : 7 1st Qu.: 7.91
## Median :0.000 Median :0.0000 CA. 2343: 7 Median : 14.45
## Mean :0.523 Mean :0.3816 3101295 : 6 Mean : 32.20
## 3rd Qu.:1.000 3rd Qu.:0.0000 347088 : 6 3rd Qu.: 31.00
## Max. :8.000 Max. :6.0000 CA 2144 : 6 Max. :512.33
## (Other) :852
## Cabin Embarked
## :687 : 2
## B96 B98 : 4 C:168
## C23 C25 C27: 4 Q: 77
## G6 : 4 S:644
## C22 C26 : 3
## D : 3
## (Other) :186
## Median age of the training passengers, ignoring missing ages
median(trainset.titanic[["Age"]], na.rm = TRUE)
## [1] 28
## Load the test data (the first row of the CSV holds the column names)
testset.titanic <- read.csv(
  file = "/Users/uzma/Downloads/test.csv",
  header = TRUE
)
## Per-column overview: ranges, level counts and the number of NAs
summary(object = testset.titanic)
## PassengerId Pclass
## Min. : 892.0 Min. :1.000
## 1st Qu.: 996.2 1st Qu.:1.000
## Median :1100.5 Median :3.000
## Mean :1100.5 Mean :2.266
## 3rd Qu.:1204.8 3rd Qu.:3.000
## Max. :1309.0 Max. :3.000
##
## Name Sex
## Abbott, Master. Eugene Joseph : 1 female:152
## Abelseth, Miss. Karen Marie : 1 male :266
## Abelseth, Mr. Olaus Jorgensen : 1
## Abrahamsson, Mr. Abraham August Johannes : 1
## Abrahim, Mrs. Joseph (Sophie Halaut Easu): 1
## Aks, Master. Philip Frank : 1
## (Other) :412
## Age SibSp Parch Ticket
## Min. : 0.17 Min. :0.0000 Min. :0.0000 PC 17608: 5
## 1st Qu.:21.00 1st Qu.:0.0000 1st Qu.:0.0000 113503 : 4
## Median :27.00 Median :0.0000 Median :0.0000 CA. 2343: 4
## Mean :30.27 Mean :0.4474 Mean :0.3923 16966 : 3
## 3rd Qu.:39.00 3rd Qu.:1.0000 3rd Qu.:0.0000 220845 : 3
## Max. :76.00 Max. :8.0000 Max. :9.0000 347077 : 3
## NA's :86 (Other) :396
## Fare Cabin Embarked
## Min. : 0.000 :327 C:102
## 1st Qu.: 7.896 B57 B59 B63 B66: 3 Q: 46
## Median : 14.454 A34 : 2 S:270
## Mean : 35.627 B45 : 2
## 3rd Qu.: 31.500 C101 : 2
## Max. :512.329 C116 : 2
## NA's :1 (Other) : 80
## Median age of the test passengers, ignoring missing ages
median(testset.titanic[["Age"]], na.rm = TRUE)
## [1] 27
## Combine the datasets so that cleaning and factor casting are applied
## identically to both; YesTrain records which set each row came from.
trainset.titanic$YesTrain <- TRUE
testset.titanic$YesTrain <- FALSE
## The test set has no Survived column; add it so the columns line up for rbind
testset.titanic$Survived <- NA
finaltitanic <- rbind(trainset.titanic, testset.titanic)
## Logical mask of the training rows, reused for imputation and the final split
is.train <- finaltitanic$YesTrain

## Embarked: blank entries are treated as missing and set to 'S', the most
## frequent port in the training summary. The mask is built with is.na() and
## %in% so it never contains NA, which a data-frame subassignment would reject.
embarked.missing <- is.na(finaltitanic$Embarked) | finaltitanic$Embarked %in% ""
finaltitanic[embarked.missing, "Embarked"] <- "S"

## Age and Fare: fill NAs with the median of the *training* rows only, so that
## no information from the test set is used when preparing the training data.
age.median <- median(finaltitanic$Age[is.train], na.rm = TRUE)
finaltitanic[is.na(finaltitanic$Age), "Age"] <- age.median
fare.median <- median(finaltitanic$Fare[is.train], na.rm = TRUE)
finaltitanic[is.na(finaltitanic$Fare), "Fare"] <- fare.median

## Categorical casting. factor() rebuilds the levels from the values actually
## present; as.factor() would return an existing factor unchanged and keep the
## now-empty "" level of Embarked, which then shows up as an extra model input.
finaltitanic$Pclass <- factor(finaltitanic$Pclass)
finaltitanic$Sex <- factor(finaltitanic$Sex)
finaltitanic$Embarked <- factor(finaltitanic$Embarked)

## Separate the train set and test set again
trainset.titanic <- finaltitanic[is.train, ]
testset.titanic <- finaltitanic[!is.train, ]
## The outcome must be a factor for the classification models fitted below
trainset.titanic$Survived <- factor(trainset.titanic$Survived)
In this section we demonstrate the results/predictions using three different algorithms: logistic regression, neural networks and random forests.
## Logistic regression is used because survival is a Yes/No outcome.
## The model formula is kept as text first and converted once, so the same
## formula object can be handed to every model.
equation <- "Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked"
survival <- as.formula(equation)
logistic.fit <- glm(
  survival,
  family = binomial(link = "logit"),
  data = trainset.titanic
)
## Model summary: coefficients, significance and deviance
summary(logistic.fit)
##
## Call:
## glm(formula = survival, family = binomial(link = "logit"), data = trainset.titanic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6199 -0.6089 -0.4176 0.6187 2.4514
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.064159 0.472813 8.596 < 2e-16 ***
## Pclass2 -0.919468 0.297326 -3.092 0.00199 **
## Pclass3 -2.150048 0.297720 -7.222 5.13e-13 ***
## Sexmale -2.719444 0.200977 -13.531 < 2e-16 ***
## Age -0.038517 0.007855 -4.903 9.43e-07 ***
## SibSp -0.321794 0.109193 -2.947 0.00321 **
## Parch -0.093329 0.118856 -0.785 0.43232
## Fare 0.002339 0.002469 0.947 0.34346
## EmbarkedQ -0.056267 0.381471 -0.148 0.88274
## EmbarkedS -0.434226 0.239530 -1.813 0.06986 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.66 on 890 degrees of freedom
## Residual deviance: 785.04 on 881 degrees of freedom
## AIC: 805.04
##
## Number of Fisher Scoring iterations: 5
## Final prediction
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## Predicted survival probabilities for the test passengers
Survived <- predict(logistic.fit, newdata = testset.titanic, type = "response")
PassengerId <- testset.titanic$PassengerId
Prediction <- cbind.data.frame(PassengerId, Survived)
## Classify as survived (1) when the probability exceeds 0.5, otherwise 0.
## The comparison is vectorised over the whole column, so it also behaves
## correctly for an empty data frame (a 1:nrow() loop would iterate over 1, 0).
Prediction$Survived <- as.numeric(Prediction$Survived > 0.5)
##Display results for logistic regression
head(Prediction)
## PassengerId Survived
## 892 892 0
## 893 893 0
## 894 894 0
## 895 895 0
## 896 896 1
## 897 897 0
tail(Prediction)
## PassengerId Survived
## 1304 1304 1
## 1305 1305 0
## 1306 1306 1
## 1307 1307 0
## 1308 1308 0
## 1309 1309 0
##Using neural networks
library(nnet)
## nnet() chooses its initial weights at random when none are supplied, so fix
## the RNG seed to make the fit (and the predictions derived from it)
## reproducible between runs.
set.seed(42)
## Single-hidden-layer network with 2 hidden units, same formula as the
## logistic model.
## NOTE(review): the recorded run stopped at the 100-iteration limit; consider
## raising maxit if convergence matters.
nnet.fit <- nnet(formula = survival, data = trainset.titanic, size = 2)
## # weights: 25
## initial value 801.946834
## iter 10 value 523.410576
## iter 20 value 432.098991
## iter 30 value 378.541868
## iter 40 value 376.565814
## iter 50 value 375.743951
## iter 60 value 375.432528
## iter 70 value 375.123245
## iter 80 value 375.065323
## iter 90 value 374.919503
## iter 100 value 374.903315
## final value 374.903315
## stopped after 100 iterations
## Raw network outputs for the test passengers
Survived <- predict(object = nnet.fit, newdata = testset.titanic, type = "raw")
## Model summary: network layout and fitted weights
summary(object = nnet.fit)
## a 10-2-1 network with 25 weights
## options were - entropy fitting
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1
## -2.76 -9.43 3.28 23.40 -0.01 0.04 0.73 0.01 -1.16
## i9->h1 i10->h1
## -1.05 0.31
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2
## -38.24 18.09 -3.27 20.34 7.03 54.78 -4.22 -2.56 -4.57
## i9->h2 i10->h2
## -2.47 -31.76
## b->o h1->o h2->o
## 4.93 -4.30 -2.34
PassengerId <- testset.titanic$PassengerId
## Pair each test passenger with the network output held in Survived
Prediction <- cbind.data.frame(PassengerId, Survived)
## Classify as survived (1) when the network output exceeds 0.5, otherwise 0.
## The comparison is vectorised over the whole column, so it also behaves
## correctly for an empty data frame (a 1:nrow() loop would iterate over 1, 0).
Prediction$Survived <- as.numeric(Prediction$Survived > 0.5)
##Display results for neural network prediction
head(Prediction)
## PassengerId Survived
## 892 892 0
## 893 893 0
## 894 894 0
## 895 895 0
## 896 896 0
## 897 897 0
tail(Prediction)
## PassengerId Survived
## 1304 1304 0
## 1305 1305 0
## 1306 1306 1
## 1307 1307 0
## 1308 1308 0
## 1309 1309 0
##Using random forest
#install.packages('randomForest')
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## A random forest draws random samples while growing its trees, so fix the
## RNG seed to make the fitted forest and its predictions reproducible.
set.seed(42)
## 2000 trees on the shared model formula; importance = TRUE additionally
## asks for the variable-importance measures to be computed.
random.fit <- randomForest(formula = survival,
                           data = trainset.titanic,
                           importance = TRUE,
                           ntree = 2000)
## Predicted class for each test passenger (already 0/1, no thresholding needed)
Survived <- predict(object = random.fit, newdata = testset.titanic)
## PassengerId is the vector of test-set ids created for the earlier models
Prediction <- data.frame(PassengerId, Survived, check.names = FALSE)
##Display results for random forest prediction
head(x = Prediction)
## PassengerId Survived
## 892 892 0
## 893 893 0
## 894 894 0
## 895 895 0
## 896 896 0
## 897 897 0
tail(x = Prediction)
## PassengerId Survived
## 1304 1304 0
## 1305 1305 0
## 1306 1306 1
## 1307 1307 0
## 1308 1308 0
## 1309 1309 1