Kaggle submission for Titanic dataset

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(stringr)
# Load the Kaggle Titanic train/test CSVs and stack them into one data frame
# (`titanic`) so that feature engineering is applied to both sets at once.
# NOTE(review): setwd() ties the script to one machine; it is kept because the
# relative paths used for reading and writing in this script depend on it.
setwd("~/Data_Science/R/Projects/titanic")

# stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, while the later steps (levels(),
# droplevels(), randomForest) expect Sex and Embarked to be factors.
titanic_train <- read.csv("train.csv", header = TRUE, sep = ",",
                          stringsAsFactors = TRUE)
titanic_test <- read.csv("test.csv", header = TRUE, sep = ",",
                         stringsAsFactors = TRUE)

# Turn the 0/1 outcome into a labelled factor.
titanic_train$Survived <- factor(titanic_train$Survived,
                                 levels = c(0, 1),
                                 labels = c("Perished", "Survived"))

# The test file has no outcome column; add it as NA so both frames share the
# same set of columns before rbind().
titanic_test$Survived <- NA

# Remember which file each row came from so the sets can be separated again.
titanic_train$train_test <- "Train"
titanic_test$train_test <- "Test"

titanic <- rbind(titanic_train, titanic_test)

# Let's look at some plots.
# Relation between Sex and survivability: each bar is scaled to proportions.
ggplot(data = titanic_train, mapping = aes(x = Sex, fill = Survived)) +
  geom_bar(stat = "count", position = "fill")

The graph above shows that female passengers had a much better chance of surviving than male passengers. Almost 75% of female passengers survived.

# Relation between Pclass and survivability.
# Pclass is converted to a factor so each class gets its own bar.
ggplot(
  data = titanic_train,
  mapping = aes(x = as.factor(Pclass), fill = Survived)
) +
  geom_bar(stat = "count", position = "fill")

It appears Pclass is an important factor as well. Pclass 1 had the largest percentage of survivors.

# Let's see how the age of the passenger played a role.
# Five-number summary (plus mean and NA count) of Age over train and test rows.
summary(titanic[["Age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.17   21.00   28.00   29.88   39.00   80.00     263

There is a large number of passengers without Age information.

# Let's assume each missing Age equals the mean Age of the training passengers
# whose age is known.
titanic$Age <- replace(
  titanic$Age,
  is.na(titanic$Age),
  mean(titanic_train$Age, na.rm = TRUE)
)

# Let's divide the passengers into minors (under 18, "Yes") and adults ("No").
titanic$Minor <- as.factor(ifelse(titanic$Age < 18, "Yes", "No"))

# Survival share for minors vs. adults, using the labelled training rows only.
ggplot(subset(titanic, train_test == "Train"), aes(x = Minor, fill = Survived)) +
  geom_bar(stat = "count", position = "fill")

The graph shows that minors had a slightly better chance of survival. It seems that women and children were allowed to board the lifeboats first.

# Let's see if family size made a difference. You would think that a large
# family would have less chance of survival, as its members might be looking
# for each other when it was time to get off the ship.
# Family size = parents/children + siblings/spouses + the passenger.
titanic$Family_size <- with(titanic, Parch + SibSp + 1)
summary(titanic[["Family_size"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.884   2.000  11.000

# Let's divide the passengers into two groups: family size of 4 or less is
# "Small", anything larger is "Big".
titanic$Family_Group <- as.factor(
  ifelse(titanic$Family_size <= 4, "Small", "Big")
)

# Survival share per family group, using the labelled training rows only.
ggplot(
  subset(titanic, train_test == "Train"),
  aes(x = Family_Group, fill = Survived)
) +
  geom_bar(stat = "count", position = "fill")

A larger percentage of passengers from small families survived than of those from big families.

# Count passengers per embarkation port; blank entries show up as an
# unlabelled first column.
table(titanic$Embarked)

## 
##       C   Q   S 
##   2 270 123 914

# Two passengers have a blank embarkation port; recode blanks as "U" (unknown).
# Going through character and rebuilding the factor works whether read.csv()
# delivered Embarked as a factor or as a character vector. The earlier
# levels()/droplevels() sequence only works on a factor and errors on a
# character column. The resulting levels are the same: C, Q, S, U.
titanic$Embarked <- as.character(titanic$Embarked)
titanic$Embarked[titanic$Embarked %in% ""] <- "U"
titanic$Embarked <- factor(titanic$Embarked)

# Survival share per embarkation port, using the labelled training rows only.
ggplot(titanic[titanic$train_test == "Train", ], aes(x = Embarked, fill = Survived)) +
  geom_bar(stat = "count", position = "fill")

# Distribution of Fare over train and test rows, including the NA count.
summary(titanic[["Fare"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   7.896  14.454  33.295  31.275 512.329       1

# Show the row(s) whose Fare is missing.
titanic[is.na(titanic$Fare), ]

##      PassengerId Survived Pclass               Name  Sex  Age SibSp Parch
## 1044        1044     <NA>      3 Storey, Mr. Thomas male 60.5     0     0
##      Ticket Fare Cabin Embarked train_test Minor Family_size Family_Group
## 1044   3701   NA              S       Test    No           1        Small

There is one passenger without Fare information. Let's assume the missing fare equals the mean fare of the training-set passengers in the same class (Pclass 3).

# Impute the missing Fare with the mean Fare of third-class training passengers.
# The Fare column is indexed directly rather than assigning into a row subset
# of the data frame: that form errors when no row has a missing Fare, whereas
# this one is then a no-op, so the line can be re-run safely.
titanic$Fare[is.na(titanic$Fare)] <- mean(
  titanic$Fare[titanic$train_test == "Train" & titanic$Pclass == 3],
  na.rm = TRUE
)

# Compare the Fare distribution of survivors and non-survivors (training rows).
ggplot(titanic[titanic$train_test == "Train", ], aes(x = Survived, y = Fare)) +
  geom_boxplot()

The median fare paid by the survivors is higher than that paid by those who perished.

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

# Fix the RNG seed so the fitted forest is reproducible.
set.seed(1)

# Fit a random forest on the labelled (training) rows, trying 3 candidate
# predictors at each split.
rf.fit <- randomForest(
  Survived ~ Sex + Pclass + Age + Family_size + Minor + Family_Group + Embarked + Fare,
  data = titanic[titanic$train_test == "Train", ],
  mtry = 3
)

# Share of training rows whose prediction stored in the fit matches the label.
mean(rf.fit$predicted == subset(titanic, train_test == "Train")$Survived)

## [1] 0.8350168

Out-of-bag (OOB) accuracy on the training set = 83.50%

# Predict the class label for every unlabelled (test) row.
rf.predict <- predict(
  rf.fit,
  newdata = subset(titanic, train_test == "Test"),
  type = "response"
)

# Build the two-column submission: passenger id plus a 1/0 survival flag.
result <- data.frame(
  titanic_test$PassengerId,
  ifelse(rf.predict == "Survived", 1, 0)
)
names(result) <- c("PassengerId", "Survived")

# Write the submission file without the row-name column.
write.csv(result, file = "Feature_Engineered.csv", row.names = FALSE)

Per Kaggle, the test accuracy is 78.947%.

Kaggle submission for Titanic dataset

Sumit Kumar

10/10/2019