library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)
# Load the Kaggle Titanic training and test sets and stack them into a single
# data frame (`titanic`) so that feature engineering is applied to both at once.
# NOTE(review): setwd() ties the script to one machine's directory layout;
# consider running the script from the project directory instead.
setwd("~/Data_Science/R/Projects/titanic")
# stringsAsFactors = TRUE is spelled out because R >= 4.0 reads strings as
# character by default, while the categorical columns (e.g. Sex, Embarked) are
# treated as factors further down in the analysis.
titanic_train <- read.csv("train.csv", header = TRUE, sep = ",",
                          stringsAsFactors = TRUE)
titanic_test <- read.csv("test.csv", header = TRUE, sep = ",",
                         stringsAsFactors = TRUE)
# Turn the 0/1 outcome into a labelled factor.
titanic_train$Survived <- factor(titanic_train$Survived,
                                 levels = c(0, 1),
                                 labels = c("Perished", "Survived"))
# The test set has no outcome column; add an all-NA one so both sets have the
# same columns for rbind().
titanic_test$Survived <- NA
# Marker column used later to split the combined data back into its two parts.
titanic_train$train_test <- "Train"
titanic_test$train_test <- "Test"
titanic <- rbind(titanic_train, titanic_test)
# Let's look at some plots
# Relationship between Sex and survival: one bar per sex, filled by the
# proportion of each outcome
ggplot(data = titanic_train, mapping = aes(x = Sex, fill = Survived)) +
  geom_bar(stat = "count", position = "fill")
The graph above shows that female passengers had a much better chance of surviving than male passengers: almost 75% of the female passengers survived.
# Relationship between Pclass and survival: one bar per passenger class,
# filled by the proportion of each outcome
ggplot(data = titanic_train,
       mapping = aes(x = as.factor(Pclass), fill = Survived)) +
  geom_bar(stat = "count", position = "fill")
Pclass appears to be an important factor as well: Pclass 1 had the largest percentage of survivors.
# Let's see how the age of the passenger played a role, starting with its
# distribution over the combined train + test data
summary(titanic[["Age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.17 21.00 28.00 29.88 39.00 80.00 263
A large number of passengers (263 of them) have no Age information.
# Let's assume each missing Age equals the mean age of the training-set
# passengers whose age is known
titanic$Age[is.na(titanic$Age)] <- mean(titanic_train$Age, na.rm = TRUE)
# Let's divide the passengers into minors (younger than 18) and adults
titanic$Minor <- ifelse(titanic$Age < 18, "Yes", "No")
titanic$Minor <- as.factor(titanic$Minor)
# Survival proportions for minors vs. adults, training rows only
ggplot(data = titanic[titanic$train_test == "Train", ],
       mapping = aes(x = Minor, fill = Survived)) +
  geom_bar(stat = "count", position = "fill")
The graph shows that minors had a slightly better chance of survival. It seems that women and children were allowed to get to the rescue boats first.
# Let's see if family size made a difference. You would think that a large
# family would have less chance of survival, as its members might be looking
# for each other when it was time to get off the ship.
# Family size = siblings/spouses + parents/children + the passenger.
titanic$Family_size <- 1 + titanic$SibSp + titanic$Parch
summary(titanic[["Family_size"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.884 2.000 11.000
# Let's divide the passengers into two groups: family size of 4 or less is
# "Small", anything larger is "Big"
titanic$Family_Group <- ifelse(titanic$Family_size <= 4, "Small", "Big")
titanic$Family_Group <- as.factor(titanic$Family_Group)
# Survival proportions per family group, training rows only
ggplot(data = titanic[titanic$train_test == "Train", ],
       mapping = aes(x = Family_Group, fill = Survived)) +
  geom_bar(stat = "count", position = "fill")
A larger percentage of passengers from small families survived than of those from big families.
# Tabulate the port of embarkation. In the output below the first, unlabelled
# count (2) belongs to passengers whose Embarked value is an empty string; the
# remaining counts are for C, Q and S.
table(titanic$Embarked)
##
## C Q S
## 2 270 123 914
# Assign U = unknown embarkation point to the empty values.
# Going through character makes the recode work whether Embarked was read as a
# factor or as a character column; factor() then rebuilds the levels from the
# values actually present, so the empty-string level disappears.
titanic$Embarked <- as.character(titanic$Embarked)
# %in% never yields NA, so NA entries (if any) are left untouched.
titanic$Embarked[titanic$Embarked %in% ""] <- "U"
titanic$Embarked <- factor(titanic$Embarked)
# Survival proportions per embarkation point, training rows only
ggplot(data = titanic[titanic$train_test == "Train", ],
       mapping = aes(x = Embarked, fill = Survived)) +
  geom_bar(stat = "count", position = "fill")
# Distribution of Fare over the combined data, including the count of NAs
summary(titanic[["Fare"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 7.896 14.454 33.295 31.275 512.329 1
# Show the row(s) whose Fare is missing
titanic[is.na(titanic$Fare), ]
## PassengerId Survived Pclass Name Sex Age SibSp Parch
## 1044 1044 <NA> 3 Storey, Mr. Thomas male 60.5 0 0
## Ticket Fare Cabin Embarked train_test Minor Family_size Family_Group
## 1044 3701 NA S Test No 1 Small
There is one passenger without Fare information. Let's assume his fare is the mean fare of the training-set passengers in the same class (Pclass 3).
# Impute every missing Fare with the mean training-set fare of that
# passenger's own Pclass. Looking the class up per row (instead of hard-coding
# class 3) keeps the rule correct for any missing row, and indexing by position
# makes the assignment a harmless no-op when no Fare is missing.
fare_missing <- which(is.na(titanic$Fare))
train_rows <- titanic$train_test == "Train"
# Mean fare per passenger class, computed on training rows only; the result is
# indexed by the class value as a character string ("1", "2", "3").
class_mean_fare <- tapply(titanic$Fare[train_rows],
                          titanic$Pclass[train_rows],
                          mean,
                          na.rm = TRUE)
titanic$Fare[fare_missing] <- as.numeric(
  class_mean_fare[as.character(titanic$Pclass[fare_missing])]
)
# Fare distribution per outcome, training rows only
ggplot(data = titanic[titanic$train_test == "Train", ],
       mapping = aes(x = Survived, y = Fare)) +
  geom_boxplot()
The median fare paid by the survivors is higher than that paid by those who perished.
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Fix the RNG seed so the forest is reproducible
set.seed(1)
# Fit a random forest on the training rows, trying 3 predictors at each split
rf.fit <- randomForest(
  Survived ~ Sex + Pclass + Age + Family_size + Minor + Family_Group +
    Embarked + Fare,
  data = titanic[titanic$train_test == "Train", ],
  mtry = 3
)
# Share of training rows whose stored prediction matches the true outcome.
# Per the randomForest documentation, `predicted` holds out-of-bag predictions.
mean(rf.fit[["predicted"]] == titanic$Survived[titanic$train_test == "Train"])
## [1] 0.8350168
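Out-of-bag (OOB) accuracy on the training set = 83.50% (`rf.fit$predicted` holds the out-of-bag predictions, so this is not a resubstitution accuracy).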
# Predict the outcome class for every test row
rf.predict <- predict(rf.fit,
                      titanic[titanic$train_test == "Test", ],
                      type = "response")
# Build the submission table: passenger id plus 1 (survived) / 0 (perished)
result <- data.frame(
  PassengerId = titanic_test$PassengerId,
  Survived = ifelse(rf.predict == "Survived", 1, 0)
)
# Write the submission file without row names
write.csv(result, "Feature_Engineered.csv", row.names = FALSE)
According to Kaggle, the test accuracy is 78.947%.