The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.
One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.
This mini-project involves the analysis of what sorts of people were likely to survive.
This stage involves the following: - Setting up a working directory to access Titanic dataset (data classified: 1. Train and 2. Test) - Loading the train dataset and other required libraries to this envrionment
rm(list=ls())
library(NLP)
library(tm)
library(R.utils)
library(stringi)
library(data.table)
library(ggplot2)
library(SnowballC)
library(caret) # Confusion Matrix
library(pscl) # For R^2
library(ROCR) # for ROC and AUC
setwd("/Users/Mughundhan/KAGGLE/Titanic")
train <- read.csv("train.csv")
#Columns available in the original Data-set
colnames(train)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked"
Handle missing values
Columns available after removing the unnecessary columns.
colnames(train)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Fare"
2.1.Name Cannonicalization into title-groups:
Process for converting data that has more than one possible representation into a “standard”, “normal”, or canonical form. (5 bins used here)
Names are replaced with Prefix (for instance: Mr. John is replaced with Mr. and Dan Mrs. Jahn is replaced with Mrs.)
The names of the passengers are given as follows:
## [1] "Braund, Mr. Owen Harris"
## [2] "Cumings, Mrs. John Bradley (Florence Briggs Thayer)"
## [3] "Heikkinen, Miss. Laina"
## [4] "Futrelle, Mrs. Jacques Heath (Lily May Peel)"
## [5] "Allen, Mr. William Henry"
## [6] "Moran, Mr. James"
After performing Cannonicalization, the above names are replaced as follows:
## [1] "Mr" "Mrs" "Miss" "Mrs" "Mr" "Mr"
Age column shall not be removed: Substitute the mean age values of each section of people (instead of taking an universal average - makes our prediction more efficient) in place of the missing values.
For instance, if a person’s age field is missing, and the person’s name is “Mr.”, then the mean of the age of all passengers whose name is “Mr.” is replaced in place of missing field. Likewise, it is done for Mrs, Miss, Master, Dr and Rev.
Operations performed on the dataset with missing values.
#Number of NA values after cleaning the data
sum(is.na(train$Age))
## [1] 177
#Maximum Age
max(train$Age)
## [1] NA
#Minimum Age
min(train$Age)
## [1] NA
Operations performed on the dataset, after handling the missing values.
#Number of NA values after cleaning the data
sum(is.na(train$Age))
## [1] 0
#Maximum Age
max(train$Age)
## [1] 80
#Minimum Age
min(train$Age)
## [1] 0.42
2.3.Creating New Varaibles and Discretization
## PassengerId Survived Pclass Name Sex Age SibSp Parch Fare
## 1 1 0 3 Mr male 22.00 1 0 7.2500
## 2 2 1 1 Mrs female 38.00 1 0 71.2833
## 3 3 1 3 Miss female 26.00 0 0 7.9250
## 4 4 1 1 Mrs female 35.00 1 0 53.1000
## 5 5 0 3 Mr male 35.00 0 0 8.0500
## 6 6 0 3 Mr male 32.36 0 0 8.4583
## FamilySize Fare2 Child
## 1 2 <$10 0
## 2 2 $30+ 0
## 3 1 <$10 0
## 4 2 $30+ 0
## 5 1 <$10 0
## 6 1 <$10 0
counts <- table(train$Survived, train$Sex)
#confusion Matrix
counts
##
## female male
## 0 81 468
## 1 233 109
female <- nrow(train[train$Sex=='female',])
male <- nrow(train[train$Sex=='male',])
slice <- c(female, male)
divi <- c("Female", "Male")
survival_percent <- c(counts[2]/(counts[1]+counts[2]), counts[4]/(counts[3]+counts[4]))
# Survival Percent for Female, Male
survival_percent
## [1] 0.7420382 0.1889081
fcounts <- table(train$Survived, train$Fare2)
#confusion Matrix
fcounts
##
## $10-$20 $20-$30 $30+ <$10
## 0 103 78 99 269
## 1 76 58 141 67
fare_survival_percent <- c(fcounts[2]/(fcounts[1]+fcounts[2]), fcounts[4]/(fcounts[3]+fcounts[4]), fcounts[6]/(fcounts[5]+fcounts[6]), fcounts[8]/(fcounts[7]+fcounts[8]))
fare_survival_percent
## [1] 0.4245810 0.4264706 0.5875000 0.1994048
# $10-$20, $20-$30, $30.00+, <$10.00
Ccounts <- table(train$Survived, train$Pclass)
#confusion Matrix
Ccounts
##
## 1 2 3
## 0 80 97 372
## 1 136 87 119
Class_survival_percent <- c(Ccounts[2]/(Ccounts[1]+Ccounts[2]), Ccounts[4]/(Ccounts[3]+Ccounts[4]), Ccounts[6]/(Ccounts[5]+Ccounts[6]))
Class_survival_percent
## [1] 0.6296296 0.4728261 0.2423625
# Class(1), Class(2), Class(3)
CHcounts <- table(train$Survived, train$Child)
#confusion Matrix
CHcounts
##
## 0 1
## 0 495 54
## 1 279 63
CH_survival_percent <- c(CHcounts[2]/(CHcounts[1]+CHcounts[2]), CHcounts[4]/(CHcounts[3]+CHcounts[4]))
CH_survival_percent
## [1] 0.3604651 0.5384615
# Others, Children
##
## 0 1
## female 0.09090909 0.26150393
## male 0.52525253 0.12233446
## Child Sex Survived
## 1 0 female 195
## 2 1 female 38
## 3 0 male 84
## 4 1 male 25
## Child Sex Survived
## 1 0 female 259
## 2 1 female 55
## 3 0 male 515
## 4 1 male 62
NOTE: 1.00 stands for 100% survival rate.
## Fare2 Child Sex Pclass Survived
## 7 $30+ 1 male 1 1
## 10 $30+ 0 female 2 1
## 11 $10-$20 1 female 2 1
## 12 $20-$30 1 female 2 1
## 13 $30+ 1 female 2 1
## 20 $30+ 1 male 2 1
## Fare2 Sex Pclass Survived
## 8 $30+ female 2 1.0000000
## 2 $30+ female 1 0.9772727
## 6 $10-$20 female 2 0.9142857
## 7 $20-$30 female 2 0.9000000
## 1 $20-$30 female 1 0.8333333
## 16 <$10 female 3 0.5937500
## Sex Pclass Survived
## 1 female 1 0.9680851
## 3 female 2 0.9210526
## 5 female 3 0.5000000
## 2 male 1 0.3688525
## 4 male 2 0.1574074
## 6 male 3 0.1354467
## Fare2 Child Pclass Survived
## 11 $30+ 1 2 1.0000000
## 4 $30+ 1 1 0.9166667
## 10 $20-$30 1 2 0.8888889
## 9 $10-$20 1 2 0.8571429
## 16 $10-$20 1 3 0.7500000
## 2 $30+ 0 1 0.6666667
## Child Sex Pclass Survived
## 4 1 male 1 1.0000000
## 6 1 female 2 1.0000000
## 1 0 female 1 0.9767442
## 5 0 female 2 0.9062500
## 2 1 female 1 0.8750000
## 8 1 male 2 0.8181818
## Child Sex Survived
## 1 0 female 0.7528958
## 2 1 female 0.6909091
## 4 1 male 0.4032258
## 3 0 male 0.1631068
## Child Pclass Survived
## 2 1 1 0.9166667
## 4 1 2 0.9130435
## 1 0 1 0.6127451
## 3 0 2 0.4099379
## 6 1 3 0.3780488
## 5 0 3 0.2151589
## Fare2 Child Survived
## 5 $10-$20 1 0.7741935
## 3 $30+ 0 0.6050000
## 7 $30+ 1 0.5000000
## 2 $20-$30 0 0.4272727
## 6 $20-$30 1 0.4230769
## 8 <$10 1 0.4000000
# Model fitting
model <- glm(Survived ~.,family=binomial(link='logit'),data=train)
# Analysis of deviance
anova(model,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survived
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 890 1186.66
## PassengerId 1 0.022 889 1186.63 0.881201
## Pclass 1 102.523 888 1084.11 < 2.2e-16 ***
## Name 18 313.614 870 770.50 < 2.2e-16 ***
## Sex 1 6.938 869 763.56 0.008438 **
## Age 1 6.010 868 757.55 0.014226 *
## SibSp 1 36.335 867 721.21 1.661e-09 ***
## Parch 1 6.021 866 715.19 0.014140 *
## Fare 1 2.742 865 712.45 0.097761 .
## FamilySize 0 0.000 865 712.45
## Fare2 3 2.300 862 710.15 0.512454
## Child 1 0.644 861 709.51 0.422412
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# McFadden R^2
pR2(model)
## llh llhNull G2 McFadden r2ML
## -354.7533940 -593.3275684 477.1483488 0.4020952 0.4146352
## r2CU
## 0.5633594
#-------------------------------------------------------------------------------------------------#
# MEASURING THE PREDICTIVE ABILITY OF THE MODEL
# If prob > 0.5 then 1, else 0. Threshold can be set for better results
fitted.results <- predict(model,newdata=train,type='response') #use test
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != train$Survived)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.829405162738496"
# Confusion matrix
confusionMatrix(data=fitted.results, reference=train$Survived) #use test
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 483 86
## 1 66 256
##
## Accuracy : 0.8294
## 95% CI : (0.8031, 0.8535)
## No Information Rate : 0.6162
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6353
## Mcnemar's Test P-Value : 0.1233
##
## Sensitivity : 0.8798
## Specificity : 0.7485
## Pos Pred Value : 0.8489
## Neg Pred Value : 0.7950
## Prevalence : 0.6162
## Detection Rate : 0.5421
## Detection Prevalence : 0.6386
## Balanced Accuracy : 0.8142
##
## 'Positive' Class : 0
##
# ROC and AUC
p <- predict(model, newdata=train, type="response") #use test
pr <- prediction(p, train$Survived) #use test
# TPR = sensitivity, FPR=specificity
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8837972
Alternative Way: Not evaluated
perf = function(pred, act) {
# Confusion Matrix
xtab = table(pred, act);
# Retrieve TP, FP, TN, FN
TP = xtab[1,1]
if(nrow(xtab)==2) {
TN = xtab[2,2]
}
else {
TN = 0
}
if(nrow(xtab)==2) {
FP = xtab[1,2]
}
else {
FP = 0
}
FN = xtab[2,1]
# Compute Accuracy, Precision, Recall, F-Measure
r = matrix(NA, nrow=1, ncol=4)
colnames(r) = c('Accuracy', 'Precision', 'Recall', 'F-Measure')
r[1,1] = (TP+TN)/(TP+TN+FP+FN)
r[1,2] = TP/(TP+FP)
r[1,3] = TP/(TP+FN)
r[1,4] = r[1,2]*r[1,3]
}
pred = rep(0, nrow(train)) #use test
#perf(pred, train$Survived) #use test
#-------------------------------------------------------------------------------------------------#
# Generalized Linear Model
train.glm <- glm(Survived ~ Pclass + Sex + Age + Sex*Pclass, family = binomial, data=train)
summary(train.glm)
prob.logistic <- predict.glm(train.glm, newdata = train, type="response") #use test
pred.llr <- vector()
for(i in 1:length(prob.logistic))
{ if(is.na(prob.logistic[i]) || prob.logistic[i] > 0.5)
{ pred.llr[i] <- 1 #Probability > 0.5 then survived
}
else{
pred.llr <- 0
}
}
perf(pred.llr, train$Survived) #use test