Project Title: Modelling on Titanic dataset (Decision tree, lift chart, confusion matrix) to select features that may explain survival of the passengers.

NAME: ASWATHY GUNADEEP

EMAIL: aswathygunadeep@gmail.com

COLLEGE / COMPANY: NATIONAL INSTITUTE OF TECHNOLOGY KARNATAKA

This is a case study on the sinking of the Titanic ship.

PRELIMINARY WORK

setwd("C:/Users/user/Desktop/tarsha systems summer internship/datasets")
titanic.df<-read.csv(paste("Titanic Data.csv", sep=""))
View(titanic.df)
attach(titanic.df)
str(titanic.df)
## 'data.frame':    889 obs. of  8 variables:
##  $ Survived: int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass  : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Sex     : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age     : num  22 38 26 35 35 29.7 54 2 27 14 ...
##  $ SibSp   : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch   : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Fare    : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Embarked: Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
library(car)
## Loading required package: carData
some(titanic.df)
##     Survived Pclass    Sex  Age SibSp Parch     Fare Embarked
## 200        0      3   male 28.0     0     0   9.5000        S
## 280        0      3   male 65.0     0     0   7.7500        Q
## 327        1      2 female 36.0     0     0  13.0000        S
## 435        1      1 female 14.0     1     2 120.0000        S
## 533        1      3 female 29.7     0     2  22.3583        C
## 577        1      1 female 39.0     1     0  55.9000        S
## 655        0      2   male 24.0     2     0  73.5000        S
## 691        1      3 female  4.0     0     1  13.4167        C
## 705        0      2   male 39.0     0     0  26.0000        S
## 736        0      3 female 48.0     1     3  34.3750        S

VISUALIZATION

Problem Statement: We are trying to understand what factors contributed to the survival and death of passengers during the sinking of RMS titanic.

pie(table(titanic.df$Survived),main="passengers survived[0:died, 1:survived]", col=c("blue","green"))

length(titanic.df$Survived)
## [1] 889
with(titanic.df, prop.table(table(Survived))*100)
## Survived
##        0        1 
## 61.75478 38.24522

There were 889 passengers totally on board the Titanic. Out of this, only 38.25% of the passengers survived and the rest 61.75% died.

CORRGRAM

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(corrplot)
## corrplot 0.84 loaded
par(mfrow=c(1,1))
sub2.df <- subset(titanic.df[,c(1,2,4,5,6,7)])
corrplot.mixed(corr=cor(sub2.df, use="complete.obs"), upper = "pie", tl.pos="lt", main="corrgram")

The numbers denote pearson’s correlation between each 2 variables. These are some of the possible conclusions that can be deduced from the above graph:

I 1. no. of passengers in each class

mytable1 <- with(titanic.df, table(Pclass))
hist(titanic.df$Pclass, main="no. of passengers in each class", xlab="passeneger class", col="grey")

prop.table(mytable1)*100
## Pclass
##        1        2        3 
## 24.07199 20.69741 55.23060

About 55% of passengers travelled in 3rd class.

  1. proportion of males and females on board the titanic
library(lattice)
mytable2 <- with(titanic.df, table(Sex))
histogram(titanic.df$Sex, main="proportion of males and females on board the tianic", xlab="gender", col=c("yellow","magenta"))

prop.table(mytable2)*100
## Sex
##   female     male 
## 35.09561 64.90439

There were more male than female passengers on board titanic.

  1. age distribution
plot(titanic.df$Age, main="age distribution", col="blue", ylab="age",xlab="count")

Most passengers are of the age range 20-40 years. The maximum age of passengers goes upto 80 years or more.

  1. Number of Siblings / spouses aboard the Titanic
library(lattice)
mytable <- with(titanic.df, table(SibSp))
barchart(mytable,main="Number of Siblings / spouses aboard the Titanic",col="orange")

Most people came alone to travel.

prop.table(mytable)*100
## SibSp
##          0          1          2          3          4          5 
## 68.1664792 23.5095613  3.1496063  1.7997750  2.0247469  0.5624297 
##          8 
##  0.7874016
  1. Number of Parents / children aboard the Titanic
mytable <- with(titanic.df, table(Parch))
barchart(mytable,main="Number of Parents / children aboard the Titanic",col="pink")

prop.table(mytable)*100
## Parch
##          0          1          2          3          4          5 
## 76.0404949 13.2733408  8.9988751  0.5624297  0.4499438  0.5624297 
##          6 
##  0.1124859

Families were less.

  1. Passenger fare
d <- density(titanic.df$Fare)
plot(d, main="Passenger fare", xlab="fare")
polygon(d, border="black", xlab="fare")

Very few passengers gave fares higher than 200 dollars(mostly 1st class passengers), since no. of 1st class passengers were less.

  1. Port of Embarking {C = Cherbourg, Q = Queenstown, S = Southampton}
mytable <- with(titanic.df, table(Embarked))
barchart(mytable, main="Port of Embarking", horizontal=FALSE, xlab="port", ylab="count")

prop.table(mytable)*100
## Embarked
##         C         Q         S 
## 18.897638  8.661417 72.440945

About 72% passengers boarded from Southampton.

II 1. the number of first-class passengers who survived the sinking of the Titanic

mytable <- xtabs(~ Survived+Pclass, data=titanic.df)
prop.table(mytable)*100
##         Pclass
## Survived         1         2         3
##        0  8.998875 10.911136 41.844769
##        1 15.073116  9.786277 13.385827
library(vcd)
## Loading required package: grid
mosaic(mytable, shade=TRUE, legend=TRUE, main="survivors in each class")

Most survivors were passengers of 1st class and a majority of the people who died travelled in 3rd class. This could be due to the type of services and hospitality available to each class people when the accident happened.

  1. Passenger fare v/s no. of survivors
par(mfrow=c(1, 1))
boxplot(Fare ~ Survived, main="Passenger fare v/s no. of survivors", xlab="survived", ylab="fare", col=c("brown", "red"))

Sinc most survivors were 1st class passengers, the boxplot also supports this statement.

  1. no. of male and female survivors
mytable <- xtabs(~ Sex+Survived, data=titanic.df)
prop.table(mytable)*100
##         Survived
## Sex              0         1
##   female  9.111361 25.984252
##   male   52.643420 12.260967
colors = c("blue","green")
survive <- c("died","survived")
barchart(mytable, col=c("blue","green"), main="no. of male and female survivors (BLUE:DIED, GREEN:SURVIVED)", xlab="count", ylab="gender")

  1. How is fare and passenger class related?
plot(Fare ~ Pclass, col="darkcyan",main="ticket fare v/s passenger class")

1st class passnegers have paid upto 800 dollars, 2nd and 3rd class passengers have paid maximum only upto 100 dollars.

III 1. the number of females from First-Class who survived the sinking of the Titanic

mytable <- xtabs(~ Survived+Pclass+Sex, data=titanic.df)
mytable
## , , Sex = female
## 
##         Pclass
## Survived   1   2   3
##        0   3   6  72
##        1  89  70  72
## 
## , , Sex = male
## 
##         Pclass
## Survived   1   2   3
##        0  77  91 300
##        1  45  17  47
library(lattice)
histogram(~Survived | Pclass+Sex, horizontal=FALSE, col="red")

From the graph and table it is evident that females of 1st class mostly survived(89 out of the 340 survivors). And most people who died were males who travelled in 2nd and 3rd class.

2.fare and gender with survived

library(ggplot2)
qplot(x=Sex,y=Fare,color=factor(Survived))

  1. fare and passenger class with survived
qplot(x=Pclass,y=Fare,color=factor(Survived))

IV 1.CHI SQAURED TESTS

Hypothesis: The proportion of females onboard who survived the sinking of the Titanic was higher than the proportion of males onboard who survived the sinking of the Titanic.

mytable <- xtabs(~ Sex+Survived, data=titanic.df)
mytable
##         Survived
## Sex        0   1
##   female  81 231
##   male   468 109
chisq.test(mytable)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mytable
## X-squared = 258.43, df = 1, p-value < 2.2e-16

Since the p-value is vey small(<0.01), we reject the Null hypothesis.

Hypothesis: The proportion of 1ct class passengers onboard who survived the sinking of the Titanic was higher than the proportion of 2nd and 3rd class passengers onboard who survived the sinking of the Titanic.

mytable <- xtabs(~ Survived+Pclass, data=titanic.df)
mytable
##         Pclass
## Survived   1   2   3
##        0  80  97 372
##        1 134  87 119
chisq.test(mytable)
## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 100.98, df = 2, p-value < 2.2e-16

Since the p-value is vey small(<0.01), we reject the Null hypothesis.

2.t-Tests

NULL hypothesis: People who survived had same fare.

t.test(Fare ~ Survived, data=titanic.df)
## 
##  Welch Two Sample t-test
## 
## data:  Fare by Survived
## t = -6.7597, df = 433.18, p-value = 4.483e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -33.67804 -18.50519
## sample estimates:
## mean in group 0 mean in group 1 
##        22.11789        48.20950

Based on the above output of the t-test, we can reject the hypothesis beacuse the p-value is very less.

V MODELLING

1.Decision tree

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(party)
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## The following object is masked from 'package:car':
## 
##     Predict
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
library(rpart)
library(rpart.plot)
library(ROCR)
set.seed(100)
titanic1 <- "https://goo.gl/At238b" %>% read_csv %>% select(survived, embarked, sex, sibsp, parch, fare) %>% mutate(embarked = factor(embarked), sex = factor(sex))
## Parsed with column specification:
## cols(
##   pclass = col_character(),
##   survived = col_integer(),
##   name = col_character(),
##   sex = col_character(),
##   age = col_double(),
##   sibsp = col_integer(),
##   parch = col_integer(),
##   ticket = col_character(),
##   fare = col_double(),
##   cabin = col_character(),
##   embarked = col_character(),
##   boat = col_character(),
##   body = col_integer(),
##   home.dest = col_character()
## )

SCATTERPLOTMATRIX

library(car)
scatterplotMatrix(~Pclass+Age+SibSp+Parch+Fare+Sex | Survived, data=titanic.df, col=c("red","black"), pch=1, main="pairwise plot with survived variable", lower.panel=NULL)
## Warning in smoother(x[subs], y[subs], col = smoother.args$col[i], log.x =
## FALSE, : could not fit smooth

## Warning in smoother(x[subs], y[subs], col = smoother.args$col[i], log.x =
## FALSE, : could not fit smooth

## Warning in smoother(x[subs], y[subs], col = smoother.args$col[i], log.x =
## FALSE, : could not fit smooth

## Warning in smoother(x[subs], y[subs], col = smoother.args$col[i], log.x =
## FALSE, : could not fit smooth

## Warning in smoother(x[subs], y[subs], col = smoother.args$col[i], log.x =
## FALSE, : could not fit smooth

## Warning in smoother(x[subs], y[subs], col = smoother.args$col[i], log.x =
## FALSE, : could not fit smooth

.data <- c("training", "test") %>%
  sample(nrow(titanic1), replace = T) %>%
  split(titanic1, .)

Recursive partitioning

rtree_fit <- rpart(survived ~ ., 
          .data$training) 
rpart.plot(rtree_fit)

Conditional partinitioning

tree_fit <- ctree(survived ~ ., 
                  data = .data$training)
tree_fit
## 
##   Conditional inference tree with 7 terminal nodes
## 
## Response:  survived 
## Inputs:  embarked, sex, sibsp, parch, fare 
## Number of observations:  641 
## 
## 1) sex == {female}; criterion = 1, statistic = 181.078
##   2) fare <= 46.9; criterion = 0.998, statistic = 12.985
##     3) sibsp <= 2; criterion = 0.965, statistic = 7.272
##       4)*  weights = 155 
##     3) sibsp > 2
##       5)*  weights = 8 
##   2) fare > 46.9
##     6) sibsp <= 0; criterion = 1, statistic = 26.482
##       7)*  weights = 26 
##     6) sibsp > 0
##       8) parch <= 1; criterion = 0.998, statistic = 15.269
##         9)*  weights = 26 
##       8) parch > 1
##         10)*  weights = 12 
## 1) sex == {male}
##   11) fare <= 51.8625; criterion = 0.978, statistic = 8.126
##     12)*  weights = 364 
##   11) fare > 51.8625
##     13)*  weights = 50
plot(tree_fit)

Decsion tree rules show that sex and 3 other inputs(fare,parch,sibsp) predict no. of people survived.

ROC curve

tree_roc <- tree_fit %>%
  predict(newdata = .data$test) %>%
  prediction(.data$test$survived) %>%
  performance("tpr", "fpr")
plot(tree_roc)

Lift curve

.data$test$prediction <- predict(tree_fit, newdata = .data$test)
.data$test$probabilities <- 1- unlist(treeresponse(tree_fit, newdata = .data$test), use.names=F)[seq(1,nrow(.data$test)*2,2)]
pred <- prediction(.data$test$probabilities, .data$test$survived)
perf <- performance(pred,"lift","rpp")
plot(perf, main="lift curve", colorize=T)

We’re looking at the improvement of predictions as a function of the unpredicted values. The possible improvement multiple declines sharply and then has alternate increase and decreases until rpp=1.0(100% positive prediction of values using the model). The correct interpretation of this lift chart is that the model plotted in red gives the greatest predictive lift, that is upto 0.65.

  1. Logistic regression
model <- glm(Survived ~ Pclass + Sex+ Age+ SibSp + Parch + Fare + Embarked, family=binomial(link='logit'),data=titanic.df)
summary(model)
## 
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp + Parch + 
##     Fare + Embarked, family = binomial(link = "logit"), data = titanic.df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6446  -0.5907  -0.4230   0.6220   2.4432  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.285269   0.564781   9.358  < 2e-16 ***
## Pclass      -1.100058   0.143529  -7.664 1.80e-14 ***
## Sexmale     -2.718719   0.200784 -13.541  < 2e-16 ***
## Age         -0.039902   0.007854  -5.080 3.77e-07 ***
## SibSp       -0.325765   0.109383  -2.978   0.0029 ** 
## Parch       -0.092618   0.118709  -0.780   0.4353    
## Fare         0.001918   0.002376   0.807   0.4195    
## EmbarkedQ   -0.034099   0.381935  -0.089   0.9289    
## EmbarkedS   -0.418855   0.236793  -1.769   0.0769 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1182.82  on 888  degrees of freedom
## Residual deviance:  784.19  on 880  degrees of freedom
## AIC: 802.19
## 
## Number of Fisher Scoring iterations: 5

The sginficant variables are gender,passenger class and age.

library(gplots)
fitted.results <- predict(model,newdata=subset(titanic.df,select=c(2,3,4)),type='response')
pr <- prediction(fitted.results, titanic.df$Survived)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8572994

Since there is 86% accuracy, this is a good fit.

LIFT CURVE

perf <- performance(pr,"lift","rpp")
plot(perf, main="lift curve", colorize=T)

The possible improvement multiple has a sudden dip initially and then decreases at a slow rate. The model predicts the greatest predictive lift(red colour in graph) till 0.1 only.