dataset obtained from Kaggle.
library(ggplot2)
library(visdat)
library(caTools)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
setwd("/Users/jennycheng/Downloads/Titanic\ Project")
#first lets load in the dataset and treat all empty values as NA.
titanic_train <- read.csv("train.csv", na.strings = c(""))
titanic_test <- read.csv("test.csv", na.strings = c(""))
#The test set is missing the Survived column, so add it filled with NA.
#Also add a column to differentiate the training and test sets.
titanic_test$Survived = NA
titanic_train$Set <- rep("train", nrow(titanic_train))
titanic_test$Set <- rep("test", nrow(titanic_test))
#Combine the test set with the training set.
titanic = rbind(titanic_train, titanic_test)
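#As a side note, a hedged alternative sketch for this combine step is dplyr::bind_rows(),
#which fills columns present in only one data frame with NA (so the manual Survived <- NA
#step above would not be needed). Depending on the dplyr version, factors with differing
#levels may be coerced to character or have their levels unioned. titanic_alt is just an
#illustrative name; the analysis below keeps using the rbind() result.
titanic_alt <- dplyr::bind_rows(titanic_train, titanic_test)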
#What data types do we have?
str(titanic)
## 'data.frame': 1309 obs. of 13 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 186 levels "A10","A14","A16",..: NA 82 NA 56 NA NA 130 NA NA NA ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
## $ Set : chr "train" "train" "train" "train" ...
head(titanic)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## Parch Ticket Fare Cabin Embarked Set
## 1 0 A/5 21171 7.2500 <NA> S train
## 2 0 PC 17599 71.2833 C85 C train
## 3 0 STON/O2. 3101282 7.9250 <NA> S train
## 4 0 113803 53.1000 C123 S train
## 5 0 373450 8.0500 <NA> S train
## 6 0 330877 8.4583 <NA> Q train
summary(titanic)
## PassengerId Survived Pclass
## Min. : 1 Min. :0.0000 Min. :1.000
## 1st Qu.: 328 1st Qu.:0.0000 1st Qu.:2.000
## Median : 655 Median :0.0000 Median :3.000
## Mean : 655 Mean :0.3838 Mean :2.295
## 3rd Qu.: 982 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :1309 Max. :1.0000 Max. :3.000
## NA's :418
## Name Sex Age
## Connolly, Miss. Kate : 2 female:466 Min. : 0.17
## Kelly, Mr. James : 2 male :843 1st Qu.:21.00
## Abbing, Mr. Anthony : 1 Median :28.00
## Abbott, Mr. Rossmore Edward : 1 Mean :29.88
## Abbott, Mrs. Stanton (Rosa Hunt): 1 3rd Qu.:39.00
## Abelson, Mr. Samuel : 1 Max. :80.00
## (Other) :1301 NA's :263
## SibSp Parch Ticket Fare
## Min. :0.0000 Min. :0.000 CA. 2343: 11 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.:0.000 1601 : 8 1st Qu.: 7.896
## Median :0.0000 Median :0.000 CA 2144 : 8 Median : 14.454
## Mean :0.4989 Mean :0.385 3101295 : 7 Mean : 33.295
## 3rd Qu.:1.0000 3rd Qu.:0.000 347077 : 7 3rd Qu.: 31.275
## Max. :8.0000 Max. :9.000 347082 : 7 Max. :512.329
## (Other) :1261 NA's :1
## Cabin Embarked Set
## C23 C25 C27 : 6 C :270 Length:1309
## B57 B59 B63 B66: 5 Q :123 Class :character
## G6 : 5 S :914 Mode :character
## B96 B98 : 4 NA's: 2
## C22 C26 : 4
## (Other) : 271
## NA's :1014
#We have 13 variables:
#PassengerId: passenger identifier
#Survived: survival indicator (0 = did not survive, 1 = survived; NA when Set = "test")
#Pclass: passenger class (1st, 2nd, or 3rd)
#Name: passenger name (format: last name, title, first name(s))
#Sex: sex of the passenger
#Age: age of the passenger in years (may be fractional for infants)
#SibSp: number of siblings or spouses aboard
#Parch: number of parents or children aboard
#Ticket: ticket number
#Fare: price of the ticket
#Cabin: cabin number of the passenger
#Embarked: port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
#Set: source of the record (train or test); a quick labelled recode of these coded fields is sketched below.
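#As a hedged illustration of the data dictionary above (purely for readability, not used
#in the analysis below), the coded fields can be recoded into labelled factors.
#titanic_labelled is a hypothetical helper copy.
titanic_labelled <- titanic
titanic_labelled$Survived <- factor(titanic_labelled$Survived, levels = c(0, 1), labels = c("No", "Yes"))
titanic_labelled$Embarked <- factor(titanic_labelled$Embarked, levels = c("C", "Q", "S"),
                                    labels = c("Cherbourg", "Queenstown", "Southampton"))
table(titanic_labelled$Survived, titanic_labelled$Embarked, useNA = "ifany")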
#Let's make sure we have a backup.
titanic_backup <- titanic
#Let's look at the data class structure and missingness.
?vis_dat
vis_miss(titanic)
#Embarked and Fare have a few missing records. Since these account for only 3 of the 1309 rows (well under 1%), let's remove them.
length(titanic[is.na(titanic$Fare),"Fare"])
## [1] 1
titanic[is.na(titanic$Fare),]
## PassengerId Survived Pclass Name Sex Age SibSp Parch
## 1044 1044 NA 3 Storey, Mr. Thomas male 60.5 0 0
## Ticket Fare Cabin Embarked Set
## 1044 3701 NA <NA> S test
#1 record with NA
length(titanic[is.na(titanic$Embarked),"Embarked"])
## [1] 2
titanic[is.na(titanic$Embarked),]
## PassengerId Survived Pclass Name
## 62 62 1 1 Icard, Miss. Amelie
## 830 830 1 1 Stone, Mrs. George Nelson (Martha Evelyn)
## Sex Age SibSp Parch Ticket Fare Cabin Embarked Set
## 62 female 38 0 0 113572 80 B28 <NA> train
## 830 female 62 0 0 113572 80 B28 <NA> train
#2 records with NA
#Overwrite titanic to remove the records with NA in Fare or Embarked (3 rows in total).
titanic <- titanic[!is.na(titanic$Fare) & !is.na(titanic$Embarked),]
vis_miss(titanic)
#Now that Fare and Embarked are clean, let's look at the remaining variables.
#Age and Cabin have the most missing values. Since the cabin ID isn't important for this
#analysis, we can disregard Cabin for now and focus on Age.
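#To complement the vis_miss() plot with exact figures, a minimal sketch using base R
#(assuming the cleaned titanic data frame from above):
colSums(is.na(titanic))                                   #count of NAs per column
round(colSums(is.na(titanic)) / nrow(titanic) * 100, 2)   #percentage of NAs per column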
#Distributions
#Let's look at the distribution of Age.
ggplot(titanic, aes(x=Age)) +
geom_histogram(colour="black", fill="seagreen4",binwidth=10) +
ggtitle("Age Distribution") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(panel.grid.minor = element_line(colour = "red", linetype = "dotted"), panel.background = element_rect(colour = "darkblue"))
## Warning: Removed 263 rows containing non-finite values (stat_bin).
#Is Age normally distributed?
shapiro.test(titanic$Age)
##
## Shapiro-Wilk normality test
##
## data: titanic$Age
## W = 0.97972, p-value = 6.915e-11
#As the p-value is < 0.05, we reject the null hypothesis (H0): Age is not normally distributed.
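#A hedged visual check of the same conclusion using a base-R QQ plot (illustrative only;
#NAs in Age are dropped automatically):
qqnorm(titanic$Age, main = "Normal Q-Q Plot of Age")
qqline(titanic$Age, col = "red")   #points bowing away from the line indicate non-normality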
#From vis_miss(), we can see that 20.14% of Age values are missing.
#Let's replace missing values in Age using the median imputation method.
med_age <- median(titanic$Age, na.rm = TRUE)
mean_age <- mean(titanic$Age, na.rm = TRUE)
titanic[is.na(titanic$Age), "Age"] <- med_age
#Now let's look at vis_miss() again.
vis_miss(titanic)
#Now that missing values have been dealt with, let's explore a variable of interest: survival.
#How many people survived? How many didn't?
table(titanic$Survived)
##
## 0 1
## 549 340
340+549
## [1] 889
340/889
## [1] 0.3824522
#Overall, about 38% of passengers in the training data survived.
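#The same proportion can be computed directly; a minimal sketch (table() drops the NA
#Survived values from the test set by default):
prop.table(table(titanic$Survived))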
#Now, let's look at the survival rate by different factors
ggplot(titanic, aes(x=Survived, fill=Sex)) +
geom_bar(width=0.7) +
scale_fill_manual(values=c("hotpink", "deepskyblue"))+
theme(panel.grid.minor=element_blank(),panel.grid.major=element_blank())+
scale_x_continuous(breaks = c(0,1), labels = c("No", "Yes"))
## Warning: Removed 417 rows containing non-finite values (stat_count).
#Women had a higher chance of surviving than men on the Titanic.
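#To quantify this, a hedged sketch of the survival rate by sex using dplyr
#(training rows only, i.e. rows where Survived is not NA):
titanic %>%
  filter(!is.na(Survived)) %>%
  group_by(Sex) %>%
  summarise(survival_rate = mean(Survived))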
#How about by class?
#First we have to convert class from a number to a factor for the bar chart.
titanic$Pclass = as.factor(titanic$Pclass)
titanic2 <- titanic %>%
group_by(Survived,Pclass) %>%
summarise(count=n()) %>%
mutate(perc=count/sum(count))
ggplot(titanic2, aes(x = Survived, y = perc*100, fill = Pclass)) +
geom_bar(stat="identity", width = 0.7) +
scale_fill_manual(values=c("hotpink", "deepskyblue", "midnightblue")) +
theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank()) +
scale_x_continuous(breaks = c(0,1), labels = c("No", "Yes")) +
labs(x = "Survived", y = "Percentage", fill = "Pclass")
## Warning: Removed 3 rows containing missing values (position_stack).
#As you can see, first-class passengers had the highest chance of surviving, compared to
#second- or third-class passengers.
#Being in third class made you much less likely to survive.
#How about survival by age?
#First, let's bin Age into a few groups.
titanic$AgeF <- "Infant"
titanic$AgeF[titanic$Age >= 2] <- "Child"
titanic$AgeF[titanic$Age >= 12] <- "Adult"
titanic$AgeF[titanic$Age >= 51] <- "Elderly"
titanic$AgeF <- as.factor(titanic$AgeF)
#According to the Titanic's certificate of clearance, passengers aged 12 or older were classed as adults.
#The median life expectancy in 1912 (when the Titanic sank) was 51.5 years for both men and women, hence the 51+ cut-off for the Elderly group.
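#The same binning can be written more compactly with cut(); a hedged sketch that should
#reproduce the assignments above (AgeF2 is a hypothetical alternative column, kept only
#for the cross-check):
titanic$AgeF2 <- cut(titanic$Age,
                     breaks = c(-Inf, 2, 12, 51, Inf),
                     labels = c("Infant", "Child", "Adult", "Elderly"),
                     right = FALSE)   #left-closed intervals: [.,2), [2,12), [12,51), [51,.)
table(titanic$AgeF, titanic$AgeF2)    #cross-check against the manual assignment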
titanic3 <- titanic %>%
group_by(Survived,AgeF) %>%
summarise(count=n()) %>%
mutate(perc1=count/sum(count))
ggplot(titanic3, aes(x = Survived, y = perc1*100, fill = factor(AgeF))) +
geom_bar(stat="identity", width = 0.7) +
scale_fill_manual(values=c("hotpink", "deepskyblue", "midnightblue", "green")) +
scale_x_continuous(breaks = c(0,1), labels = c("No", "Yes")) +
theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank()) +
labs(x = "Survived", y = "Percentage", fill = "Pclass")
## Warning: Removed 4 rows containing missing values (position_stack).
#Infants had a higher chance of survival compared to the rest.
#If you were elderly, you were less likely to survive.
#Fare is numeric. Let's look at the distribution of how much passengers paid to board the Titanic.
ggplot(titanic, aes(x=Fare)) +
geom_histogram(colour="black", fill="seagreen4", binwidth=10) +
ggtitle("Fare Distribution") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(panel.grid.minor = element_line(colour = "red", linetype = "dotted"), panel.background = element_rect(colour = "darkblue"))
mean(titanic$Fare)
#The average fare was roughly $33 (consistent with the summary() output above).
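#Because the Fare distribution is heavily right-skewed, the median is arguably more
#representative of what a typical passenger paid (around $14, per the summary above):
median(titanic$Fare)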
#Now let's create a correlation matrix to study the different relationships between
#the data elements.
titanic_backup$Pclass = as.numeric(titanic_backup$Pclass)
titanic_backup$SexN = 1
titanic_backup$SexN[titanic_backup$Sex == "male"] = 0
#Female = 1, Male = 0
titanic_numeric = titanic_backup[, sapply(titanic_backup, is.numeric)]
titanic_numeric = titanic_numeric[,-1] #PassengerId isn't needed; drop it before building the correlation matrix.
head(titanic_numeric)
## Survived Pclass Age SibSp Parch Fare SexN
## 1 0 3 22 1 0 7.2500 0
## 2 1 1 38 1 0 71.2833 1
## 3 1 3 26 0 0 7.9250 1
## 4 1 1 35 1 0 53.1000 1
## 5 0 3 35 0 0 8.0500 0
## 6 0 3 NA 0 0 8.4583 0
#Now let's make the correlation matrix.
library(corrplot)
## corrplot 0.84 loaded
M = cor(titanic_numeric, use="pairwise.complete.obs")
corrplot(M, cl.pos="b", tl.pos="d", tl.col = "black", tl.cex=0.85, type="upper", method="color")
#Let's look at the actual correlation numbers.
corrplot.mixed(M, lower = "number", upper="color", tl.pos = "d", tl.col = "black", tl.cex=0.85)
#Now let's fit a logistic regression model to interpret the statistical significance of the variables on survival.
classifier = glm(Survived ~ Pclass + Age + SibSp + Parch + Fare + SexN, data = titanic_numeric, family = "binomial")
summary(classifier)
##
## Call:
## glm(formula = Survived ~ Pclass + Age + SibSp + Parch + Fare +
## SexN, family = "binomial", data = titanic_numeric)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7953 -0.6476 -0.3847 0.6271 2.4433
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.754158 0.548818 5.018 5.21e-07 ***
## Pclass -1.242249 0.163191 -7.612 2.69e-14 ***
## Age -0.043953 0.008179 -5.374 7.70e-08 ***
## SibSp -0.375755 0.127361 -2.950 0.00317 **
## Parch -0.061937 0.122925 -0.504 0.61436
## Fare 0.002160 0.002493 0.866 0.38627
## SexN 2.634845 0.219609 11.998 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 964.52 on 713 degrees of freedom
## Residual deviance: 635.81 on 707 degrees of freedom
## (595 observations deleted due to missingness)
## AIC: 649.81
##
## Number of Fisher Scoring iterations: 5
#As we can see, sex has the lowest p-value, indicating it is the most statistically significant predictor of survival.
#It is followed by passenger class, age, and number of siblings/spouses (SibSp).
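#To aid interpretation, the log-odds coefficients can be converted to odds ratios;
#a minimal sketch:
exp(coef(classifier))
#For example, the SexN odds ratio (well above 1) means that, holding the other variables
#fixed, being female multiplies the odds of survival, while the Pclass and Age ratios
#(below 1) correspond to reduced odds.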
#Now let's try to make some predictions
#First, let's create a training set and test set with the complete data only.
dataset_pred = na.omit(titanic_numeric)
rownames(dataset_pred) <- NULL #resetting the data frame row names (index).
set.seed(123) #equivalent to random_state in Python
split = sample.split(dataset_pred$Survived, SplitRatio = 0.80) #(dependent variable, proportion assigned to the training set)
training_set = subset(dataset_pred, split == TRUE)
test_set = subset(dataset_pred, split == FALSE)
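#sample.split() preserves the ratio of the labels in the dependent variable, so the
#survival rate should be similar in both subsets; a quick hedged check:
prop.table(table(training_set$Survived))
prop.table(table(test_set$Survived))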
#Fit the model and predict the test set results.
#(Note: the model below is fitted on all complete cases, dataset_pred; fitting on training_set
#only would keep the held-out test_set fully unseen.)
classifier = glm(Survived ~ Pclass + Age + SibSp + Parch + Fare + SexN, data = dataset_pred, family = "binomial")
prob_pred = predict(classifier, type = 'response', newdata=test_set[-1])
#Transform the vector of probabilities into class labels 1 and 0, using a 0.5 threshold.
y_pred = ifelse(prob_pred>0.50, 1, 0)
y_pred
## 2 5 7 10 11 12 16 20 22 28 33 36 41 50 53 54 56 81
## 1 0 0 1 1 0 0 1 1 0 1 1 0 1 0 0 0 0
## 84 85 91 97 102 103 104 109 125 129 131 132 149 153 159 160 165 170
## 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1
## 172 176 180 189 192 193 199 206 209 213 226 234 238 244 256 266 272 275
## 0 1 0 0 1 0 1 0 1 0 0 1 1 1 1 1 0 1
## 281 283 290 292 296 304 307 311 314 315 317 319 328 334 337 351 364 365
## 0 0 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0
## 367 368 371 377 382 391 395 396 398 402 406 412 414 416 419 427 432 436
## 0 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 0 0
## 439 442 447 451 459 463 464 469 472 475 477 491 501 502 505 506 512 513
## 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 0
## 515 533 535 537 542 543 552 555 561 562 566 570 575 579 585 593 598 600
## 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0
## 606 613 630 633 635 639 640 643 650 655 674 675 677 685 690 694 703
## 0 1 0 0 1 1 1 0 0 0 0 0 0 1 1 1 0
#Time to evaluate the results
#Making the Confusion Matrix
cm = table(test_set$Survived, y_pred) #confusion matrix: rows = actual, columns = predicted
cm
## y_pred
## 0 1
## 0 72 13
## 1 15 43
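#From the confusion matrix we can derive a few additional metrics; a minimal sketch
#(rows = actual, columns = predicted, as built above):
accuracy    <- sum(diag(cm)) / sum(cm)
sensitivity <- cm["1", "1"] / sum(cm["1", ])   #true positive rate: survivors correctly identified
specificity <- cm["0", "0"] / sum(cm["0", ])   #true negative rate: non-survivors correctly identified
c(accuracy = accuracy, sensitivity = sensitivity, specificity = specificity)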
y_pred != test_set$Survived #logical vector: TRUE where the prediction does not match the actual outcome.
## 2 5 7 10 11 12 16 20 22 28 33 36
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE
## 41 50 53 54 56 81 84 85 91 97 102 103
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 104 109 125 129 131 132 149 153 159 160 165 170
## FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE
## 172 176 180 189 192 193 199 206 209 213 226 234
## FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 238 244 256 266 272 275 281 283 290 292 296 304
## TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 307 311 314 315 317 319 328 334 337 351 364 365
## FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE TRUE
## 367 368 371 377 382 391 395 396 398 402 406 412
## FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## 414 416 419 427 432 436 439 442 447 451 459 463
## FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## 464 469 472 475 477 491 501 502 505 506 512 513
## FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE
## 515 533 535 537 542 543 552 555 561 562 566 570
## FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## 575 579 585 593 598 600 606 613 630 633 635 639
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## 640 643 650 655 674 675 677 685 690 694 703
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
misClasificError = mean(y_pred != test_set$Survived) #proportion of predictions that are incorrect (the misclassification rate).
misClasificError
## [1] 0.1958042
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.804195804195804"
#Our model has an accuracy of roughly 80% on the held-out test set.