dataset obtained from Kaggle.
library(ggplot2)
library(visdat)
library(caTools)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
setwd("/Users/jennycheng/Downloads/Titanic\ Project")
#first lets load in the dataset and treat all empty values as NA.
titanic_train <- read.csv("train.csv", na.strings = c(""))
titanic_test <- read.csv("test.csv", na.strings = c(""))
#The test set is missing the Survived column, so add it filled with NA.
#Also add a column to differentiate the training and test sets.
titanic_test$Survived = NA
titanic_train$Set <- rep("train", nrow(titanic_train))
titanic_test$Set <- rep("test", nrow(titanic_test))
#Combine the test set with the training set.
titanic = rbind(titanic_train, titanic_test)
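#As a side note, a hedged alternative sketch for this combine step is dplyr::bind_rows(),
#which fills columns present in only one data frame with NA (so the manual Survived <- NA
#step above would not be needed). Depending on the dplyr version, factors with differing
#levels may be coerced to character or have their levels unioned. titanic_alt is just an
#illustrative name; the analysis below keeps using the rbind() result.
titanic_alt <- dplyr::bind_rows(titanic_train, titanic_test)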
#What data types do we have?
str(titanic)
## 'data.frame': 1309 obs. of 13 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 186 levels "A10","A14","A16",..: NA 82 NA 56 NA NA 130 NA NA NA ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
## $ Set : chr "train" "train" "train" "train" ...
head(titanic)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## Parch Ticket Fare Cabin Embarked Set
## 1 0 A/5 21171 7.2500 <NA> S train
## 2 0 PC 17599 71.2833 C85 C train
## 3 0 STON/O2. 3101282 7.9250 <NA> S train
## 4 0 113803 53.1000 C123 S train
## 5 0 373450 8.0500 <NA> S train
## 6 0 330877 8.4583 <NA> Q train
summary(titanic)
## PassengerId Survived Pclass
## Min. : 1 Min. :0.0000 Min. :1.000
## 1st Qu.: 328 1st Qu.:0.0000 1st Qu.:2.000
## Median : 655 Median :0.0000 Median :3.000
## Mean : 655 Mean :0.3838 Mean :2.295
## 3rd Qu.: 982 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :1309 Max. :1.0000 Max. :3.000
## NA's :418
## Name Sex Age
## Connolly, Miss. Kate : 2 female:466 Min. : 0.17
## Kelly, Mr. James : 2 male :843 1st Qu.:21.00
## Abbing, Mr. Anthony : 1 Median :28.00
## Abbott, Mr. Rossmore Edward : 1 Mean :29.88
## Abbott, Mrs. Stanton (Rosa Hunt): 1 3rd Qu.:39.00
## Abelson, Mr. Samuel : 1 Max. :80.00
## (Other) :1301 NA's :263
## SibSp Parch Ticket Fare
## Min. :0.0000 Min. :0.000 CA. 2343: 11 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.:0.000 1601 : 8 1st Qu.: 7.896
## Median :0.0000 Median :0.000 CA 2144 : 8 Median : 14.454
## Mean :0.4989 Mean :0.385 3101295 : 7 Mean : 33.295
## 3rd Qu.:1.0000 3rd Qu.:0.000 347077 : 7 3rd Qu.: 31.275
## Max. :8.0000 Max. :9.000 347082 : 7 Max. :512.329
## (Other) :1261 NA's :1
## Cabin Embarked Set
## C23 C25 C27 : 6 C :270 Length:1309
## B57 B59 B63 B66: 5 Q :123 Class :character
## G6 : 5 S :914 Mode :character
## B96 B98 : 4 NA's: 2
## C22 C26 : 4
## (Other) : 271
## NA's :1014
#We have 13 variables:
#PassengerId: passenger identifier
#Survived: survival indicator (0 = did not survive, 1 = survived; NA when Set = "test")
#Pclass: passenger class (1st, 2nd, or 3rd)
#Name: passenger name (format: last name, title, first name(s))
#Sex: sex of the passenger
#Age: age of the passenger in years (may be fractional for infants)
#SibSp: number of siblings or spouses aboard
#Parch: number of parents or children aboard
#Ticket: ticket number
#Fare: price of the ticket
#Cabin: cabin number of the passenger
#Embarked: port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
#Set: source of the record (train or test); a quick labelled recode of these coded fields is sketched below.
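#As a hedged illustration of the data dictionary above (purely for readability, not used
#in the analysis below), the coded fields can be recoded into labelled factors.
#titanic_labelled is a hypothetical helper copy.
titanic_labelled <- titanic
titanic_labelled$Survived <- factor(titanic_labelled$Survived, levels = c(0, 1), labels = c("No", "Yes"))
titanic_labelled$Embarked <- factor(titanic_labelled$Embarked, levels = c("C", "Q", "S"),
                                    labels = c("Cherbourg", "Queenstown", "Southampton"))
table(titanic_labelled$Survived, titanic_labelled$Embarked, useNA = "ifany")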
#Let's make sure we have a backup.
titanic_backup <- titanic
#Let's look at the data class structure and missingness.
?vis_dat
vis_miss(titanic)
#Embarked and Fare have a few missing records. Since these account for only 3 of the 1309 rows (well under 1%), let's remove them.
length(titanic[is.na(titanic$Fare),"Fare"])
## [1] 1
titanic[is.na(titanic$Fare),]
## PassengerId Survived Pclass Name Sex Age SibSp Parch
## 1044 1044 NA 3 Storey, Mr. Thomas male 60.5 0 0
## Ticket Fare Cabin Embarked Set
## 1044 3701 NA <NA> S test
#1 record with NA
length(titanic[is.na(titanic$Embarked),"Embarked"])
## [1] 2
titanic[is.na(titanic$Embarked),]
## PassengerId Survived Pclass Name
## 62 62 1 1 Icard, Miss. Amelie
## 830 830 1 1 Stone, Mrs. George Nelson (Martha Evelyn)
## Sex Age SibSp Parch Ticket Fare Cabin Embarked Set
## 62 female 38 0 0 113572 80 B28 <NA> train
## 830 female 62 0 0 113572 80 B28 <NA> train
#2 records with NA
#Overwrite titanic to remove the records with NA in Fare or Embarked (3 rows in total).
titanic <- titanic[!is.na(titanic$Fare) & !is.na(titanic$Embarked),]
vis_miss(titanic)
#Now that Fare and Embarked are clean, let's look at the remaining variables.
#Age and Cabin have the most missing values. Since the cabin ID isn't important for this
#analysis, we can disregard Cabin for now and focus on Age.
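#To complement the vis_miss() plot with exact figures, a minimal sketch using base R
#(assuming the cleaned titanic data frame from above):
colSums(is.na(titanic))                                   #count of NAs per column
round(colSums(is.na(titanic)) / nrow(titanic) * 100, 2)   #percentage of NAs per column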
#Distributions
#Let's look at the distribution of Age.
ggplot(titanic, aes(x=Age)) +
geom_histogram(colour="black", fill="seagreen4",binwidth=10) +
ggtitle("Age Distribution") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(panel.grid.minor = element_line(colour = "red", linetype = "dotted"), panel.background = element_rect(colour = "darkblue"))
## Warning: Removed 263 rows containing non-finite values (stat_bin).
#Is Age normally distributed?
shapiro.test(titanic$Age)
##
## Shapiro-Wilk normality test
##
## data: titanic$Age
## W = 0.97972, p-value = 6.915e-11
#As the p-value is < 0.05, we reject the null hypothesis (H0): Age is not normally distributed.
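#A hedged visual check of the same conclusion using a base-R QQ plot (illustrative only;
#NAs in Age are dropped automatically):
qqnorm(titanic$Age, main = "Normal Q-Q Plot of Age")
qqline(titanic$Age, col = "red")   #points bowing away from the line indicate non-normality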
#From vis_miss(), we can see that 20.14% of Age values are missing.
#Let's replace missing values in Age using the median imputation method.
med_age <- median(titanic$Age, na.rm = TRUE)
mean_age <- mean(titanic$Age, na.rm = TRUE)
titanic[is.na(titanic$Age), "Age"] <- med_age
#Now let's look at vis_miss() again.
vis_miss(titanic)
#Now that missing values have been dealt with, let's explore a variable of interest: survival.
#How many people survived? How many didn't?
table(titanic$Survived)
##
## 0 1
## 549 340
340+549
## [1] 889
340/889
## [1] 0.3824522
#Overall, about 38% of passengers in the training data survived.
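#The same proportion can be computed directly; a minimal sketch (table() drops the NA
#Survived values from the test set by default):
prop.table(table(titanic$Survived))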
#Now, let's look at the survival rate by different factors
ggplot(titanic, aes(x=Survived, fill=Sex)) +
geom_bar(width=0.7) +
scale_fill_manual(values=c("hotpink", "deepskyblue"))+
theme(panel.grid.minor=element_blank(),panel.grid.major=element_blank())+
scale_x_continuous(breaks = c(0,1), labels = c("No", "Yes"))
## Warning: Removed 417 rows containing non-finite values (stat_count).
#Women had a higher chance of surviving than men on the Titanic.
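#To quantify this, a hedged sketch of the survival rate by sex using dplyr
#(training rows only, i.e. rows where Survived is not NA):
titanic %>%
  filter(!is.na(Survived)) %>%
  group_by(Sex) %>%
  summarise(survival_rate = mean(Survived))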
#How about by class?
#First we have to convert class from a number to a factor for the bar chart.
titanic$Pclass = as.factor(titanic$Pclass)
titanic2 <- titanic %>%
group_by(Survived,Pclass) %>%
summarise(count=n()) %>%
mutate(perc=count/sum(count))
ggplot(titanic2, aes(x = Survived, y = perc*100, fill = Pclass)) +
geom_bar(stat="identity", width = 0.7) +
scale_fill_manual(values=c("hotpink", "deepskyblue", "midnightblue")) +
theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank()) +
scale_x_continuous(breaks = c(0,1), labels = c("No", "Yes")) +
labs(x = "Survived", y = "Percentage", fill = "Pclass")
## Warning: Removed 3 rows containing missing values (position_stack).
#As you can see, first-class passengers had the highest chance of surviving, compared to
#second- or third-class passengers.
#Being in third class made you much less likely to survive.
#How about survival by age?
#First, let's bin Age into a few groups.
titanic$AgeF <- "Infant"
titanic$AgeF[titanic$Age >= 2] <- "Child"
titanic$AgeF[titanic$Age >= 12] <- "Adult"
titanic$AgeF[titanic$Age >= 51] <- "Elderly"
titanic$AgeF <- as.factor(titanic$AgeF)
#According to the Titanic's certificate of clearance, passengers aged 12 or older were classed as adults.
#The median life expectancy in 1912 (when the Titanic sank) was 51.5 years for both men and women, hence the 51+ cut-off for the Elderly group.
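#The same binning can be written more compactly with cut(); a hedged sketch that should
#reproduce the assignments above (AgeF2 is a hypothetical alternative column, kept only
#for the cross-check):
titanic$AgeF2 <- cut(titanic$Age,
                     breaks = c(-Inf, 2, 12, 51, Inf),
                     labels = c("Infant", "Child", "Adult", "Elderly"),
                     right = FALSE)   #left-closed intervals: [.,2), [2,12), [12,51), [51,.)
table(titanic$AgeF, titanic$AgeF2)    #cross-check against the manual assignment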
titanic3 <- titanic %>%
group_by(Survived,AgeF) %>%
summarise(count=n()) %>%
mutate(perc1=count/sum(count))
ggplot(titanic3, aes(x = Survived, y = perc1*100, fill = factor(AgeF))) +
geom_bar(stat="identity", width = 0.7) +
scale_fill_manual(values=c("hotpink", "deepskyblue", "midnightblue", "green")) +
scale_x_continuous(breaks = c(0,1), labels = c("No", "Yes")) +
theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank()) +
labs(x = "Survived", y = "Percentage", fill = "Pclass")
## Warning: Removed 4 rows containing missing values (position_stack).
#Infants had a higher chance of survival compared to the rest.
#If you were elderly, you were less likely to survive.
#Fare is numeric. Let's look at the distribution of how much passengers paid to board the Titanic.
ggplot(titanic, aes(x=Fare)) +
geom_histogram(colour="black", fill="seagreen4", binwidth=10) +
ggtitle("Fare Distribution") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(panel.grid.minor = element_line(colour = "red", linetype = "dotted"), panel.background = element_rect(colour = "darkblue"))
mean(titanic$Fare)
#The average fare was roughly $33 (consistent with the summary() output above).
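#Because the Fare distribution is heavily right-skewed, the median is arguably more
#representative of what a typical passenger paid (around $14, per the summary above):
median(titanic$Fare)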
#Now let's create a correlation matrix to study the different relationships between
#the data elements.
titanic_backup$Pclass = as.numeric(titanic_backup$Pclass)
titanic_backup$SexN = 1
titanic_backup$SexN[titanic_backup$Sex == "male"] = 0
#Female = 1, Male = 0
titanic_numeric = titanic_backup[, sapply(titanic_backup, is.numeric)]
titanic_numeric = titanic_numeric[,-1] #PassengerId isn't needed; drop it before building the correlation matrix.
head(titanic_numeric)
## Survived Pclass Age SibSp Parch Fare SexN
## 1 0 3 22 1 0 7.2500 0
## 2 1 1 38 1 0 71.2833 1
## 3 1 3 26 0 0 7.9250 1
## 4 1 1 35 1 0 53.1000 1
## 5 0 3 35 0 0 8.0500 0
## 6 0 3 NA 0 0 8.4583 0
#Now let's make the correlation matrix.
library(corrplot)
## corrplot 0.84 loaded
M = cor(titanic_numeric, use="pairwise.complete.obs")
corrplot(M, cl.pos="b", tl.pos="d", tl.col = "black", tl.cex=0.85, type="upper", method="color")
#Let's look at the actual correlation numbers.
corrplot.mixed(M, lower = "number", upper="color", tl.pos = "d", tl.col = "black", tl.cex=0.85)
#Now let's fit a logistic regression model to interpret the statistical significance of the variables on survival.
classifier = glm(Survived ~ Pclass + Age + SibSp + Parch + Fare + SexN, data = titanic_numeric, family = "binomial")
summary(classifier)
##
## Call:
## glm(formula = Survived ~ Pclass + Age + SibSp + Parch + Fare +
## SexN, family = "binomial", data = titanic_numeric)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7953 -0.6476 -0.3847 0.6271 2.4433
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.754158 0.548818 5.018 5.21e-07 ***
## Pclass -1.242249 0.163191 -7.612 2.69e-14 ***
## Age -0.043953 0.008179 -5.374 7.70e-08 ***
## SibSp -0.375755 0.127361 -2.950 0.00317 **
## Parch -0.061937 0.122925 -0.504 0.61436
## Fare 0.002160 0.002493 0.866 0.38627
## SexN 2.634845 0.219609 11.998 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 964.52 on 713 degrees of freedom
## Residual deviance: 635.81 on 707 degrees of freedom
## (595 observations deleted due to missingness)
## AIC: 649.81
##
## Number of Fisher Scoring iterations: 5
#As we can see, sex has the lowest p-value, indicating it is the most statistically significant predictor of survival.
#It is followed by passenger class, age, and number of siblings/spouses (SibSp).
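#To aid interpretation, the log-odds coefficients can be converted to odds ratios;
#a minimal sketch:
exp(coef(classifier))
#For example, the SexN odds ratio (well above 1) means that, holding the other variables
#fixed, being female multiplies the odds of survival, while the Pclass and Age ratios
#(below 1) correspond to reduced odds.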
#Now let's try to make some predictions
#First, let's create a training set and test set with the complete data only.
dataset_pred = na.omit(titanic_numeric)
rownames(dataset_pred) <- NULL #resetting the data frame row names (index).
set.seed(123) #equivalent to random_state in Python
split = sample.split(dataset_pred$Survived, SplitRatio = 0.80) #(dependent variable, proportion assigned to the training set)
training_set = subset(dataset_pred, split == TRUE)
test_set = subset(dataset_pred, split == FALSE)
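#sample.split() preserves the ratio of the labels in the dependent variable, so the
#survival rate should be similar in both subsets; a quick hedged check:
prop.table(table(training_set$Survived))
prop.table(table(test_set$Survived))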
#Fit the model and predict the test set results.
#(Note: the model below is fitted on all complete cases, dataset_pred; fitting on training_set
#only would keep the held-out test_set fully unseen.)
classifier = glm(Survived ~ Pclass + Age + SibSp + Parch + Fare + SexN, data = dataset_pred, family = "binomial")
prob_pred = predict(classifier, type = 'response', newdata=test_set[-1])
#Transform the vector of probabilities into class labels 1 and 0, using a 0.5 threshold.
y_pred = ifelse(prob_pred>0.50, 1, 0)
y_pred
## 2 5 7 10 11 12 16 20 22 28 33 36 41 50 53 54 56 81
## 1 0 0 1 1 0 0 1 1 0 1 1 0 1 0 0 0 0
## 84 85 91 97 102 103 104 109 125 129 131 132 149 153 159 160 165 170
## 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1
## 172 176 180 189 192 193 199 206 209 213 226 234 238 244 256 266 272 275
## 0 1 0 0 1 0 1 0 1 0 0 1 1 1 1 1 0 1
## 281 283 290 292 296 304 307 311 314 315 317 319 328 334 337 351 364 365
## 0 0 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0
## 367 368 371 377 382 391 395 396 398 402 406 412 414 416 419 427 432 436
## 0 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 0 0
## 439 442 447 451 459 463 464 469 472 475 477 491 501 502 505 506 512 513
## 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 0
## 515 533 535 537 542 543 552 555 561 562 566 570 575 579 585 593 598 600
## 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0
## 606 613 630 633 635 639 640 643 650 655 674 675 677 685 690 694 703
## 0 1 0 0 1 1 1 0 0 0 0 0 0 1 1 1 0
#Time to evaluate the results
#Making the Confusion Matrix
cm = table(test_set$Survived, y_pred) #confusion matrix: rows = actual, columns = predicted
cm
## y_pred
## 0 1
## 0 72 13
## 1 15 43
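#From the confusion matrix we can derive a few additional metrics; a minimal sketch
#(rows = actual, columns = predicted, as built above):
accuracy    <- sum(diag(cm)) / sum(cm)
sensitivity <- cm["1", "1"] / sum(cm["1", ])   #true positive rate: survivors correctly identified
specificity <- cm["0", "0"] / sum(cm["0", ])   #true negative rate: non-survivors correctly identified
c(accuracy = accuracy, sensitivity = sensitivity, specificity = specificity)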
y_pred != test_set$Survived #logical vector: TRUE where the prediction does not match the actual outcome.
## 2 5 7 10 11 12 16 20 22 28 33 36
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE
## 41 50 53 54 56 81 84 85 91 97 102 103
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 104 109 125 129 131 132 149 153 159 160 165 170
## FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE
## 172 176 180 189 192 193 199 206 209 213 226 234
## FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 238 244 256 266 272 275 281 283 290 292 296 304
## TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 307 311 314 315 317 319 328 334 337 351 364 365
## FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE TRUE
## 367 368 371 377 382 391 395 396 398 402 406 412
## FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
## 414 416 419 427 432 436 439 442 447 451 459 463
## FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## 464 469 472 475 477 491 501 502 505 506 512 513
## FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE
## 515 533 535 537 542 543 552 555 561 562 566 570
## FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## 575 579 585 593 598 600 606 613 630 633 635 639
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## 640 643 650 655 674 675 677 685 690 694 703
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
misClasificError = mean(y_pred != test_set$Survived) #proportion of predictions that are incorrect (the misclassification rate).
misClasificError
## [1] 0.1958042
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.804195804195804"
#Our model has an accuracy of roughly 80% on the held-out test set.