The data is from the kaggle … competititon (https://www.kaggle.com/c/titanic)
setwd("C:/Users/Owner/Desktop/MachineLearningR_sampleData/TitanicDataSet")
titanic_train <- read.csv("train.csv", stringsAsFactors = TRUE)
titanic_test <- read.csv("test.csv", stringsAsFactors = TRUE)
dim(titanic_train)
## [1] 891 12
head(titanic_train)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## Parch Ticket Fare Cabin Embarked
## 1 0 A/5 21171 7.2500 S
## 2 0 PC 17599 71.2833 C85 C
## 3 0 STON/O2. 3101282 7.9250 S
## 4 0 113803 53.1000 C123 S
## 5 0 373450 8.0500 S
## 6 0 330877 8.4583 Q
head(titanic_test)
## PassengerId Pclass Name Sex
## 1 892 3 Kelly, Mr. James male
## 2 893 3 Wilkes, Mrs. James (Ellen Needs) female
## 3 894 2 Myles, Mr. Thomas Francis male
## 4 895 3 Wirz, Mr. Albert male
## 5 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female
## 6 897 3 Svensson, Mr. Johan Cervin male
## Age SibSp Parch Ticket Fare Cabin Embarked
## 1 34.5 0 0 330911 7.8292 Q
## 2 47.0 1 0 363272 7.0000 S
## 3 62.0 0 0 240276 9.6875 Q
## 4 27.0 0 0 315154 8.6625 S
## 5 22.0 1 1 3101298 12.2875 S
## 6 14.0 0 0 7538 9.2250 S
library(Amelia)
## Warning: package 'Amelia' was built under R version 3.4.2
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.4.2
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(titanic_train, main = "Missing values in train dataset")
missmap(titanic_test, main = "Missing values in test dataset")
sum(is.na(titanic_train$Age))
## [1] 177
sum(is.na(titanic_test$Age))
## [1] 86
There are missing values in Age column in both the training (177) and the testing (86) datasets. There are a few ways of handling missing values. I use the mice() from the mice package which draw missing values from a distribution specifically designed for each missing datapoint. Before imputing the Age missing values, I will check all the other features on the dataset since this key to drawing plausible Age values.
The passenge ID and ticket number are not informative features. I will exclude them from both the training and testing datasets. The passenger name is not very informative either, but their Titles (Mr. Miss, Mrs. …) might informative in predictive the age of the passengers and for building an accurate prediction model. I forked the following code from EasyD (https://github.com/EasyD/IntroToDataScience/blob/master/TitanicDataAnalysis_Video6.R) to extract the Titles of the passengers.
extract_Title <- function(name) {
name <- as.character(name)
if (length(grep("Miss.", name)) > 0) {
return ("Miss.")
} else if (length(grep("Master.", name)) > 0) {
return ("Master.")
} else if (length(grep("Mrs.", name)) > 0) {
return ("Mrs.")
} else if (length(grep("Mr.", name)) > 0) {
return ("Mr.")
} else {
return ("Other")
}
}
Titles <- NULL
for (i in 1:nrow(titanic_train)) {
Titles <- c(Titles, extract_Title(titanic_train[i,"Name"]))
}
titanic_train$Title <- as.factor(Titles)
Titles <- NULL
for (i in 1:nrow(titanic_test)) {
Titles <- c(Titles, extract_Title(titanic_test[i,"Name"]))
}
titanic_test$Title <- as.factor(Titles)
head(titanic_train)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## Parch Ticket Fare Cabin Embarked Title
## 1 0 A/5 21171 7.2500 S Mr.
## 2 0 PC 17599 71.2833 C85 C Mrs.
## 3 0 STON/O2. 3101282 7.9250 S Miss.
## 4 0 113803 53.1000 C123 S Mrs.
## 5 0 373450 8.0500 S Mr.
## 6 0 330877 8.4583 Q Mr.
head(titanic_test)
## PassengerId Pclass Name Sex
## 1 892 3 Kelly, Mr. James male
## 2 893 3 Wilkes, Mrs. James (Ellen Needs) female
## 3 894 2 Myles, Mr. Thomas Francis male
## 4 895 3 Wirz, Mr. Albert male
## 5 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female
## 6 897 3 Svensson, Mr. Johan Cervin male
## Age SibSp Parch Ticket Fare Cabin Embarked Title
## 1 34.5 0 0 330911 7.8292 Q Mr.
## 2 47.0 1 0 363272 7.0000 S Mrs.
## 3 62.0 0 0 240276 9.6875 Q Mr.
## 4 27.0 0 0 315154 8.6625 S Mr.
## 5 22.0 1 1 3101298 12.2875 S Mrs.
## 6 14.0 0 0 7538 9.2250 S Mr.
titanic_train <- titanic_train[c(-1,-4,-9)]
titanic_test <- titanic_test[c(-1,-3,-8)]
sum(is.na(titanic_train))
## [1] 177
head(titanic_train)
## Survived Pclass Sex Age SibSp Parch Fare Cabin Embarked Title
## 1 0 3 male 22 1 0 7.2500 S Mr.
## 2 1 1 female 38 1 0 71.2833 C85 C Mrs.
## 3 1 3 female 26 0 0 7.9250 S Miss.
## 4 1 1 female 35 1 0 53.1000 C123 S Mrs.
## 5 0 3 male 35 0 0 8.0500 S Mr.
## 6 0 3 male NA 0 0 8.4583 Q Mr.
head(titanic_test)
## Pclass Sex Age SibSp Parch Fare Cabin Embarked Title
## 1 3 male 34.5 0 0 7.8292 Q Mr.
## 2 3 female 47.0 1 0 7.0000 S Mrs.
## 3 2 male 62.0 0 0 9.6875 Q Mr.
## 4 3 male 27.0 0 0 8.6625 S Mr.
## 5 3 female 22.0 1 1 12.2875 S Mrs.
## 6 3 male 14.0 0 0 9.2250 S Mr.
Just like Title, Survived and PClass should also be factor variables
titanic_train$Survived <- factor(titanic_train$Survived, levels = c("0", "1"),
labels = c("No", "YES"))
titanic_train$Pclass<-as.factor(titanic_train$Pclass)
titanic_train$Pclass <- factor(titanic_train$Pclass, levels = c("1", "2","3"),
labels = c("first", "second", "third"))
titanic_test$Pclass<-as.factor(titanic_test$Pclass)
titanic_test$Pclass <- factor(titanic_test$Pclass, levels = c("1", "2","3"),
labels = c("first", "second", "third"))
table(titanic_train$Cabin)
##
## A10 A14 A16
## 687 1 1 1
## A19 A20 A23 A24
## 1 1 1 1
## A26 A31 A32 A34
## 1 1 1 1
## A36 A5 A6 A7
## 1 1 1 1
## B101 B102 B18 B19
## 1 1 2 1
## B20 B22 B28 B3
## 2 2 2 1
## B30 B35 B37 B38
## 1 2 1 1
## B39 B4 B41 B42
## 1 1 1 1
## B49 B5 B50 B51 B53 B55
## 2 2 1 2
## B57 B59 B63 B66 B58 B60 B69 B71
## 2 2 1 1
## B73 B77 B78 B79
## 1 2 1 1
## B80 B82 B84 B86 B94
## 1 1 1 1
## B96 B98 C101 C103 C104
## 4 1 1 1
## C106 C110 C111 C118
## 1 1 1 1
## C123 C124 C125 C126
## 2 2 2 2
## C128 C148 C2 C22 C26
## 1 1 2 3
## C23 C25 C27 C30 C32 C45
## 4 1 1 1
## C46 C47 C49 C50
## 1 1 1 1
## C52 C54 C62 C64 C65
## 2 1 1 2
## C68 C7 C70 C78
## 2 1 1 2
## C82 C83 C85 C86
## 1 2 1 1
## C87 C90 C91 C92
## 1 1 1 2
## C93 C95 C99 D
## 2 1 1 3
## D10 D12 D11 D15 D17
## 1 1 1 2
## D19 D20 D21 D26
## 1 2 1 2
## D28 D30 D33 D35
## 1 1 2 2
## D36 D37 D45 D46
## 2 1 1 1
## D47 D48 D49 D50
## 1 1 1 1
## D56 D6 D7 D9
## 1 1 1 1
## E10 E101 E12 E121
## 1 3 1 2
## E17 E24 E25 E31
## 1 2 2 1
## E33 E34 E36 E38
## 2 1 1 1
## E40 E44 E46 E49
## 1 2 1 1
## E50 E58 E63 E67
## 1 1 1 2
## E68 E77 E8 F E69
## 1 1 2 1
## F G63 F G73 F2 F33
## 1 2 3 3
## F38 F4 G6 T
## 1 2 4 1
There are 687 passengers without an assigned Cabin label in the training data set. The other passengers have Cabin labels from A(number) through G(number). I will reduce the labels to their respective letters excluding the numbers. Passengers without Cabin number will serve as a standalone category.
titanic_train$Cabin <- as.character(titanic_train$Cabin)
a<-transform(titanic_train, Cabin = replace(Cabin, startsWith(Cabin, "A"), "A"))
b<-transform(a, Cabin = replace(Cabin, startsWith(Cabin, "B"), "B"))
c<-transform(b, Cabin = replace(Cabin, startsWith(Cabin, "C"), "C"))
d<-transform(c, Cabin = replace(Cabin, startsWith(Cabin, "D"), "D"))
e<-transform(d, Cabin = replace(Cabin, startsWith(Cabin, "E"), "E"))
f<-transform(e, Cabin = replace(Cabin, startsWith(Cabin, "F"), "F"))
g<-transform(f, Cabin = replace(Cabin, startsWith(Cabin, "G"), "G"))
g$Cabin<-as.factor(g$Cabin)
train <- g
head(train)
## Survived Pclass Sex Age SibSp Parch Fare Cabin Embarked Title
## 1 No third male 22 1 0 7.2500 S Mr.
## 2 YES first female 38 1 0 71.2833 C C Mrs.
## 3 YES third female 26 0 0 7.9250 S Miss.
## 4 YES first female 35 1 0 53.1000 C S Mrs.
## 5 No third male 35 0 0 8.0500 S Mr.
## 6 No third male NA 0 0 8.4583 Q Mr.
table(train$Cabin)
##
## A B C D E F G T
## 687 15 47 59 33 32 13 4 1
titanic_test$Cabin <- as.character(titanic_test$Cabin)
a<-transform(titanic_test, Cabin = replace(Cabin, startsWith(Cabin, "A"), "A"))
b<-transform(a, Cabin = replace(Cabin, startsWith(Cabin, "B"), "B"))
c<-transform(b, Cabin = replace(Cabin, startsWith(Cabin, "C"), "C"))
d<-transform(c, Cabin = replace(Cabin, startsWith(Cabin, "D"), "D"))
e<-transform(d, Cabin = replace(Cabin, startsWith(Cabin, "E"), "E"))
f<-transform(e, Cabin = replace(Cabin, startsWith(Cabin, "F"), "F"))
g<-transform(f, Cabin = replace(Cabin, startsWith(Cabin, "G"), "G"))
g$Cabin<-as.factor(g$Cabin)
test<-g
head(test)
## Pclass Sex Age SibSp Parch Fare Cabin Embarked Title
## 1 third male 34.5 0 0 7.8292 Q Mr.
## 2 third female 47.0 1 0 7.0000 S Mrs.
## 3 second male 62.0 0 0 9.6875 Q Mr.
## 4 third male 27.0 0 0 8.6625 S Mr.
## 5 third female 22.0 1 1 12.2875 S Mrs.
## 6 third male 14.0 0 0 9.2250 S Mr.
Now that we have catered for all the other variables, let’s fix the missing values in the Age column. I will use the mice() from the mice package.
I forked these junks of codes from Megan L. Risdal (https://rstudio-pubs-static.s3.amazonaws.com/202517_d1c1e3e9101d49b1a0135a422a9b3748.html) that does a good job in predicting the missing age values.
library(mice)
## Warning: package 'mice' was built under R version 3.4.2
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.2
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
factor_vars <- c('Pclass','Sex','Embarked', 'Title')
train[factor_vars] <- lapply(train[factor_vars], function(x) as.factor(x))
test[factor_vars] <- lapply(test[factor_vars], function(x) as.factor(x))
# Perform mice imputation, excluding certain less-than-useful variables:
mice_mod <- mice(train[, !names(train) %in% c('Ticket','Cabin','Survived')], method='rf')
##
## iter imp variable
## 1 1 Age
## 1 2 Age
## 1 3 Age
## 1 4 Age
## 1 5 Age
## 2 1 Age
## 2 2 Age
## 2 3 Age
## 2 4 Age
## 2 5 Age
## 3 1 Age
## 3 2 Age
## 3 3 Age
## 3 4 Age
## 3 5 Age
## 4 1 Age
## 4 2 Age
## 4 3 Age
## 4 4 Age
## 4 5 Age
## 5 1 Age
## 5 2 Age
## 5 3 Age
## 5 4 Age
## 5 5 Age
mice_mod1 <- mice(test[, !names(test) %in% c('Ticket','Cabin','Survived')], method='rf')
##
## iter imp variable
## 1 1 Age Fare
## 1 2 Age Fare
## 1 3 Age Fare
## 1 4 Age Fare
## 1 5 Age Fare
## 2 1 Age Fare
## 2 2 Age Fare
## 2 3 Age Fare
## 2 4 Age Fare
## 2 5 Age Fare
## 3 1 Age Fare
## 3 2 Age Fare
## 3 3 Age Fare
## 3 4 Age Fare
## 3 5 Age Fare
## 4 1 Age Fare
## 4 2 Age Fare
## 4 3 Age Fare
## 4 4 Age Fare
## 4 5 Age Fare
## 5 1 Age Fare
## 5 2 Age Fare
## 5 3 Age Fare
## 5 4 Age Fare
## 5 5 Age Fare
mice_output <- complete(mice_mod)
mice_output1 <- complete(mice_mod1)
par(mfrow=c(1,2))
hist(train$Age, freq=F, main='Age: Original Data',
col='darkgreen', ylim=c(0,0.04))
hist(mice_output$Age, freq=F, main='Age: MICE Output',
col='lightgreen', ylim=c(0,0.04))
# Replace Age variable from the mice model.
train$Age <- mice_output$Age
test$Age <- mice_output1$Age
# Show new number of missing Age values
sum(is.na(train$Age))
## [1] 0
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
ggplot(train, aes(x = Survived)) +
theme_bw() +
geom_bar() +
labs(y = "Passenger Count",
Title = "Titanic Survival Rate")
table(train$Survived)
##
## No YES
## 549 342
549 people died while 342 survived
ggplot(train, aes(x = Sex, fill = Survived)) +
theme_bw() +
geom_bar() +
labs(y = "Passenger Count",
Title = "Titanic Survival Rates by Sex")
Relatively more female passengers survived.
ggplot(train, aes(x = Age, fill = Survived)) +
theme_bw() +
geom_histogram(binwidth = 5) +
labs(y = "Passenger Count",
x = "Age (binwidth = 5)",
Title = "Titanic Survival Rates by Age")
Majority of the passengers that died where between the ages of 20 and 40.
ggplot(train, aes(x = Title, fill = Survived)) +
theme_bw() +
geom_bar() +
labs(y = "Passenger Count",
Title = "Titanic Survival Rates by Passenger Title")
Majority of the passengers that died had a Mr. title. Only about 1 in every 5 passenger in this group survived.
ggplot(train, aes(x = Cabin, fill = Survived)) +
theme_bw() +
geom_bar() +
labs(y = "Passenger Count",
Title = "Titanic Survival Rates by Cabin Occupied")
The Cabin of the majority of passengers is not mentioned. This group of passengers disportionately had a lower survival rate compared to those with known Cabin.
ggplot(train, aes(x = Embarked, fill = Survived)) +
theme_bw() +
geom_bar() +
labs(y = "Passenger Count",
Title = "Titanic Survival Rates by where the passengers Embarked")
Kaggle provided two datasets, train and test (with unkown survival) data set. I will split the training data into 2 (one for building the model and the other for training the model)
shuffled_train <- train[sample(891), ]
train_indices <- 1:round(0.85 * 891)
train_train <- shuffled_train[train_indices, ]
test_indices <- (round(0.85 * 891) + 1):891
test_train <- shuffled_train[test_indices, ]
model <- glm (Survived ~ . , data = train_train, family = binomial(link = "logit"), na.action = na.pass)
summary(model)
##
## Call:
## glm(formula = Survived ~ ., family = binomial(link = "logit"),
## data = train_train, na.action = na.pass)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5446 -0.5405 -0.3731 0.5240 2.5421
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.249e+01 1.654e+03 0.020 0.984330
## Pclasssecond -5.362e-01 4.913e-01 -1.091 0.275112
## Pclassthird -1.441e+00 4.906e-01 -2.937 0.003314 **
## Sexmale -1.613e+01 7.866e+02 -0.021 0.983636
## Age -2.908e-02 9.525e-03 -3.053 0.002268 **
## SibSp -5.816e-01 1.444e-01 -4.029 5.61e-05 ***
## Parch -4.074e-01 1.497e-01 -2.721 0.006503 **
## Fare 4.433e-03 3.057e-03 1.450 0.147016
## CabinA 4.905e-01 7.465e-01 0.657 0.511178
## CabinB 6.557e-01 6.443e-01 1.018 0.308796
## CabinC 2.584e-01 5.702e-01 0.453 0.650442
## CabinD 1.578e+00 6.490e-01 2.432 0.015021 *
## CabinE 1.728e+00 6.663e-01 2.593 0.009509 **
## CabinF 8.336e-01 9.316e-01 0.895 0.370924
## CabinG -5.485e-01 1.039e+00 -0.528 0.597423
## CabinT -1.425e+01 1.455e+03 -0.010 0.992186
## EmbarkedC -1.258e+01 1.455e+03 -0.009 0.993106
## EmbarkedQ -1.282e+01 1.455e+03 -0.009 0.992972
## EmbarkedS -1.296e+01 1.455e+03 -0.009 0.992895
## TitleMiss. -1.683e+01 7.866e+02 -0.021 0.982927
## TitleMr. -3.561e+00 6.202e-01 -5.742 9.37e-09 ***
## TitleMrs. -1.592e+01 7.866e+02 -0.020 0.983857
## TitleOther -3.602e+00 9.403e-01 -3.830 0.000128 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1001.73 on 756 degrees of freedom
## Residual deviance: 605.75 on 734 degrees of freedom
## AIC: 651.75
##
## Number of Fisher Scoring iterations: 14
anova(model, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survived
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 756 1001.73
## Pclass 2 78.419 754 923.32 < 2.2e-16 ***
## Sex 1 212.532 753 710.78 < 2.2e-16 ***
## Age 1 19.010 752 691.77 1.301e-05 ***
## SibSp 1 15.292 751 676.48 9.210e-05 ***
## Parch 1 0.652 750 675.83 0.41948
## Fare 1 1.602 749 674.23 0.20563
## Cabin 8 17.840 741 656.39 0.02246 *
## Embarked 3 2.650 738 653.74 0.44878
## Title 4 47.987 734 605.75 9.496e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Gender > PClass > Title > Age > Sibling present had the most effect on determining weather or not a pessanger on board the Titanic survived.
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.2
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.2
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predict <- predict(model, test_train, type="response")
table(test_train$Survived, predict > 0.5)
##
## FALSE TRUE
## No 66 10
## YES 12 46
pr <- prediction(predict, test_train$Survived)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8920145
summary(table(predict, predict > 0.5))
## Number of cases in table: 134
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 134, df = 130, p-value = 0.3871
## Chi-squared approximation may be incorrect
titanic_test <- read.csv("test.csv", stringsAsFactors = TRUE)
prediction <- predict(model, test)
# Create a dataframe with the ID of the passengers and whether or not they survived.
test_survival_predict <- data.frame(PassengerID = titanic_test$PassengerId, Survived = prediction)
# Change Survived to a binary factor variable
test_survival_predict$Survived <- ifelse(test_survival_predict$Survived >= 0.5, 1, 0)
test_survival_predict$Survived<- factor(test_survival_predict$Survived, levels = c("0", "1"),
labels = c("no", "yes"))
head(test_survival_predict, n = 10)
## PassengerID Survived
## 1 892 no
## 2 893 no
## 3 894 no
## 4 895 no
## 5 896 yes
## 6 897 no
## 7 898 yes
## 8 899 no
## 9 900 yes
## 10 901 no
table(test_survival_predict$Survived)
##
## no yes
## 277 140