Titanic: Machine Learning from Disaster (https://www.kaggle.com/c/titanic)
# set the working directory
setwd("~/dev/titanic_kaggle/")
# read data files
test.data <- read.csv("test.csv", na.strings = c("NA", ""))
train.data <- read.csv("train.csv", na.strings = c("NA", ""))
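A quick look at the parsed column types motivates the conversions below; for example:
str(train.data)  # Survived and Pclass are read in as int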
# convert integer columns to factors
train.data$Survived = factor(train.data$Survived)
train.data$Pclass = factor(train.data$Pclass)
# test.csv has no Survived column, so only Pclass would need the same conversion:
# test.data$Pclass = factor(test.data$Pclass)
### DETECTING MISSING VALUES ###
sum(is.na(train.data$Embarked))
## [1] 2
sapply(train.data, function(df) {
  sum(is.na(df)) / length(df)  # proportion of NAs per column
})
## PassengerId Survived Pclass Name Sex Age
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.198653199
## SibSp Parch Ticket Fare Cabin Embarked
## 0.000000000 0.000000000 0.000000000 0.000000000 0.771043771 0.002244669
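As a cross-check, the same per-column NA proportions can be computed more compactly with colMeans; a minimal equivalent sketch:
colMeans(is.na(train.data))  # proportion of missing values per column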
library(Amelia)
## Warning: package 'Amelia' was built under R version 3.2.5
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(train.data, main="Missing Map")
# AmeliaView() launches the interactive Amelia GUI
### IMPUTING MISSING VALUES ###
table(train.data$Embarked, useNA = "always")
##
## C Q S <NA>
## 168 77 644 2
# replace the NA values with 'S', the most common port
train.data$Embarked[which(is.na(train.data$Embarked))] = 'S'  # using which() to subset
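Rather than hard-coding 'S', the most frequent level can also be looked up programmatically; a small sketch of the same imputation:
most.common.port = names(which.max(table(train.data$Embarked)))  # "S" here
train.data$Embarked[is.na(train.data$Embarked)] = most.common.port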
# build a table of salutations from the Name field using strsplit and grep
train.data$Name = as.character(train.data$Name)
table_words = table(unlist(strsplit(train.data$Name, "\\s+")))
sort(table_words[grep('\\.', names(table_words))], decreasing = TRUE)
##
## Mr. Miss. Mrs. Master. Dr. Rev. Col.
## 517 182 125 40 7 6 2
## Major. Mlle. Capt. Countess. Don. Jonkheer. L.
## 2 2 1 1 1 1 1
## Lady. Mme. Ms. Sir.
## 1 1 1 1
# find missing values
library(stringr)
## Warning: package 'stringr' was built under R version 3.2.5
tb = cbind(train.data$Age, str_match(train.data$Name, "[a-zA-Z]+\\."))
table(tb[is.na(tb[, 1]), 2])  # titles of passengers with missing Age
##
## Dr. Master. Miss. Mr. Mrs.
## 1 4 36 119 17
# impute mean value for missing ages
mean.mr = mean(train.data$Age[grepl(" Mr\\.", train.data$Name) & !is.na(train.data$Age)])
mean.mrs = mean(train.data$Age[grepl(" Mrs\\.", train.data$Name) & !is.na(train.data$Age)])
mean.dr = mean(train.data$Age[grepl(" Dr\\.", train.data$Name) & !is.na(train.data$Age)])
mean.miss = mean(train.data$Age[grepl(" Miss\\.", train.data$Name) & !is.na(train.data$Age)])
mean.master = mean(train.data$Age[grepl(" Master\\.", train.data$Name) & !is.na(train.data$Age)])
# fill each missing Age with the mean age for that title
train.data$Age[grepl(" Mr\\.", train.data$Name) & is.na(train.data$Age)] = mean.mr
train.data$Age[grepl(" Mrs\\.", train.data$Name) & is.na(train.data$Age)] = mean.mrs
train.data$Age[grepl(" Dr\\.", train.data$Name) & is.na(train.data$Age)] = mean.dr
train.data$Age[grepl(" Miss\\.", train.data$Name) & is.na(train.data$Age)] = mean.miss
train.data$Age[grepl(" Master\\.", train.data$Name) & is.na(train.data$Age)] = mean.master
### DATA VISUALIZATION ###
barplot(table(train.data$Survived), main="Passenger Survival", names = c("Perished", "Survived"))
barplot(table(train.data$Pclass), main = "Passenger Class", names = c("first", "second", "third"))
barplot(table(train.data$Sex), main = "Passenger Gender")
hist(train.data$Age, main = "Passenger Age", xlab = "Age")
barplot(table(train.data$SibSp), main = "Passenger Siblings")
barplot(table(train.data$Parch), main = "Passenger Parch")
hist(train.data$Fare, main = "Passenger Fare", xlab = "Fare")
barplot(table(train.data$Embarked), main = "Port of Embarkation")
counts = table( train.data$Survived, train.data$Sex )
counts
##
## female male
## 0 81 468
## 1 233 109
barplot(counts, col = c("darkblue", "red"), legend = c("Perished", "Survived"), main = "Passenger Survival by Sex")
# does Pclass affect survival rate? YUP!
counts = table( train.data$Survived, train.data$Pclass)
barplot(counts, col = c("darkblue", "red"), legend = c("Perished", "Survived"), main = "Passenger Survival by Class")
# Gender Composition of Class
counts = table( train.data$Sex, train.data$Pclass)
barplot(counts, col = c("darkblue", "red"), legend = rownames(counts), main = "Passenger Gender by Class")
# What does age distribution look like? - Age Histogram
hist(train.data$Age[which(train.data$Survived == "0")], main = "Passenger Age Histogram",
     xlab = "Age", ylab = "Count", col = "blue", breaks = seq(0, 80, by = 2))
hist(train.data$Age[which(train.data$Survived == "1")], col = "red", add = TRUE,
     breaks = seq(0, 80, by = 2))
# what's the relationship between age and survival rate?
boxplot(train.data$Age ~ train.data$Survived , main = "Passenger Survival by Age" , xlab = "Survived", ylab = "Age" )
# categorize people into different age groups
train.child = train.data$Survived[train.data$Age < 13]
paste( 'child survival rate = ', length(train.child[which(train.child == 1)]) / length(train.child))
## [1] "child survival rate = 0.575342465753425"
train.youth = train.data$Survived[train.data$Age >= 15 & train.data$Age <25 ]
paste('youth survival rate = ', length(train.youth[which(train.youth == 1 )]) / length(train.youth))
## [1] "youth survival rate = 0.402542372881356"
train.adult = train.data$Survived[train.data$Age >= 25 & train.data$Age <65 ]
paste('adult survival rate = ', length(train.adult[which(train.adult == 1 )]) / length(train.adult))
## [1] "adult survival rate = 0.354092526690391"
train.senior = train.data$Survived[train.data$Age >= 65 ]
paste('senior survival rate = ', length(train.senior[which(train.senior == 1 )]) / length(train.senior))
## [1] "senior survival rate = 0.0909090909090909"
library(vcd)
## Warning: package 'vcd' was built under R version 3.2.5
## Loading required package: grid
mosaicplot(train.data$Pclass ~ train.data$Survived,
           main = "Passenger Survival by Class", color = TRUE,
           xlab = "Pclass", ylab = "Survived")
# split data into a 70% training set and 30% test set with a fixed seed
split.data = function(data, p = 0.7, s = 666) {
  set.seed(s)
  index = sample(1:dim(data)[1])
  train = data[index[1:floor(dim(data)[1] * p)], ]
  # start right after the training cut so no row is silently dropped
  test = data[index[(floor(dim(data)[1] * p) + 1):dim(data)[1]], ]
  return(list(train = train, test = test))
}
allset = split.data(train.data)
trainset = allset$train
testset = allset$test
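For a split that preserves the class balance of Survived, createDataPartition from the caret package (also used later for confusionMatrix) is a common alternative; a sketch, with trainset2/testset2 as illustrative names:
library(caret)
set.seed(666)
in.train = createDataPartition(train.data$Survived, p = 0.7, list = FALSE)
trainset2 = train.data[in.train, ]   # stratified 70% sample
testset2 = train.data[-in.train, ]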
library(party)
## Warning: package 'party' was built under R version 3.2.5
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 3.2.5
## Loading required package: modeltools
## Warning: package 'modeltools' was built under R version 3.2.5
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.2.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.2.5
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.2.5
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
train.ctree = ctree(Survived ~ Pclass + Sex + Age + SibSp + Fare + Parch + Embarked, data = trainset)
train.ctree
##
## Conditional inference tree with 7 terminal nodes
##
## Response: Survived
## Inputs: Pclass, Sex, Age, SibSp, Fare, Parch, Embarked
## Number of observations: 623
##
## 1) Sex == {male}; criterion = 1, statistic = 173.672
## 2) Pclass == {2, 3}; criterion = 1, statistic = 30.951
## 3) Age <= 9; criterion = 0.993, statistic = 10.923
## 4) SibSp <= 1; criterion = 0.999, statistic = 14.856
## 5)* weights = 10
## 4) SibSp > 1
## 6)* weights = 13
## 3) Age > 9
## 7)* weights = 280
## 2) Pclass == {1}
## 8)* weights = 87
## 1) Sex == {female}
## 9) Pclass == {1, 2}; criterion = 1, statistic = 59.504
## 10)* weights = 125
## 9) Pclass == {3}
## 11) Fare <= 23.25; criterion = 0.997, statistic = 12.456
## 12)* weights = 85
## 11) Fare > 23.25
## 13)* weights = 23
windows.options(width = 20, height = 10)  # Windows-only graphics device sizing
plot(train.ctree, main = "Conditional inference tree of Titanic Dataset")
# Now use svm to generate a prediction model using the same dataset as above
library('e1071')
## Warning: package 'e1071' was built under R version 3.2.5
svm.model = svm(Survived ~ Pclass + Sex + Age + SibSp + Fare + Parch + Embarked, data = trainset, probability=TRUE)
svm.model
##
## Call:
## svm(formula = Survived ~ Pclass + Sex + Age + SibSp + Fare +
## Parch + Embarked, data = trainset, probability = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.1
##
## Number of Support Vectors: 324
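The cost and gamma shown are e1071 defaults; tune.svm can grid-search better values via cross-validation. A sketch with illustrative parameter ranges:
tuned = tune.svm(Survived ~ Pclass + Sex + Age + SibSp + Fare + Parch + Embarked,
                 data = trainset, gamma = 10^(-2:0), cost = 10^(0:2))
summary(tuned)                       # cross-validated error for each gamma/cost pair
svm.model.tuned = tuned$best.model   # refit with the winning parameters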
# Validate the power of the prediction model using a confusion matrix
ctree.predict = predict(train.ctree, testset)
library('caret')
## Warning: package 'caret' was built under R version 3.2.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.5
# test the accuracy of the ctree model with a confusion matrix via the caret package
confusionMatrix(ctree.predict, testset$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 160 23
## 1 16 68
##
## Accuracy : 0.8539
## 95% CI : (0.8058, 0.894)
## No Information Rate : 0.6592
## P-Value [Acc > NIR] : 5.347e-13
##
## Kappa : 0.6688
## Mcnemar's Test P-Value : 0.3367
##
## Sensitivity : 0.9091
## Specificity : 0.7473
## Pos Pred Value : 0.8743
## Neg Pred Value : 0.8095
## Prevalence : 0.6592
## Detection Rate : 0.5993
## Detection Prevalence : 0.6854
## Balanced Accuracy : 0.8282
##
## 'Positive' Class : 0
##
# Validate the svm prediction model the same way
svm.predict = predict(svm.model, testset)
# test the accuracy of the svm model with a confusion matrix via the caret package
confusionMatrix(svm.predict, testset$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 161 24
## 1 15 67
##
## Accuracy : 0.8539
## 95% CI : (0.8058, 0.894)
## No Information Rate : 0.6592
## P-Value [Acc > NIR] : 5.347e-13
##
## Kappa : 0.667
## Mcnemar's Test P-Value : 0.2002
##
## Sensitivity : 0.9148
## Specificity : 0.7363
## Pos Pred Value : 0.8703
## Neg Pred Value : 0.8171
## Prevalence : 0.6592
## Detection Rate : 0.6030
## Detection Prevalence : 0.6929
## Balanced Accuracy : 0.8255
##
## 'Positive' Class : 0
##
train.ctree.pred = predict(train.ctree, testset)
# treeresponse() returns per-row class probabilities as c(P(0), P(1));
# keep every other element to get P(0), then 1 - P(0) = P(survived)
train.ctree.prob = 1 - unlist(treeresponse(train.ctree, testset), use.names = F)[seq(1, nrow(testset) * 2, 2)]
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.2.5
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.5
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
train.ctree.prob.rocr = prediction(train.ctree.prob, testset$Survived)
train.ctree.perf = performance(train.ctree.prob.rocr, "tpr", "fpr")
train.ctree.auc.perf = performance(train.ctree.prob.rocr, measure = "auc", x.measure = "cutoff")
plot(train.ctree.perf, col=2, colorize=T, main=paste("AUC:", train.ctree.auc.perf@y.values))
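Since the SVM was fit with probability = TRUE, its ROC curve can be overlaid on the same axes for comparison; a sketch:
svm.pred.prob = predict(svm.model, testset, probability = TRUE)
svm.prob = attr(svm.pred.prob, "probabilities")[, "1"]  # P(Survived = 1)
svm.prob.rocr = prediction(svm.prob, testset$Survived)
svm.perf = performance(svm.prob.rocr, "tpr", "fpr")
plot(svm.perf, col = 4, add = TRUE)  # overlay on the ctree ROC curve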