Titanic: Machine Learning from Disaster (https://www.kaggle.com/c/titanic)

# set the working directory
setwd("~/dev/titanic_kaggle/")

# read the data files, treating empty strings as NA
test.data <- read.csv("test.csv", na.strings = c("NA", ""))
train.data <- read.csv("train.csv", na.strings = c("NA", ""))
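
A quick sanity check on the import (a sketch; the Kaggle train set should be 891 rows by 12 columns):

dim(train.data)   # expect: 891 12
str(train.data)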

# convert integer codes to factors
train.data$Survived = factor(train.data$Survived)
train.data$Pclass = factor(train.data$Pclass)
# test.data$Survived = factor(test.data$Survived)  # test.csv has no Survived column
# test.data$Pclass = factor(test.data$Pclass)


# DETECTING MISSING VALUES
sum(is.na(train.data$Embarked))
## [1] 2
# proportion of missing values in each column
sapply(train.data, function(col) {
    sum(is.na(col)) / length(col)
})
## PassengerId    Survived      Pclass        Name         Sex         Age 
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.198653199 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
## 0.000000000 0.000000000 0.000000000 0.000000000 0.771043771 0.002244669
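
Since is.na() on a data frame returns a logical matrix, the same proportions fall out of a one-liner:

colMeans(is.na(train.data))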

library(Amelia)
## Warning: package 'Amelia' was built under R version 3.2.5
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(train.data, main="Missing Map")

# AmeliaView()  # optional interactive GUI for Amelia
### IMPUTING MISSING VALUES ###
table(train.data$Embarked, useNA = "always")
## 
##    C    Q    S <NA> 
##  168   77  644    2
# Replace the NA values with 'S', the most common port of embarkation
train.data$Embarked[which(is.na(train.data$Embarked))] = 'S'  # using which() to subset
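
An equivalent, data-driven way to pick the replacement value (a sketch; most.common.port is an illustrative name):

most.common.port = names(which.max(table(train.data$Embarked)))
train.data$Embarked[is.na(train.data$Embarked)] = most.common.port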
# get table of salutations from names using grep
train.data$Name = as.character(train.data$Name) 
table_words = table(unlist(strsplit(train.data$Name, "\\s+")))
sort(table_words[grep('\\.', names(table_words))], decreasing = TRUE)
## 
##       Mr.     Miss.      Mrs.   Master.       Dr.      Rev.      Col. 
##       517       182       125        40         7         6         2 
##    Major.     Mlle.     Capt. Countess.      Don. Jonkheer.        L. 
##         2         2         1         1         1         1         1 
##     Lady.      Mme.       Ms.      Sir. 
##         1         1         1         1
# count missing Age values by title
library(stringr)
## Warning: package 'stringr' was built under R version 3.2.5
tb = cbind(train.data$Age, str_match(train.data$Name, "[a-zA-Z]+\\."))
table(tb[is.na(tb[,1]),2])
## 
##     Dr. Master.   Miss.     Mr.    Mrs. 
##       1       4      36     119      17
# compute the mean age for each common title, ignoring missing values
mean.mr = mean(train.data$Age[grepl(" Mr\\.", train.data$Name) & !is.na(train.data$Age)])
mean.mrs = mean(train.data$Age[grepl(" Mrs\\.", train.data$Name) & !is.na(train.data$Age)])
mean.dr = mean(train.data$Age[grepl(" Dr\\.", train.data$Name) & !is.na(train.data$Age)])
mean.miss = mean(train.data$Age[grepl(" Miss\\.", train.data$Name) & !is.na(train.data$Age)])
mean.master = mean(train.data$Age[grepl(" Master\\.", train.data$Name) & !is.na(train.data$Age)])

# assign each missing Age the mean age for its title
train.data$Age[grepl(" Mr\\.", train.data$Name) & is.na(train.data$Age)] = mean.mr
train.data$Age[grepl(" Mrs\\.", train.data$Name) & is.na(train.data$Age)] = mean.mrs
train.data$Age[grepl(" Dr\\.", train.data$Name) & is.na(train.data$Age)] = mean.dr
train.data$Age[grepl(" Miss\\.", train.data$Name) & is.na(train.data$Age)] = mean.miss
train.data$Age[grepl(" Master\\.", train.data$Name) & is.na(train.data$Age)] = mean.master
### DATA VISUALIZATION ###
barplot(table(train.data$Survived), main="Passenger Survival", names = c("Perished", "Survived"))

barplot(table(train.data$Pclass), main = "Passenger Class", names = c("first", "second", "third"))

barplot(table(train.data$Sex), main = "Passenger Gender")

hist(train.data$Age, main = "Passenger Age", xlab = "Age")

barplot(table(train.data$SibSp), main = "Passenger Siblings/Spouses (SibSp)")

barplot(table(train.data$Parch), main = "Passenger Parents/Children (Parch)")

hist(train.data$Fare, main = "Passenger Fare", xlab = "Fare")

barplot(table(train.data$Embarked), main = "Port of Embarkation")

counts = table( train.data$Survived, train.data$Sex )
counts
##    
##     female male
##   0     81  468
##   1    233  109
barplot(counts, col = c("darkblue", "red"), legend = c("Perished", "Survived"), main = "Passenger Survival by Sex")
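
The within-sex survival rates make the gap explicit (column-wise proportions of the counts table above):

prop.table(counts, margin = 2)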

# does Pclass affect survival rate? YUP!
counts = table( train.data$Survived, train.data$Pclass)
barplot(counts, col = c("darkblue", "red"), legend = c("Perished", "Survived"), main = "Passenger Survival by Class")

# Gender Composition of Class
counts = table( train.data$Sex, train.data$Pclass)
barplot(counts, col = c("darkblue", "red"), legend = rownames(counts), main = "Passenger Gender by Class")

# What does the age distribution look like? Overlaid age histograms
hist(train.data$Age[which(train.data$Survived == "0")], main = "Passenger Age Histogram", xlab = "Age", ylab = "Count", col = "blue", breaks = seq(0, 80, by = 2))  # perished
hist(train.data$Age[which(train.data$Survived == "1")], col = "red", add = TRUE, breaks = seq(0, 80, by = 2))  # survived, overlaid

# what's the relationship between age and survival rate?
boxplot(train.data$Age ~ train.data$Survived , main = "Passenger Survival by Age" , xlab = "Survived", ylab = "Age" )

# categorize passengers into age groups (note: ages in [13, 15) fall outside every band)
train.child = train.data$Survived[train.data$Age < 13]
paste( 'child survival rate = ', length(train.child[which(train.child == 1)]) / length(train.child))
## [1] "child survival rate =  0.575342465753425"
train.youth = train.data$Survived[train.data$Age >= 15 & train.data$Age <25 ]
paste('youth survival rate = ', length(train.youth[which(train.youth == 1 )]) / length(train.youth))
## [1] "youth survival rate =  0.402542372881356"
train.adult = train.data$Survived[train.data$Age >= 25 & train.data$Age <65 ]
paste('adult survival rate = ', length(train.adult[which(train.adult == 1 )]) / length(train.adult)) 
## [1] "adult survival rate =  0.354092526690391"
train.senior = train.data$Survived[train.data$Age >= 65 ]
paste('senior survival rate = ', length(train.senior[which(train.senior == 1 )]) / length(train.senior)) 
## [1] "senior survival rate =  0.0909090909090909"
library(vcd)
## Warning: package 'vcd' was built under R version 3.2.5
## Loading required package: grid
mosaicplot(train.data$Pclass ~ train.data$Survived, 
  main = "Passenger Survival Class", color = TRUE,
  xlab = "Pclass", ylab = "Survived")

# split data into train/test sets with proportion p and a fixed seed s
split.data = function(data, p = 0.7, s = 666) {
  set.seed(s)
  index = sample(1:dim(data)[1])
  train = data[index[1:floor(dim(data)[1] * p)], ]
  # floor + 1 (not ceiling + 1) so the boundary row is not silently dropped
  test = data[index[(floor(dim(data)[1] * p) + 1):dim(data)[1]], ]
  return(list(train = train, test = test))
}

allset = split.data(train.data)
trainset = allset$train
testset = allset$test
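
caret's createDataPartition offers a stratified alternative that preserves the Survived class ratio in both halves (a sketch; caret is loaded later in this analysis anyway):

library(caret)
strat.index = createDataPartition(train.data$Survived, p = 0.7, list = FALSE)
strat.train = train.data[strat.index, ]
strat.test  = train.data[-strat.index, ]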
library(party)
## Warning: package 'party' was built under R version 3.2.5
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 3.2.5
## Loading required package: modeltools
## Warning: package 'modeltools' was built under R version 3.2.5
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.2.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.2.5
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.2.5
## 
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
## 
##     boundary
train.ctree = ctree(Survived ~ Pclass + Sex + Age + SibSp + Fare + Parch + Embarked, data = trainset)
train.ctree
## 
##   Conditional inference tree with 7 terminal nodes
## 
## Response:  Survived 
## Inputs:  Pclass, Sex, Age, SibSp, Fare, Parch, Embarked 
## Number of observations:  623 
## 
## 1) Sex == {male}; criterion = 1, statistic = 173.672
##   2) Pclass == {2, 3}; criterion = 1, statistic = 30.951
##     3) Age <= 9; criterion = 0.993, statistic = 10.923
##       4) SibSp <= 1; criterion = 0.999, statistic = 14.856
##         5)*  weights = 10 
##       4) SibSp > 1
##         6)*  weights = 13 
##     3) Age > 9
##       7)*  weights = 280 
##   2) Pclass == {1}
##     8)*  weights = 87 
## 1) Sex == {female}
##   9) Pclass == {1, 2}; criterion = 1, statistic = 59.504
##     10)*  weights = 125 
##   9) Pclass == {3}
##     11) Fare <= 23.25; criterion = 0.997, statistic = 12.456
##       12)*  weights = 85 
##     11) Fare > 23.25
##       13)*  weights = 23
windows.options(width = 20, height = 10)  # Windows-only: sets default device size
plot(train.ctree, main = "Conditional inference tree of Titanic Dataset")

# Now use svm to generate a prediction model using the same dataset as above
library('e1071')
## Warning: package 'e1071' was built under R version 3.2.5
svm.model = svm(Survived ~ Pclass + Sex + Age + SibSp + Fare + Parch + Embarked, data = trainset, probability=TRUE)
svm.model
## 
## Call:
## svm(formula = Survived ~ Pclass + Sex + Age + SibSp + Fare + 
##     Parch + Embarked, data = trainset, probability = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.1 
## 
## Number of Support Vectors:  324
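
The cost and gamma above are e1071's defaults; a grid search could be layered on top with tune.svm (a sketch; the grids below are illustrative, not tuned values from this analysis):

tuned = tune.svm(Survived ~ Pclass + Sex + Age + SibSp + Fare + Parch + Embarked,
                 data = trainset, gamma = 10^(-2:0), cost = 10^(0:2))
summary(tuned)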
# Validate the power of the prediction model using a confusion matrix
ctree.predict = predict(train.ctree, testset)
library('caret')
## Warning: package 'caret' was built under R version 3.2.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.5
# test the accuracy of the ctree model using a confusion matrix via the caret package
confusionMatrix(ctree.predict, testset$Survived)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 160  23
##          1  16  68
##                                          
##                Accuracy : 0.8539         
##                  95% CI : (0.8058, 0.894)
##     No Information Rate : 0.6592         
##     P-Value [Acc > NIR] : 5.347e-13      
##                                          
##                   Kappa : 0.6688         
##  Mcnemar's Test P-Value : 0.3367         
##                                          
##             Sensitivity : 0.9091         
##             Specificity : 0.7473         
##          Pos Pred Value : 0.8743         
##          Neg Pred Value : 0.8095         
##              Prevalence : 0.6592         
##          Detection Rate : 0.5993         
##    Detection Prevalence : 0.6854         
##       Balanced Accuracy : 0.8282         
##                                          
##        'Positive' Class : 0              
## 
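
For intuition, the headline accuracy is just the diagonal of the matrix over the total:

(160 + 68) / (160 + 23 + 16 + 68)   # = 0.8539, matching the report above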
# Validate the power of the SVM model using a confusion matrix
svm.predict = predict(svm.model, testset)
# test the accuracy of the svm model using a confusion matrix (caret is already loaded)
confusionMatrix(svm.predict, testset$Survived)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 161  24
##          1  15  67
##                                          
##                Accuracy : 0.8539         
##                  95% CI : (0.8058, 0.894)
##     No Information Rate : 0.6592         
##     P-Value [Acc > NIR] : 5.347e-13      
##                                          
##                   Kappa : 0.667          
##  Mcnemar's Test P-Value : 0.2002         
##                                          
##             Sensitivity : 0.9148         
##             Specificity : 0.7363         
##          Pos Pred Value : 0.8703         
##          Neg Pred Value : 0.8171         
##              Prevalence : 0.6592         
##          Detection Rate : 0.6030         
##    Detection Prevalence : 0.6929         
##       Balanced Accuracy : 0.8255         
##                                          
##        'Positive' Class : 0              
## 

Assessing Performance with the ROC Curve

Prepare the Predicted Probabilities

train.ctree.pred = predict(train.ctree, testset)
# treeresponse() returns one c(P(0), P(1)) vector per observation; take every
# other element (the P(0) values) and flip to get P(Survived = 1)
train.ctree.prob = 1 - unlist(treeresponse(train.ctree, testset), use.names = FALSE)[seq(1, nrow(testset) * 2, 2)]
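
A more readable way to get the same probabilities (a sketch): take P(Survived = 1) from each per-row vector directly.

train.ctree.prob = sapply(treeresponse(train.ctree, testset), function(p) p[2])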

Create an ROCR prediction object from the probabilities and plot the ROC curve

library(ROCR)
## Warning: package 'ROCR' was built under R version 3.2.5
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.5
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
train.ctree.prob.rocr = prediction(train.ctree.prob, testset$Survived)
train.ctree.perf = performance(train.ctree.prob.rocr, "tpr", "fpr")
train.ctree.auc.perf = performance(train.ctree.prob.rocr, measure = "auc", x.measure = "cutoff")
plot(train.ctree.perf, col = 2, colorize = TRUE, main = paste("AUC:", round(train.ctree.auc.perf@y.values[[1]], 4)))
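
The SVM's curve can be overlaid for comparison (a sketch; since the model was fit with probability = TRUE, predict() attaches a "probabilities" matrix whose columns are named by class level):

svm.pred = predict(svm.model, testset, probability = TRUE)
svm.prob = attr(svm.pred, "probabilities")[, "1"]
svm.prob.rocr = prediction(svm.prob, testset$Survived)
svm.perf = performance(svm.prob.rocr, "tpr", "fpr")
plot(svm.perf, add = TRUE, col = 4)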