#Load row data
train <- read.csv('/Users/srikant/Downloads/titanic-machine-learning-from-disaster/train.csv', header = TRUE)
test <- read.csv('/Users/srikant/Downloads/titanic-machine-learning-from-disaster/test.csv', header = TRUE)
#Add a "Survived" variable to the test set to allow for combining data sets
test.survived <- data.frame(Survived = rep("None", nrow(test)), test[,])
# Combine data sets
data.combined <- rbind(train,test.survived)
colnames(test.survived) <- colnames(train)
# A bit about R dada types
str(data.combined)
'data.frame': 1309 obs. of 12 variables:
$ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
$ Survived : chr "0" "1" "1" "1" ...
$ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
$ Name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
$ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
$ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
$ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
$ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
$ Ticket : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
$ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
$ Cabin : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
$ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
data.combined$Pclass <- as.factor(data.combined$Pclass)
data.combined$Survived <- as.factor(data.combined$Survived)
#Take a look at gross survival rates
table(data.combined$Survived)
0 1 None
549 342 418
#Distribution accross the classes
table(data.combined$Pclass)
1 2 3
323 277 709
#load ggplot2 package to use for visualization
library(ggplot2)
#Hypothesis -Rich folks survived at a higher rate
train$Pclass <- as.factor(train$Pclass)
ggplot(train, aes(x = Pclass, fill = factor(Survived))) +
geom_bar(width = 0.5) +
xlab("Pclass")+
ylab("Total Count") +
labs(fill = "Survived")

#Examining the first few names in the training data set
head(as.character(train$Name))
[1] "Braund, Mr. Owen Harris"
[2] "Cumings, Mrs. John Bradley (Florence Briggs Thayer)"
[3] "Heikkinen, Miss. Laina"
[4] "Futrelle, Mrs. Jacques Heath (Lily May Peel)"
[5] "Allen, Mr. William Henry"
[6] "Moran, Mr. James"
# How many unique names are there accross both train & test?
length(unique(as.character(data.combined$Name)))
[1] 1307
#Two duplicate names, take a closer look
#first , get the duplicate names and store them as vector
dup.names <- as.character(data.combined[which(duplicated(as.character(data.combined))), "name"])
#next take a look at the record in the combined data set
data.combined[which(data.combined$Name %in% dup.names),]
#what is up with Mr and Mrs thing?
library(stringr)
#any corelation with other variables (eg.,sibsp)?
misses <- data.combined[which(str_detect(data.combined$Name, "Miss")),]
misses[1:5,]
#Hypothesis -Name titles corelate with age
mrses <- data.combined[which(str_detect(data.combined$Name, "Mrs.")), ]
mrses[1:5,]
#check out males to see if the pattern continues
males <- data.combined[which(train$Sex == 'male'),]
males[1:5,]
#Expand upon the real relationship between 'Survived' and 'Pclass' by adding the new' Title' variable to the data set and then explore a potential 3-dimensional relationship.
#create a utility function to help with the title extraction
extractTitle <- function(Name){
name <- as.character(Name)
if(length(grep("Miss.", Name)) > 0) {
return("Miss.")
}else if (length(grep("Master.", Name)) > 0)
{return("Master.")
}else if (length(grep("Mrs.", Name)) > 0) {
return("Mrs")
}else if (length(grep("Mr", Name)) > 0){
return("Mr")
}else{
return("Others")
}
}
titles <- NULL
for (i in 1:nrow(data.combined)) {
titles <- c(titles, extractTitle(data.combined[i,"Name"]))
}
data.combined$title <- as.factor(titles)
#since we only have survived tables for the train set, only use the first 891 rows
ggplot(data.combined[1:891,], aes(x = title, fill = Survived))+
geom_bar(stat = "count")+
facet_wrap(~Pclass)+
ggtitle("Pclass")+
xlab("Title")+
ylab("Total Count")+
labs(fill = "Survived")

#visulization the 3 ways relationship of sex,pclass and survival,compare to analysis
ggplot(data.combined[1:891,], aes(x = Sex, fill = Survived)) +
geom_bar(stat = "count")+
facet_wrap(~Pclass)+
ggtitle("Pclass")+
xlab("Sex")+
ylab("Total Count")+
labs(fill = "Survived")

summary(data.combined$Age)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.17 21.00 28.00 29.88 39.00 80.00 263
summary(data.combined[1:891, "Age"])
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.42 20.12 28.00 29.70 38.00 80.00 177
#just to be through take a look at survival rates rates broken out by sex,pclass a
ggplot(data.combined[1:891,], aes(x = Age, fill = Survived)) +
facet_wrap(~Sex + Pclass)+
geom_bar(stat = "count")+
xlab("Age")+
ylab("Total Count")+
labs(fill = "Survived")

ggplot(data.combined[1:891,], aes(x = Sex, fill = Survived)) +
facet_wrap(~Sex + Pclass)+
geom_bar(stat = "count")+
xlab("Age")+
ylab("Total Count")+
labs(fill = "Survived")

#validation that "Master." is a good proxy for male children
boys <- data.combined[which(data.combined$title == "Master."),]
summary(boys$Age)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.330 2.000 4.000 5.483 9.000 14.500 8
# we know that "Miss" is more complicated lets examine further
misses <- data.combined[which(data.combined$title == "Miss."),]
summary(misses$Age)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.17 15.00 22.00 21.77 30.00 63.00 50
ggplot(misses[misses$Survived != "None",], aes(x = Age, fill = Survived)) +
facet_wrap(~Pclass)+
geom_bar(stat = "count")+
ggtitle("Age for 'Miss.' by Pclass")

xlab("Age")+
ylab("Total Count")+
labs(fill = "Survived")
Error in xlab("Age") + ylab("Total Count") :
non-numeric argument to binary operator
misses.alone <- misses[which(misses$SibSp == 0 & misses$Parch == 0),]
summary(misses.alone$Age)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
5.00 21.00 26.00 27.23 32.50 58.00 33
length(which(misses.alone$Age <= 14.5))
[1] 4
summary(data.combined$SibSp)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.4989 1.0000 8.0000
#can we treat as a factor?
length(unique(data.combined$SibSp))
[1] 7
#we beleive title is predective. visualize survival rates by sibsp,pclass and title
ggplot(data.combined[1:891,], aes(x = SibSp, fill = Survived)) +
geom_bar(stat = "count")+
facet_wrap(~Pclass + title)+
ggtitle("Pclass, Title")+
xlab("SibSp")+
ylab("Total Count")+
ylim(0,300)+
labs(fill = "Survived")

#treat the prach variable as a factor and vizualise
data.combined$Parch <- as.factor(data.combined$Parch)
ggplot(data.combined[1:891,], aes(x = Parch, fill = Survived)) +
geom_bar(stat = "count")+
facet_wrap(~Pclass + title)+
ggtitle("Pclass, Title")+
xlab("Parch")+
ylab("Total Count")+
ylim(0,300)+
labs(fill = "Survived")

#lets try some feature engineering what about creating a family six=ze feature
temp.SibSp <- c(train$SibSp, test$SibSp)
temp.Parch <- c(train$Parch, test$Parch)
data.combined$family.size <- as.factor(temp.SibSp + temp.Parch +1)
#Visualize it to see if it is predective
ggplot(data.combined[1:891,], aes(x = family.size,fill = Survived)) +
geom_bar(stat = "count")+
facet_wrap(~Pclass + title)+
ggtitle("Pclass, Title")+
xlab("family.Size")+
ylab("Total Count")+
ylim(0,300)+
labs(fill = "Survived")

NA
# Take a look at the ticket variable
str(data.combined$Ticket)
Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
# Based on the huge number of levels ticket really isn't a factor variable it is a string.
# Convert it and display first 20
data.combined$ticket <- as.character(data.combined$Ticket)
data.combined$ticket[1:20]
[1] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803"
[5] "373450" "330877" "17463" "349909"
[9] "347742" "237736" "PP 9549" "113783"
[13] "A/5. 2151" "347082" "350406" "248706"
[17] "382652" "244373" "345763" "2649"
# There's no immediately apparent structure in the data, let's see if we can find some.
# We'll start with taking a look at just the first char for each
ticket.first.char <- ifelse(data.combined$ticket == "", " ", substr(data.combined$ticket, 1, 1))
unique(ticket.first.char)
[1] "A" "P" "S" "1" "3" "2" "C" "7" "W" "4" "F" "L" "9" "6" "5" "8"
# OK, we can make a factor for analysis purposes and visualize
data.combined$ticket.first.char <- as.factor(ticket.first.char)
# First, a high-level plot of the data
ggplot(data.combined[1:891,], aes(x = ticket.first.char, fill = Survived)) +
geom_bar() +
ggtitle("Survivability by ticket.first.char") +
xlab("ticket.first.char") +
ylab("Total Count") +
ylim(0,350) +
labs(fill = "Survived")

# Ticket seems like it might be predictive, drill down a bit
ggplot(data.combined[1:891,], aes(x = ticket.first.char, fill = Survived)) +
geom_bar() +
facet_wrap(~Pclass) +
ggtitle("Pclass") +
xlab("ticket.first.char") +
ylab("Total Count") +
ylim(0,300) +
labs(fill = "Survived")

# Lastly, see if we get a pattern when using combination of pclass & title
ggplot(data.combined[1:891,], aes(x = ticket.first.char, fill = Survived)) +
geom_bar() +
facet_wrap(~Pclass + title) +
ggtitle("Pclass, Title") +
xlab("ticket.first.char") +
ylab("Total Count") +
ylim(0,200) +
labs(fill = "Survived")

# Next up - the fares Titanic passengers paid
summary(data.combined$Fare)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.000 7.896 14.454 33.295 31.275 512.329 1
length(unique(data.combined$Fare))
[1] 282
# Can't make fare a factor, treat as numeric & visualize with histogram
ggplot(data.combined, aes(x = Fare)) +
geom_bar() +
ggtitle("Combined Fare Distribution") +
xlab("Fare") +
ylab("Total Count") +
ylim(0,200)

# Let's check to see if fare has predictive power
ggplot(data.combined[1:891,], aes(x = Fare, fill = Survived)) +
geom_bar(binwidth = 5) +
facet_wrap(~Pclass + title) +
ggtitle("Pclass, Title") +
xlab("fare") +
ylab("Total Count") +
ylim(0,50) +
labs(fill = "Survived")
`geom_bar()` no longer has a `binwidth` parameter. Please use `geom_histogram()` instead.

# Analysis of the cabin variable
str(data.combined$cabin)
NULL
# Cabin really isn't a factor, make a string and the display first 100
data.combined$Cabin <- as.character(data.combined$Cabin)
data.combined$Cabin[1:100]
[1] "" "C85" "" "C123" ""
[6] "" "E46" "" "" ""
[11] "G6" "C103" "" "" ""
[16] "" "" "" "" ""
[21] "" "D56" "" "A6" ""
[26] "" "" "C23 C25 C27" "" ""
[31] "" "B78" "" "" ""
[36] "" "" "" "" ""
[41] "" "" "" "" ""
[46] "" "" "" "" ""
[51] "" "" "D33" "" "B30"
[56] "C52" "" "" "" ""
[61] "" "B28" "C83" "" ""
[66] "" "F33" "" "" ""
[71] "" "" "" "" ""
[76] "F G73" "" "" "" ""
[81] "" "" "" "" ""
[86] "" "" "" "C23 C25 C27" ""
[91] "" "" "E31" "" ""
[96] "" "A5" "D10 D12" "" ""
# Replace empty cabins with a "U"
data.combined[which(data.combined$Cabin.first.char == ""), "Cabin"] <- "U"
data.combined$Cabin[1:100]
[1] "" "C85" "" "C123" ""
[6] "" "E46" "" "" ""
[11] "G6" "C103" "" "" ""
[16] "" "" "" "" ""
[21] "" "D56" "" "A6" ""
[26] "" "" "C23 C25 C27" "" ""
[31] "" "B78" "" "" ""
[36] "" "" "" "" ""
[41] "" "" "" "" ""
[46] "" "" "" "" ""
[51] "" "" "D33" "" "B30"
[56] "C52" "" "" "" ""
[61] "" "B28" "C83" "" ""
[66] "" "F33" "" "" ""
[71] "" "" "" "" ""
[76] "F G73" "" "" "" ""
[81] "" "" "" "" ""
[86] "" "" "" "C23 C25 C27" ""
[91] "" "" "E31" "" ""
[96] "" "A5" "D10 D12" "" ""
# Take a look at just the first char as a factor
cabin.first.char <- as.factor(substr(data.combined$Cabin, 1, 1))
str(cabin.first.char)
Factor w/ 9 levels "","A","B","C",..: 1 4 1 4 1 1 6 1 1 1 ...
levels(cabin.first.char)
[1] "" "A" "B" "C" "D" "E" "F" "G" "T"
# Add to combined data set and plot
data.combined$Cabin.first.char <- cabin.first.char
# High level plot
ggplot(data.combined[,], aes(x = cabin.first.char, fill = Survived)) +
geom_bar() +
ggtitle("Survivability by cabin.first.char") +
xlab("cabin.first.char") +
ylab("Total Count") +
ylim(0,750) +
labs(fill = "Survived")

# Could have some predictive power, drill in
ggplot(data.combined[,], aes(x = cabin.first.char, fill = Survived)) +
geom_bar() +
facet_wrap(~Pclass) +
ggtitle("Survivability by cabin.first.char") +
xlab("Pclass") +
ylab("Total Count") +
ylim(0,500) +
labs(fill = "Survived")

NA
# Does this feature improve upon pclass + title?
ggplot(data.combined[,], aes(x = cabin.first.char, fill = Survived)) +
geom_bar() +
facet_wrap(~Pclass + title) +
ggtitle("Pclass, Title") +
xlab("cabin.first.char") +
ylab("Total Count") +
ylim(0,500) +
labs(fill = "Survived")

# What about folks with multiple cabins?
data.combined$cabin.multiple <- as.factor(ifelse(str_detect(data.combined$Cabin, " "), "Y", "N"))
ggplot(data.combined[1:891,], aes(x = cabin.multiple, fill = Survived)) +
geom_bar() +
facet_wrap(~Pclass + title) +
ggtitle("Pclass, Title") +
xlab("cabin.multiple") +
ylab("Total Count") +
ylim(0,350) +
labs(fill = "Survived")

NA
NA
# Does survivability depend on where you got onboard the Titanic?
str(data.combined$Embarked)
Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
levels(data.combined$Embarked)
[1] "" "C" "Q" "S"
# Plot data for analysis
ggplot(data.combined[1:891,], aes(x = Embarked, fill = Survived)) +
geom_bar() +
facet_wrap(~Pclass + title) +
ggtitle("Pclass, Title") +
xlab("embarked") +
ylab("Total Count") +
ylim(0,300) +
labs(fill = "Survived")

library(randomForest)
# Train a Random Forest with the default parameters using pclass & title
rf.train.1 <- data.combined[1:891, c("Pclass", "title")]
rf.label <- as.factor(train$Survived)
set.seed(1234)
rf.1 <- randomForest(x = rf.train.1, y = rf.label, importance = TRUE, ntree = 1000)
rf.1
Call:
randomForest(x = rf.train.1, y = rf.label, ntree = 1000, importance = TRUE)
Type of random forest: classification
Number of trees: 1000
No. of variables tried at each split: 1
OOB estimate of error rate: 20.99%
Confusion matrix:
0 1 class.error
0 536 13 0.02367942
1 174 168 0.50877193
varImpPlot(rf.1)

# Train a Random Forest using pclass, title, & sibsp
rf.train.2 <- data.combined[1:891, c("Pclass", "title", "SibSp")]
set.seed(1234)
rf.2 <- randomForest(x = rf.train.2, y = rf.label, importance = TRUE, ntree = 1000)
rf.2
Call:
randomForest(x = rf.train.2, y = rf.label, ntree = 1000, importance = TRUE)
Type of random forest: classification
Number of trees: 1000
No. of variables tried at each split: 1
OOB estimate of error rate: 18.74%
Confusion matrix:
0 1 class.error
0 501 48 0.08743169
1 119 223 0.34795322
varImpPlot(rf.2)

# Train a Random Forest using pclass, title, & parch
rf.train.3 <- data.combined[1:891, c("Pclass", "title", "Parch")]
set.seed(1234)
rf.3 <- randomForest(x = rf.train.3, y = rf.label, importance = TRUE, ntree = 1000)
rf.3
Call:
randomForest(x = rf.train.3, y = rf.label, ntree = 1000, importance = TRUE)
Type of random forest: classification
Number of trees: 1000
No. of variables tried at each split: 1
OOB estimate of error rate: 19.87%
Confusion matrix:
0 1 class.error
0 496 53 0.09653916
1 124 218 0.36257310
varImpPlot(rf.3)

# Train a Random Forest using pclass, title, sibsp, parch
rf.train.4 <- data.combined[1:891, c("Pclass", "title", "SibSp", "Parch")]
set.seed(1234)
rf.4 <- randomForest(x = rf.train.4, y = rf.label, importance = TRUE, ntree = 1000)
rf.4
Call:
randomForest(x = rf.train.4, y = rf.label, ntree = 1000, importance = TRUE)
Type of random forest: classification
Number of trees: 1000
No. of variables tried at each split: 2
OOB estimate of error rate: 18.52%
Confusion matrix:
0 1 class.error
0 490 59 0.1074681
1 106 236 0.3099415
varImpPlot(rf.4)

# Train a Random Forest using pclass, title, & family.size
rf.train.5 <- data.combined[1:891, c("Pclass", "title", "family.size")]
set.seed(1234)
rf.5 <- randomForest(x = rf.train.5, y = rf.label, importance = TRUE, ntree = 1000)
rf.5
Call:
randomForest(x = rf.train.5, y = rf.label, ntree = 1000, importance = TRUE)
Type of random forest: classification
Number of trees: 1000
No. of variables tried at each split: 1
OOB estimate of error rate: 18.18%
Confusion matrix:
0 1 class.error
0 486 63 0.1147541
1 99 243 0.2894737
varImpPlot(rf.5)

# Train a Random Forest using pclass, title, sibsp, & family.size
rf.train.6 <- data.combined[1:891, c("Pclass", "title", "SibSp", "family.size")]
set.seed(1234)
rf.6 <- randomForest(x = rf.train.6, y = rf.label, importance = TRUE, ntree = 1000)
rf.6
Call:
randomForest(x = rf.train.6, y = rf.label, ntree = 1000, importance = TRUE)
Type of random forest: classification
Number of trees: 1000
No. of variables tried at each split: 2
OOB estimate of error rate: 19.3%
Confusion matrix:
0 1 class.error
0 486 63 0.1147541
1 109 233 0.3187135
varImpPlot(rf.6)

# Train a Random Forest using pclass, title, parch, & family.size
rf.train.7 <- data.combined[1:891, c("Pclass", "title", "Parch", "family.size")]
set.seed(1234)
rf.7 <- randomForest(x = rf.train.7, y = rf.label, importance = TRUE, ntree = 1000)
rf.7
Call:
randomForest(x = rf.train.7, y = rf.label, ntree = 1000, importance = TRUE)
Type of random forest: classification
Number of trees: 1000
No. of variables tried at each split: 2
OOB estimate of error rate: 19.19%
Confusion matrix:
0 1 class.error
0 489 60 0.1092896
1 111 231 0.3245614
varImpPlot(rf.7)

#cross validation
# Subset our test records and features
test.submit.df <- data.combined[892:1309, c("Pclass", "title", "family.size")]
# Make predictions
rf.5.preds <- predict(rf.5, test.submit.df)
table(rf.5.preds)
rf.5.preds
0 1
258 160
# Write out a CSV file for submission to Kaggle
submit.df <- data.frame(PassengerId = rep(892:1309), Survived = rf.5.preds)
write.csv(submit.df, file = "RF_SUB_20190730_1.csv", row.names = FALSE)
# more accurate estimates
library(caret)
library(doSNOW)
set.seed(2348)
cv.10.folds <- createMultiFolds(rf.label, k = 10, times = 10)
# Check stratification
table(rf.label)
rf.label
0 1
549 342
342 / 549
[1] 0.6229508
table(rf.label[cv.10.folds[[33]]])
0 1
494 307
308 / 494
[1] 0.6234818
# Set up caret's trainControl object per above.
ctrl.1 <- trainControl(method = "repeatedcv", number = 10, repeats = 10,
index = cv.10.folds)
# Set up doSNOW package for multi-core training. This is helpful as we're going
# to be training a lot of trees.
# NOTE - This works on Windows and Mac, unlike doMC
cl <- makeCluster(6, type = "SOCK")
registerDoSNOW(cl)
# Set seed for reproducibility and train
install.packages('e1071', dependencies=TRUE)
Error in install.packages : Updating loaded packages
set.seed(34324)
rf.5.cv.1 <- train(x = rf.train.5, y = rf.label, method = "rf", tuneLength = 3,
ntree = 1000, trControl = ctrl.1)
note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
install.packages("e1071", dependencies = TRUE)
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.6/e1071_1.7-2.tgz'
Content type 'application/x-gzip' length 900872 bytes (879 KB)
==================================================
downloaded 879 KB
The downloaded binary packages are in
/var/folders/h2/xs484r_17wgd16z7g2j864k40000gp/T//Rtmp04DssK/downloaded_packages
#Shutdown cluster
stopCluster(cl)
set.seed(5983)
cv.5.folds <- createMultiFolds(rf.label, k = 5, times = 10)
ctrl.2 <- trainControl(method = "repeatedcv", number = 5, repeats = 10,
index = cv.5.folds)
cl <- makeCluster(6, type = "SOCK")
registerDoSNOW(cl)
set.seed(89472)
rf.5.cv.2 <- train(x = rf.train.5, y = rf.label, method = "rf", tuneLength = 3,
ntree = 1000, trControl = ctrl.2)
note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
#Shutdown cluster
stopCluster(cl)
# 5-fold CV isn't better. Move to 3-fold CV repeated 10 times.
set.seed(37596)
cv.3.folds <- createMultiFolds(rf.label, k = 3, times = 10)
ctrl.3 <- trainControl(method = "repeatedcv", number = 3, repeats = 10,
index = cv.3.folds)
cl <- makeCluster(6, type = "SOCK")
registerDoSNOW(cl)
set.seed(94622)
rf.5.cv.3 <- train(x = rf.train.5, y = rf.label, method = "rf", tuneLength = 3,
ntree = 64, trControl = ctrl.3)
note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
#Shutdown cluster
stopCluster(cl)
# Check out results
rf.5.cv.3
Random Forest
891 samples
3 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (3 fold, repeated 10 times)
Summary of sample sizes: 594, 594, 594, 594, 594, 594, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.8120090 0.5900921
3 0.8094276 0.5841791
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
# Install and load packages
#install.packages("rpart")
#install.packages("rpart.plot")
library(rpart)
library(rpart.plot)
# Per video #5, let's use 3-fold CV repeated 10 times
# Create utility function
rpart.cv <- function(seed, training, labels, ctrl) {
cl <- makeCluster(6, type = "SOCK")
registerDoSNOW(cl)
set.seed(seed)
# Leverage formula interface for training
rpart.cv <- train(x = training, y = labels, method = "rpart", tuneLength = 30,
trControl = ctrl)
#Shutdown cluster
stopCluster(cl)
return (rpart.cv)
}
# Grab features
features <- c("Pclass", "title", "family.size")
rpart.train.1 <- data.combined[1:891, features]
# Run CV and check out results
rpart.1.cv.1 <- rpart.cv(94622, rpart.train.1, rf.label, ctrl.3)
rpart.1.cv.1
CART
891 samples
3 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (3 fold, repeated 10 times)
Summary of sample sizes: 594, 594, 594, 594, 594, 594, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.00000000 0.8123457 0.5939089
0.01542650 0.8210999 0.6155647
0.03085299 0.8210999 0.6155647
0.04627949 0.8126824 0.5976956
0.06170599 0.7900112 0.5565000
0.07713249 0.7891134 0.5548539
0.09255898 0.7868687 0.5511812
0.10798548 0.7852974 0.5485631
0.12341198 0.7851852 0.5485353
0.13883848 0.7851852 0.5485353
0.15426497 0.7851852 0.5485353
0.16969147 0.7851852 0.5485353
0.18511797 0.7851852 0.5485353
0.20054446 0.7851852 0.5485353
0.21597096 0.7851852 0.5485353
0.23139746 0.7851852 0.5485353
0.24682396 0.7851852 0.5485353
0.26225045 0.7851852 0.5485353
0.27767695 0.7851852 0.5485353
0.29310345 0.7851852 0.5485353
0.30852995 0.7851852 0.5485353
0.32395644 0.7851852 0.5485353
0.33938294 0.7851852 0.5485353
0.35480944 0.7851852 0.5485353
0.37023593 0.7851852 0.5485353
0.38566243 0.7784512 0.5277248
0.40108893 0.7645342 0.4856939
0.41651543 0.7511785 0.4447832
0.43194192 0.7141414 0.3281876
0.44736842 0.6854097 0.2353859
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.03085299.
# Plot
prp(rpart.1.cv.1$finalModel, type = 0, extra = 1, under = TRUE)

NA
NA
# The plot bring out some interesting lines of investigation. Namely:
# 1 - Titles of "Mr." and "Other" are predicted to perish at an
# overall accuracy rate of 83.2 %.
# 2 - Titles of "Master.", "Miss.", & "Mrs." in 1st & 2nd class
# are predicted to survive at an overall accuracy rate of 94.9%.
# 3 - Titles of "Master.", "Miss.", & "Mrs." in 3rd class with
# family sizes equal to 5, 6, 8, & 11 are predicted to perish
# with 100% accuracy.
# 4 - Titles of "Master.", "Miss.", & "Mrs." in 3rd class with
# family sizes not equal to 5, 6, 8, or 11 are predicted to
# survive with 59.6% accuracy.
# Both rpart and rf confirm that title is important, let's investigate further
table(data.combined$title)
Master. Miss. Mr Mrs Others
61 260 758 199 31
# Parse out last name and title
data.combined[1:25, "Name"]
[1] Braund, Mr. Owen Harris
[2] Cumings, Mrs. John Bradley (Florence Briggs Thayer)
[3] Heikkinen, Miss. Laina
[4] Futrelle, Mrs. Jacques Heath (Lily May Peel)
[5] Allen, Mr. William Henry
[6] Moran, Mr. James
[7] McCarthy, Mr. Timothy J
[8] Palsson, Master. Gosta Leonard
[9] Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
[10] Nasser, Mrs. Nicholas (Adele Achem)
[11] Sandstrom, Miss. Marguerite Rut
[12] Bonnell, Miss. Elizabeth
[13] Saundercock, Mr. William Henry
[14] Andersson, Mr. Anders Johan
[15] Vestrom, Miss. Hulda Amanda Adolfina
[16] Hewlett, Mrs. (Mary D Kingcome)
[17] Rice, Master. Eugene
[18] Williams, Mr. Charles Eugene
[19] Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)
[20] Masselmani, Mrs. Fatima
[21] Fynney, Mr. Joseph J
[22] Beesley, Mr. Lawrence
[23] McGowan, Miss. Anna "Annie"
[24] Sloper, Mr. William Thompson
[25] Palsson, Miss. Torborg Danira
1307 Levels: Abbing, Mr. Anthony ... Zakarian, Mr. Ortin
library(stringr)
name.splits <- str_split(data.combined$Name, ",")
name.splits[1]
[[1]]
[1] "Braund" " Mr. Owen Harris"
last.names <- sapply(name.splits, "[", 1)
last.names[1:10]
[1] "Braund" "Cumings" "Heikkinen" "Futrelle" "Allen" "Moran"
[7] "McCarthy" "Palsson" "Johnson" "Nasser"
# Add last names to dataframe in case we find it useful later
data.combined$last.name <- last.names
# Now for titles
name.splits <- str_split(sapply(name.splits, "[", 2), " ")
titles <- sapply(name.splits, "[", 2)
unique(titles)
[1] "Mr." "Mrs." "Miss." "Master." "Don." "Rev."
[7] "Dr." "Mme." "Ms." "Major." "Lady." "Sir."
[13] "Mlle." "Col." "Capt." "the" "Jonkheer." "Dona."
# What's up with a title of 'the'?
data.combined[which(titles == "the"),]
# Re-map titles to be more exact
titles[titles %in% c("Dona.", "the")] <- "Lady."
titles[titles %in% c("Ms.", "Mlle.")] <- "Miss."
titles[titles == "Mme."] <- "Mrs."
titles[titles %in% c("Jonkheer.", "Don.")] <- "Sir."
titles[titles %in% c("Col.", "Capt.", "Major.")] <- "Officer"
table(titles)
titles
Dr. Lady. Master. Miss. Mr. Mrs. Officer Rev. Sir.
8 3 61 264 757 198 7 8 3
# Make title a factor
data.combined$new.title <- as.factor(titles)
# Visualize new version of title
ggplot(data.combined[1:891,], aes(x = new.title, fill = Survived)) +
geom_bar() +
facet_wrap(~ Pclass) +
ggtitle("Surival Rates for new.title by pclass")

# Collapse titles based on visual analysis
indexes <- which(data.combined$new.title == "Lady."|data.combined$new.title == "Dr.")
data.combined$new.title[indexes] <- "Mrs."
indexes <- which(
data.combined$new.title == "Rev." |
data.combined$new.title == "Sir." |
data.combined$new.title == "Officer")
data.combined$new.title[indexes] <- "Mr."
# Visualize
ggplot(data.combined[1:891,], aes(x = new.title, fill = Survived)) +
geom_bar() +
facet_wrap(~ Pclass) +
ggtitle("Surival Rates for Collapsed new.title by pclass")

# Grab features
features <- c("Pclass", "new.title", "family.size")
rpart.train.2 <- data.combined[1:891, features]
# Run CV and check out results
rpart.2.cv.1 <- rpart.cv(94622, rpart.train.2, rf.label, ctrl.3)
rpart.2.cv.1
CART
891 samples
3 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (3 fold, repeated 10 times)
Summary of sample sizes: 594, 594, 594, 594, 594, 594, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.00000000 0.8170595 0.6048352
0.01572898 0.8269360 0.6291944
0.03145796 0.8269360 0.6291944
0.04718693 0.8199776 0.6141663
0.06291591 0.7989899 0.5761638
0.07864489 0.7973064 0.5731135
0.09437387 0.7936027 0.5667472
0.11010284 0.7913580 0.5629801
0.12583182 0.7912458 0.5629478
0.14156080 0.7912458 0.5629478
0.15728978 0.7912458 0.5629478
0.17301875 0.7912458 0.5629478
0.18874773 0.7912458 0.5629478
0.20447671 0.7912458 0.5629478
0.22020569 0.7912458 0.5629478
0.23593466 0.7912458 0.5629478
0.25166364 0.7912458 0.5629478
0.26739262 0.7912458 0.5629478
0.28312160 0.7912458 0.5629478
0.29885057 0.7912458 0.5629478
0.31457955 0.7912458 0.5629478
0.33030853 0.7912458 0.5629478
0.34603751 0.7912458 0.5629478
0.36176649 0.7912458 0.5629478
0.37749546 0.7912458 0.5629478
0.39322444 0.7912458 0.5629478
0.40895342 0.7840629 0.5414002
0.42468240 0.7563412 0.4570712
0.44041137 0.7178451 0.3369192
0.45614035 0.6934905 0.2598962
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.03145796.
# Plot
prp(rpart.2.cv.1$finalModel, type = 0, extra = 1, under = TRUE)

# Dive in on 1st class "Mr."
indexes.first.mr <- which(data.combined$new.title == "Mr." & data.combined$Pclass == "1")
first.mr.df <- data.combined[indexes.first.mr, ]
summary(first.mr.df)
PassengerId Survived Pclass Name
Min. : 7.0 0 :75 1:169 Anderson, Mr. Harry : 1
1st Qu.: 371.0 1 :40 2: 0 Andrews, Mr. Thomas Jr : 1
Median : 646.0 None:54 3: 0 Artagaveytia, Mr. Ramon : 1
Mean : 655.7 Barkworth, Mr. Algernon Henry Wilson: 1
3rd Qu.: 967.0 Baumann, Mr. John D : 1
Max. :1299.0 Baxter, Mr. Quigg Edmond : 1
(Other) :163
Sex Age SibSp Parch Ticket
female: 0 Min. :17.00 Min. :0.0000 0 :141 113503 : 3
male :169 1st Qu.:31.00 1st Qu.:0.0000 1 : 20 110465 : 2
Median :42.00 Median :0.0000 2 : 6 112058 : 2
Mean :42.12 Mean :0.3077 3 : 1 113059 : 2
3rd Qu.:50.75 3rd Qu.:1.0000 4 : 1 113796 : 2
Max. :80.00 Max. :3.0000 5 : 0 12749 : 2
NA's :27 (Other): 0 (Other):156
Fare Cabin Embarked title family.size
Min. : 0.00 Length:169 : 0 Master.: 0 1 :106
1st Qu.: 27.72 Class :character C: 66 Miss. : 0 2 : 45
Median : 45.50 Mode :character Q: 0 Mr :160 3 : 13
Mean : 67.36 S:103 Mrs : 0 4 : 2
3rd Qu.: 77.29 Others : 9 6 : 2
Max. :512.33 5 : 1
(Other): 0
ticket ticket.first.char Cabin.first.char cabin.multiple
Length:169 1 :106 C :46 N:156
Class :character P : 44 :42 Y: 13
Mode :character 3 : 7 B :26
6 : 4 D :19
2 : 3 E :19
F : 2 A :16
(Other): 3 (Other): 1
last.name new.title
Length:169 Mr. :169
Class :character Dr. : 0
Mode :character Lady. : 0
Master.: 0
Miss. : 0
Mrs. : 0
(Other): 0
# One female?
first.mr.df[first.mr.df$Sex == "female",]
NA
# Update new.title feature
indexes <- which(data.combined$new.title == "Mr." &
data.combined$sex == "female")
data.combined$new.title[indexes] <- "Mrs."
# Any other gender slip-ups?
length(which(data.combined$sex == "female" &
(data.combined$new.title == "Master." |
data.combined$new.title == "Mr.")))
[1] 0
# Refresh data frame
indexes.first.mr <- which(data.combined$new.title == "Mr." & data.combined$Pclass == "1")
first.mr.df <- data.combined[indexes.first.mr, ]
# Take a look at some of the high fares
indexes <- which(data.combined$ticket == "PC 17755" |
data.combined$ticket == "PC 17611" |
data.combined$ticket == "113760")
View(data.combined[indexes,])
# Visualize survival rates for 1st class "Mr." by fare
ggplot(first.mr.df, aes(x = Fare, fill = Survived)) +
geom_density(alpha = 0.5) +
ggtitle("1st Class 'Mr.' Survival Rates by fare")

# Engineer features based on all the passengers with the same ticket
ticket.party.size <- rep(0, nrow(data.combined))
avg.Fare <- rep(0.0, nrow(data.combined))
tickets <- unique(data.combined$ticket)
for (i in 1:length(tickets)) {
current.ticket <- tickets[i]
party.indexes <- which(data.combined$ticket == current.ticket)
current.avg.Fare <- data.combined[party.indexes[1], "Fare"] / length(party.indexes)
for (k in 1:length(party.indexes)) {
ticket.party.size[party.indexes[k]] <- length(party.indexes)
avg.Fare[party.indexes[k]] <- current.avg.Fare
}
}
data.combined$ticket.party.size <- ticket.party.size
data.combined$avg.Fare <- avg.Fare
# Refresh 1st class "Mr." dataframe
first.mr.df <- data.combined[indexes.first.mr, ]
summary(first.mr.df)
PassengerId Survived Pclass Name
Min. : 7.0 0 :75 1:169 Anderson, Mr. Harry : 1
1st Qu.: 371.0 1 :40 2: 0 Andrews, Mr. Thomas Jr : 1
Median : 646.0 None:54 3: 0 Artagaveytia, Mr. Ramon : 1
Mean : 655.7 Barkworth, Mr. Algernon Henry Wilson: 1
3rd Qu.: 967.0 Baumann, Mr. John D : 1
Max. :1299.0 Baxter, Mr. Quigg Edmond : 1
(Other) :163
Sex Age SibSp Parch Ticket
female: 0 Min. :17.00 Min. :0.0000 0 :141 113503 : 3
male :169 1st Qu.:31.00 1st Qu.:0.0000 1 : 20 110465 : 2
Median :42.00 Median :0.0000 2 : 6 112058 : 2
Mean :42.12 Mean :0.3077 3 : 1 113059 : 2
3rd Qu.:50.75 3rd Qu.:1.0000 4 : 1 113796 : 2
Max. :80.00 Max. :3.0000 5 : 0 12749 : 2
NA's :27 (Other): 0 (Other):156
Fare Cabin Embarked title family.size
Min. : 0.00 Length:169 : 0 Master.: 0 1 :106
1st Qu.: 27.72 Class :character C: 66 Miss. : 0 2 : 45
Median : 45.50 Mode :character Q: 0 Mr :160 3 : 13
Mean : 67.36 S:103 Mrs : 0 4 : 2
3rd Qu.: 77.29 Others : 9 6 : 2
Max. :512.33 5 : 1
(Other): 0
ticket ticket.first.char Cabin.first.char cabin.multiple
Length:169 1 :106 C :46 N:156
Class :character P : 44 :42 Y: 13
Mode :character 3 : 7 B :26
6 : 4 D :19
2 : 3 E :19
F : 2 A :16
(Other): 3 (Other): 1
last.name new.title ticket.party.size avg.Fare
Length:169 Mr. :169 Min. :1.000 Min. : 0.00
Class :character Dr. : 0 1st Qu.:1.000 1st Qu.: 26.29
Mode :character Lady. : 0 Median :1.000 Median : 28.54
Master.: 0 Mean :1.899 Mean : 32.16
Miss. : 0 3rd Qu.:2.000 3rd Qu.: 37.48
Mrs. : 0 Max. :7.000 Max. :128.08
(Other): 0
# Visualize new features
ggplot(first.mr.df[first.mr.df$Survived != "None",], aes(x = ticket.party.size, fill = Survived)) +
geom_density(alpha = 0.5) +
ggtitle("Survival Rates 1st Class 'Mr.' by ticket.party.size")

ggplot(first.mr.df[first.mr.df$Survived != "None",], aes(x = avg.Fare, fill = Survived)) +
geom_density(alpha = 0.5) +
ggtitle("Survival Rates 1st Class 'Mr.' by avg.fare")

# Hypothesis - ticket.party.size is highly correlated with avg.fare
summary(data.combined$avg.Fare)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 7.55 8.05 14.76 15.01 128.08 1
# One missing value, take a look
data.combined[is.na(data.combined$avg.Fare), ]
NA
# Get records for similar passengers and summarize avg.fares
indexes <- with(data.combined, which(Pclass == "3" & title == "Mr." & family.size == 1 &
ticket != "3701"))
similar.na.passengers <- data.combined[indexes,]
summary(similar.na.passengers$avg.Fare)
Min. 1st Qu. Median Mean 3rd Qu. Max.
# Use median since close to mean and a little higher than mean
data.combined[is.na(avg.Fare), "avg.Fare"] <- 7.840
# Leverage caret's preProcess function to normalize data
preproc.data.combined <- data.combined[, c("ticket.party.size", "avg.Fare")]
preProc <- preProcess(preproc.data.combined, method = c("center", "scale"))
postproc.data.combined <- predict(preProc, preproc.data.combined)
# Hypothesis refuted for all data
cor(postproc.data.combined$ticket.party.size, postproc.data.combined$avg.Fare)
[1] 0.09428625
# How about for just 1st class all-up?
indexes <- which(data.combined$Pclass == "1")
cor(postproc.data.combined$ticket.party.size[indexes],
postproc.data.combined$avg.Fare[indexes])
[1] 0.2576249
# Hypothesis refuted again
# OK, let's see if our feature engineering has made any difference
features <- c("Pclass", "new.title", "family.size", "ticket.party.size", "avg.Fare")
rpart.train.3 <- data.combined[1:891, features]
# Run CV and check out results
rpart.3.cv.1 <- rpart.cv(94622, rpart.train.3, rf.label, ctrl.3)
rpart.3.cv.1
CART
891 samples
5 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (3 fold, repeated 10 times)
Summary of sample sizes: 594, 594, 594, 594, 594, 594, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.00000000 0.8217733 0.6159969
0.01572898 0.8342312 0.6406869
0.03145796 0.8277217 0.6286595
0.04718693 0.8233446 0.6195142
0.06291591 0.7989899 0.5755318
0.07864489 0.7977553 0.5735997
0.09437387 0.7940516 0.5672334
0.11010284 0.7913580 0.5629801
0.12583182 0.7912458 0.5629478
0.14156080 0.7912458 0.5629478
0.15728978 0.7912458 0.5629478
0.17301875 0.7912458 0.5629478
0.18874773 0.7912458 0.5629478
0.20447671 0.7912458 0.5629478
0.22020569 0.7912458 0.5629478
0.23593466 0.7912458 0.5629478
0.25166364 0.7912458 0.5629478
0.26739262 0.7912458 0.5629478
0.28312160 0.7912458 0.5629478
0.29885057 0.7912458 0.5629478
0.31457955 0.7912458 0.5629478
0.33030853 0.7912458 0.5629478
0.34603751 0.7912458 0.5629478
0.36176649 0.7912458 0.5629478
0.37749546 0.7912458 0.5629478
0.39322444 0.7912458 0.5629478
0.40895342 0.7840629 0.5414002
0.42468240 0.7563412 0.4570712
0.44041137 0.7178451 0.3369192
0.45614035 0.6934905 0.2598962
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.01572898.
# Plot
prp(rpart.3.cv.1$finalModel, type = 0, extra = 1, under = TRUE)

NA
NA
- Submitting, scoring, and some analysis.
#
# Rpart scores 0.80383
#
# Subset our test records and features
test.submit.df <- data.combined[892:1309, features]
# Make predictions
rpart.3.preds <- predict(rpart.3.cv.1$finalModel, test.submit.df, type = "class")
table(rpart.3.preds)
rpart.3.preds
0 1
265 153
# Write out a CSV file for submission to Kaggle
submit.df <- data.frame(PassengerId = rep(892:1309), Survived = rpart.3.preds)
write.csv(submit.df, file = "RPART_SUB_20190731_1.csv", row.names = FALSE)
#
# Random forest scores 0.80861
#
features <- c("Pclass", "new.title", "ticket.party.size", "avg.Fare")
rf.train.temp <- data.combined[1:891, features]
library(randomForest)
set.seed(1234)
rf.temp <- randomForest(x = rf.train.temp, y = rf.label, ntree = 1000)
rf.temp
Call:
randomForest(x = rf.train.temp, y = rf.label, ntree = 1000)
Type of random forest: classification
Number of trees: 1000
No. of variables tried at each split: 2
OOB estimate of error rate: 16.39%
Confusion matrix:
0 1 class.error
0 501 48 0.08743169
1 98 244 0.28654971
test.submit.df <- data.combined[892:1309, features]
# Make predictions
rf.preds <- predict(rf.temp, test.submit.df)
table(rf.preds)
rf.preds
0 1
266 152
# Write out a CSV file for submission to Kaggle
submit.df <- data.frame(PassengerId = rep(892:1309), Survived = rf.preds)
write.csv(submit.df, file = "RF_SUB_20190731_1.csv", row.names = FALSE)
#
# If we want to improve our model, a good place to start is focusing on where it
# gets things wrong!
#
# First, let's explore our collection of features using mutual information to
# gain some additional insight. Our intuition is that the plot of our tree
# should align well to the definition of mutual information.
#install.packages("infotheo")
library(infotheo)
mutinformation(rf.label, data.combined$Pclass[1:891])
[1] 0.05810725
mutinformation(rf.label, data.combined$Sex[1:891])
[1] 0.1508705
mutinformation(rf.label, data.combined$SibSp[1:891])
[1] 0.02319709
mutinformation(rf.label, data.combined$Parch[1:891])
[1] 0.01636558
mutinformation(rf.label, discretize(data.combined$Fare[1:891]))
[1] 0.06616096
mutinformation(rf.label, data.combined$Embarked[1:891])
[1] 0.01666817
mutinformation(rf.label, data.combined$title[1:891])
[1] 0.1661094
mutinformation(rf.label, data.combined$family.size[1:891])
[1] 0.04778124
mutinformation(rf.label, data.combined$ticket.first.char[1:891])
[1] 0.0686185
mutinformation(rf.label, data.combined$cabin.multiple[1:891])
[1] 0.002248365
mutinformation(rf.label, data.combined$new.title[1:891])
[1] 0.1684553
mutinformation(rf.label, data.combined$ticket.party.size[1:891])
[1] 0.05860494
mutinformation(rf.label, discretize(data.combined$avg.Fare[1:891]))
[1] 0.05206484
# OK, now let's leverage the tsne algorithm to create a 2-D representation of our data
# suitable for visualization starting with folks our model gets right very often - folks
# with titles other than 'Mr."
#install.packages("Rtsne")
library(Rtsne)
most.correct <- data.combined[data.combined$new.title != "Mr.",]
indexes <- which(most.correct$Survived != "None")
# NOTE - Bug fix for original version. Rtsne needs a seed to ensure consistent
# output between runs.
set.seed(984357)
tsne.1 <- Rtsne(most.correct[, features], check_duplicates = FALSE)
ggplot(NULL, aes(x = tsne.1$Y[indexes, 1], y = tsne.1$Y[indexes, 2],
color = most.correct$Survived[indexes])) +
geom_point() +
labs(color = "Survived") +
ggtitle("tsne 2D Visualization of Features for new.title Other than 'Mr.'")

# To get a baseline, let's use conditional mutual information on the tsne X and
# Y features for females and boys in 1st and 2nd class. The intuition here is that
# the combination of these features should be higher than any individual feature
# we looked at above.
condinformation(most.correct$Survived[indexes], discretize(tsne.1$Y[indexes,]))
[1] 0.2432662
# As one more comparison, we can leverage conditional mutual information using
# the top two features used in our tree plot - new.title and pclass
condinformation(rf.label, data.combined[1:891, c("new.title", "Pclass")])
[1] 0.2420849
# OK, now let's take a look at adult males since our model has the biggest
# potential upside for improving (i.e., the tree predicts incorrectly for 86
# adult males). Let's visualize with tsne.
misters <- data.combined[data.combined$new.title == "Mr.",]
indexes <- which(misters$Survived != "None")
tsne.2 <- Rtsne(misters[, features], check_duplicates = FALSE)
ggplot(NULL, aes(x = tsne.2$Y[indexes, 1], y = tsne.2$Y[indexes, 2],
color = misters$Survived[indexes])) +
geom_point() +
labs(color = "Survived") +
ggtitle("tsne 2D Visualization of Features for new.title of 'Mr.'")

NA
NA
# Now conditional mutual information for tsne features for adult males
condinformation(misters$Survived[indexes], discretize(tsne.2$Y[indexes,]))
[1] 0.07005047
#
# Idea - How about creating tsne featues for all of the training data and
# using them in our model?
#
tsne.3 <- Rtsne(data.combined[, features], check_duplicates = FALSE)
ggplot(NULL, aes(x = tsne.3$Y[1:891, 1], y = tsne.3$Y[1:891, 2],
color = data.combined$Survived[1:891])) +
geom_point() +
labs(color = "Survived") +
ggtitle("tsne 2D Visualization of Features for all Training Data")

# Now conditional mutual information for tsne features for all training
condinformation(data.combined$Survived[1:891], discretize(tsne.3$Y[1:891,]))
[1] 0.1986334
# Add the tsne features to our data frame for use in model building
data.combined$tsne.x <- tsne.3$Y[,1]
data.combined$tsne.y <- tsne.3$Y[,2]
---
title: "Titanic: Machine Learning from Disaster "
output: html_notebook
---

```{r}
#Load row data
train <- read.csv('/Users/srikant/Downloads/titanic-machine-learning-from-disaster/train.csv', header = TRUE)
test <- read.csv('/Users/srikant/Downloads/titanic-machine-learning-from-disaster/test.csv', header = TRUE)
```
```{r}
#Add a "Survived" variable to the test set to allow for combining data sets
test.survived <- data.frame(Survived = rep("None", nrow(test)), test[,])
```
```{r}
# Combine data sets
data.combined <- rbind(train,test.survived)
colnames(test.survived) <- colnames(train)
```
```{r}
# A bit about R dada types 
str(data.combined)
data.combined$Pclass <- as.factor(data.combined$Pclass)
data.combined$Survived <- as.factor(data.combined$Survived)
```
```{r}
#Take a look at gross survival rates
table(data.combined$Survived)
```
```{r}
#Distribution accross the classes
table(data.combined$Pclass)
```
```{r}
#load ggplot2 package to use for visualization
library(ggplot2)
```
```{r}
#Hypothesis -Rich folks survived at a higher rate
train$Pclass <- as.factor(train$Pclass)
ggplot(train, aes(x = Pclass, fill = factor(Survived))) +
  geom_bar(width = 0.5) +
  xlab("Pclass")+
  ylab("Total Count") +
  labs(fill = "Survived")
```
```{r}
#Examining the first few names in the training data set
head(as.character(train$Name))

```
```{r}
# How many unique names are there accross both train & test?
length(unique(as.character(data.combined$Name)))
```

```{r}
#Two duplicate names, take a closer look
#first , get the duplicate names and store them as vector
dup.names <- as.character(data.combined[which(duplicated(as.character(data.combined))), "name"]) 
```
```{r}
#next take a look at the record in the combined data set
data.combined[which(data.combined$Name %in% dup.names),]
```
```{r}
#what is up with Mr and Mrs thing?
library(stringr)
#any corelation with other variables (eg.,sibsp)?
misses <- data.combined[which(str_detect(data.combined$Name, "Miss")),]
misses[1:5,]
```
```{r}
  #Hypothesis -Name titles corelate with age
mrses <- data.combined[which(str_detect(data.combined$Name, "Mrs.")), ]
mrses[1:5,]
#check out males to see if the pattern continues
males <- data.combined[which(train$Sex == 'male'),]
males[1:5,]
```
```{r}
#Expand upon the real relationship between 'Survived' and 'Pclass' by adding the new' Title' variable to the data set and then explore a potential 3-dimensional relationship.

#create a utility function to help with the title extraction
extractTitle <- function(Name){
  name <- as.character(Name)
  
  if(length(grep("Miss.", Name)) > 0) {
    return("Miss.")
  }else if (length(grep("Master.", Name)) > 0)
  {return("Master.")
  }else if (length(grep("Mrs.", Name)) > 0) {
      return("Mrs")
  }else if (length(grep("Mr", Name)) > 0){
      return("Mr")
  }else{
      return("Others")
    }
}

titles <- NULL
for (i in 1:nrow(data.combined)) {
  titles <- c(titles, extractTitle(data.combined[i,"Name"]))
  
} 
data.combined$title <- as.factor(titles)
```
```{r}
#since we only have survived tables for the train set, only use the first 891 rows
ggplot(data.combined[1:891,], aes(x = title, fill = Survived))+
         geom_bar(stat = "count")+
         facet_wrap(~Pclass)+
         ggtitle("Pclass")+
         xlab("Title")+
         ylab("Total Count")+
         labs(fill = "Survived")
```
```{r}
#visulization the 3 ways relationship of sex,pclass and survival,compare to analysis
ggplot(data.combined[1:891,], aes(x = Sex, fill = Survived)) +
  geom_bar(stat = "count")+
         facet_wrap(~Pclass)+
         ggtitle("Pclass")+
         xlab("Sex")+
         ylab("Total Count")+
         labs(fill = "Survived")
```
```{r}
summary(data.combined$Age)
```
```{r}
summary(data.combined[1:891, "Age"])
```


```{r}
#just to be through take a look at survival rates rates broken out by sex,pclass a
ggplot(data.combined[1:891,], aes(x = Age, fill = Survived)) +
         facet_wrap(~Sex + Pclass)+
         geom_bar(stat = "count")+
         xlab("Age")+
         ylab("Total Count")+
         labs(fill = "Survived")
```
```{r}
ggplot(data.combined[1:891,], aes(x = Sex, fill = Survived)) +
         facet_wrap(~Sex + Pclass)+
         geom_bar(stat = "count")+
         xlab("Age")+
         ylab("Total Count")+
         labs(fill = "Survived")
```
```{r}
#validation that "Master." is a good proxy for male children
boys <- data.combined[which(data.combined$title == "Master."),]
summary(boys$Age)
```
```{r}
# we know that "Miss" is more complicated lets examine further 
misses <- data.combined[which(data.combined$title == "Miss."),]
summary(misses$Age)
```
```{r}
ggplot(misses[misses$Survived != "None",], aes(x = Age, fill = Survived)) +
         facet_wrap(~Pclass)+
         geom_bar(stat = "count")+
  ggtitle("Age for 'Miss.' by Pclass")
         xlab("Age")+
         ylab("Total Count")+
         labs(fill = "Survived")

```
```{r}
misses.alone <- misses[which(misses$SibSp == 0 & misses$Parch == 0),]
summary(misses.alone$Age)
length(which(misses.alone$Age <= 14.5))
```
```{r}
summary(data.combined$SibSp)
```
```{r}
#can we treat as a factor?
length(unique(data.combined$SibSp))
```
```{r}
#we beleive title is predective. visualize survival rates by sibsp,pclass and title
ggplot(data.combined[1:891,], aes(x = SibSp, fill = Survived)) +
  geom_bar(stat = "count")+
         facet_wrap(~Pclass + title)+
         ggtitle("Pclass, Title")+
         xlab("SibSp")+
         ylab("Total Count")+
  ylim(0,300)+
         labs(fill = "Survived")
```
```{r}
#treat the prach variable as a factor and vizualise
data.combined$Parch <- as.factor(data.combined$Parch)
ggplot(data.combined[1:891,], aes(x = Parch, fill = Survived)) +
  geom_bar(stat = "count")+
         facet_wrap(~Pclass + title)+
         ggtitle("Pclass, Title")+
         xlab("Parch")+
         ylab("Total Count")+
  ylim(0,300)+
         labs(fill = "Survived")
```
```{r}
#lets try some feature engineering what about creating a family six=ze feature 
temp.SibSp <- c(train$SibSp, test$SibSp)
temp.Parch <- c(train$Parch, test$Parch)
data.combined$family.size <- as.factor(temp.SibSp + temp.Parch +1)
```
```{r}
#Visualize it to see if it is predective
ggplot(data.combined[1:891,], aes(x = family.size,fill = Survived)) +
    geom_bar(stat = "count")+
         facet_wrap(~Pclass + title)+
         ggtitle("Pclass, Title")+
         xlab("family.Size")+
         ylab("Total Count")+
  ylim(0,300)+
         labs(fill = "Survived")
  
```
```{r}
# Take a look at the ticket variable
str(data.combined$Ticket)
```


```{r}
```
```{r}
# Based on the huge number of levels ticket really isn't a factor variable it is a string. 
# Convert it and display first 20
data.combined$ticket <- as.character(data.combined$Ticket)
data.combined$ticket[1:20]

```

```{r}
# There's no immediately apparent structure in the data, let's see if we can find some.
# We'll start with taking a look at just the first char for each
ticket.first.char <- ifelse(data.combined$ticket == "", " ", substr(data.combined$ticket, 1, 1))
unique(ticket.first.char)

```
```{r}
# OK, we can make a factor for analysis purposes and visualize
data.combined$ticket.first.char <- as.factor(ticket.first.char)
```
```{r}
# First, a high-level plot of the data
ggplot(data.combined[1:891,], aes(x = ticket.first.char, fill = Survived)) +
  geom_bar() +
  ggtitle("Survivability by ticket.first.char") +
  xlab("ticket.first.char") +
  ylab("Total Count") +
  ylim(0,350) +
  labs(fill = "Survived")
```
```{r}
# Ticket seems like it might be predictive, drill down a bit
ggplot(data.combined[1:891,], aes(x = ticket.first.char, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass) + 
  ggtitle("Pclass") +
  xlab("ticket.first.char") +
  ylab("Total Count") +
  ylim(0,300) +
  labs(fill = "Survived")
```
```{r}
# Lastly, see if we get a pattern when using combination of pclass & title
ggplot(data.combined[1:891,], aes(x = ticket.first.char, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass + title) + 
  ggtitle("Pclass, Title") +
  xlab("ticket.first.char") +
  ylab("Total Count") +
  ylim(0,200) +
  labs(fill = "Survived")

```
```{r}
# Next up - the fares Titanic passengers paid
summary(data.combined$Fare)
length(unique(data.combined$Fare))
```
```{r}
# Can't make fare a factor, treat as numeric & visualize with histogram
ggplot(data.combined, aes(x = Fare)) +
  geom_bar() +
  ggtitle("Combined Fare Distribution") +
  xlab("Fare") +
  ylab("Total Count") +
  ylim(0,200)
```
```{r}
# Let's check to see if fare has predictive power
ggplot(data.combined[1:891,], aes(x = Fare, fill = Survived)) +
  geom_bar(binwidth = 5) +
  facet_wrap(~Pclass + title) + 
  ggtitle("Pclass, Title") +
  xlab("fare") +
  ylab("Total Count") +
  ylim(0,50) + 
  labs(fill = "Survived")

```
```{r}
# Analysis of the cabin variable
str(data.combined$cabin)



```
```{r}
# Cabin really isn't a factor, make a string and the display first 100
data.combined$Cabin <- as.character(data.combined$Cabin)
data.combined$Cabin[1:100]

```
```{r}
# Replace empty cabins with a "U"
data.combined[which(data.combined$Cabin.first.char == ""), "Cabin"] <- "U"
data.combined$Cabin[1:100]

```


```{r}
# Take a look at just the first char as a factor
cabin.first.char <- as.factor(substr(data.combined$Cabin, 1, 1))
str(cabin.first.char)
levels(cabin.first.char)
```
```{r}
# Add to combined data set and plot 
data.combined$Cabin.first.char <- cabin.first.char
```
```{r}
# High level plot
ggplot(data.combined[,], aes(x = cabin.first.char, fill = Survived)) +
  geom_bar() +
  ggtitle("Survivability by cabin.first.char") +
  xlab("cabin.first.char") +
  ylab("Total Count") +
  ylim(0,750) +
  labs(fill = "Survived")

```

```{r}
# Could have some predictive power, drill in
ggplot(data.combined[,], aes(x = cabin.first.char, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass) +
  ggtitle("Survivability by cabin.first.char") +
  xlab("Pclass") +
  ylab("Total Count") +
  ylim(0,500) +
  labs(fill = "Survived")
  
```
```{r}
# Does this feature improve upon pclass + title?
ggplot(data.combined[,], aes(x = cabin.first.char, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass + title) +
  ggtitle("Pclass, Title") +
  xlab("cabin.first.char") +
  ylab("Total Count") +
  ylim(0,500) +
  labs(fill = "Survived")
```
```{r}
# What about folks with multiple cabins?
data.combined$cabin.multiple <- as.factor(ifelse(str_detect(data.combined$Cabin, " "), "Y", "N"))


```
```{r}

ggplot(data.combined[1:891,], aes(x = cabin.multiple, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass + title) +
  ggtitle("Pclass, Title") +
  xlab("cabin.multiple") +
  ylab("Total Count") +
  ylim(0,350) +
  labs(fill = "Survived")


```
```{r}
# Does survivability depend on where you got onboard the Titanic?
str(data.combined$Embarked)
levels(data.combined$Embarked)

```
```{r}
# Plot data for analysis
ggplot(data.combined[1:891,], aes(x = Embarked, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass + title) +
  ggtitle("Pclass, Title") +
  xlab("embarked") +
  ylab("Total Count") +
  ylim(0,300) +
  labs(fill = "Survived")

```

 - Exploratory Modeling
```{r}
library(randomForest)
# Train a Random Forest with the default parameters using pclass & title
rf.train.1 <- data.combined[1:891, c("Pclass", "title")]
rf.label <- as.factor(train$Survived)

set.seed(1234)
rf.1 <- randomForest(x = rf.train.1, y = rf.label, importance = TRUE, ntree = 1000)
rf.1
varImpPlot(rf.1)

```
```{r}
# Train a Random Forest using pclass, title, & sibsp
rf.train.2 <- data.combined[1:891, c("Pclass", "title", "SibSp")]
set.seed(1234)
rf.2 <- randomForest(x = rf.train.2, y = rf.label, importance = TRUE, ntree = 1000) 
rf.2
varImpPlot(rf.2)
```
```{r}
# Train a Random Forest using pclass, title, & parch
rf.train.3 <- data.combined[1:891, c("Pclass", "title", "Parch")]

set.seed(1234)
rf.3 <- randomForest(x = rf.train.3, y = rf.label, importance = TRUE, ntree = 1000)
rf.3
varImpPlot(rf.3)
```
```{r}
# Train a Random Forest using pclass, title, sibsp, parch
rf.train.4 <- data.combined[1:891, c("Pclass", "title", "SibSp", "Parch")]

set.seed(1234)
rf.4 <- randomForest(x = rf.train.4, y = rf.label, importance = TRUE, ntree = 1000)
rf.4
varImpPlot(rf.4)
```
```{r}
# Train a Random Forest using pclass, title, & family.size
rf.train.5 <- data.combined[1:891, c("Pclass", "title", "family.size")]

set.seed(1234)
rf.5 <- randomForest(x = rf.train.5, y = rf.label, importance = TRUE, ntree = 1000)
rf.5
varImpPlot(rf.5)

```
```{r}
# Train a Random Forest using pclass, title, sibsp, & family.size
rf.train.6 <- data.combined[1:891, c("Pclass", "title", "SibSp", "family.size")]

set.seed(1234)
rf.6 <- randomForest(x = rf.train.6, y = rf.label, importance = TRUE, ntree = 1000)
rf.6
varImpPlot(rf.6)

```
```{r}
# Train a Random Forest using pclass, title, parch, & family.size
rf.train.7 <- data.combined[1:891, c("Pclass", "title", "Parch", "family.size")]

set.seed(1234)
rf.7 <- randomForest(x = rf.train.7, y = rf.label, importance = TRUE, ntree = 1000)
rf.7
varImpPlot(rf.7)

```
```{r}
#cross validation 
# Subset our test records and features
test.submit.df <- data.combined[892:1309, c("Pclass", "title", "family.size")]

# Make predictions
rf.5.preds <- predict(rf.5, test.submit.df)
table(rf.5.preds)

```
```{r}
# Write out a CSV file for submission to Kaggle
submit.df <- data.frame(PassengerId = rep(892:1309), Survived = rf.5.preds)
write.csv(submit.df, file = "RF_SUB_20190730_1.csv", row.names = FALSE)
```
```{r}
# more accurate estimates
library(caret)
library(doSNOW)
set.seed(2348)
cv.10.folds <- createMultiFolds(rf.label, k = 10, times = 10)

# Check stratification
table(rf.label)
342 / 549

table(rf.label[cv.10.folds[[33]]])
308 / 494


```
```{r}
# Set up caret's trainControl object per above.
ctrl.1 <- trainControl(method = "repeatedcv", number = 10, repeats = 10,
                       index = cv.10.folds)
# Set up doSNOW package for multi-core training. This is helpful as we're going
# to be training a lot of trees.
# NOTE - This works on Windows and Mac, unlike doMC
cl <- makeCluster(6, type = "SOCK")
registerDoSNOW(cl)

```
```{r}
# Set seed for reproducibility and train
install.packages('e1071', dependencies=TRUE)
set.seed(34324)
rf.5.cv.1 <- train(x = rf.train.5, y = rf.label, method = "rf", tuneLength = 3,
                   ntree = 1000, trControl = ctrl.1)
```
```{r}
#Shutdown cluster
stopCluster(cl)


```
```{r}
set.seed(5983)
cv.5.folds <- createMultiFolds(rf.label, k = 5, times = 10)

ctrl.2 <- trainControl(method = "repeatedcv", number = 5, repeats = 10,
                       index = cv.5.folds)

cl <- makeCluster(6, type = "SOCK")
registerDoSNOW(cl)

set.seed(89472)
rf.5.cv.2 <- train(x = rf.train.5, y = rf.label, method = "rf", tuneLength = 3,
                   ntree = 1000, trControl = ctrl.2)

#Shutdown cluster
stopCluster(cl)

```
```{r}
# 5-fold CV isn't better. Move to 3-fold CV repeated 10 times. 
set.seed(37596)
cv.3.folds <- createMultiFolds(rf.label, k = 3, times = 10)

ctrl.3 <- trainControl(method = "repeatedcv", number = 3, repeats = 10,
                       index = cv.3.folds)

cl <- makeCluster(6, type = "SOCK")
registerDoSNOW(cl)

set.seed(94622)
rf.5.cv.3 <- train(x = rf.train.5, y = rf.label, method = "rf", tuneLength = 3,
                   ntree = 64, trControl = ctrl.3)

#Shutdown cluster
stopCluster(cl)

# Check out results
rf.5.cv.3


```
- Exploratory Modeling 2
```{r}
# Install and load packages
#install.packages("rpart")
#install.packages("rpart.plot")
library(rpart)
library(rpart.plot)

# Per video #5, let's use 3-fold CV repeated 10 times 

# Create utility function
rpart.cv <- function(seed, training, labels, ctrl) {
  cl <- makeCluster(6, type = "SOCK")
  registerDoSNOW(cl)
  
  set.seed(seed)
  # Leverage formula interface for training
  rpart.cv <- train(x = training, y = labels, method = "rpart", tuneLength = 30, 
                    trControl = ctrl)
  
  #Shutdown cluster
  stopCluster(cl)
  
  return (rpart.cv)
}

```
```{r}
# Grab features
features <- c("Pclass", "title", "family.size")
rpart.train.1 <- data.combined[1:891, features]

# Run CV and check out results
rpart.1.cv.1 <- rpart.cv(94622, rpart.train.1, rf.label, ctrl.3)
rpart.1.cv.1

# Plot
prp(rpart.1.cv.1$finalModel, type = 0, extra = 1, under = TRUE)


```
```{r}
# The plot bring out some interesting lines of investigation. Namely:
#      1 - Titles of "Mr." and "Other" are predicted to perish at an 
#          overall accuracy rate of 83.2 %.
#      2 - Titles of "Master.", "Miss.", & "Mrs." in 1st & 2nd class
#          are predicted to survive at an overall accuracy rate of 94.9%.
#      3 - Titles of "Master.", "Miss.", & "Mrs." in 3rd class with 
#          family sizes equal to 5, 6, 8, & 11 are predicted to perish
#          with 100% accuracy.
#      4 - Titles of "Master.", "Miss.", & "Mrs." in 3rd class with 
#          family sizes not equal to 5, 6, 8, or 11 are predicted to 
#          survive with 59.6% accuracy.

# Both rpart and rf confirm that title is important, let's investigate further
table(data.combined$title)

```
```{r}
# Parse out last name and title
data.combined[1:25, "Name"]

library(stringr)
name.splits <- str_split(data.combined$Name, ",")
name.splits[1]
last.names <- sapply(name.splits, "[", 1)
last.names[1:10]

```
```{r}
# Add last names to dataframe in case we find it useful later
data.combined$last.name <- last.names
```
```{r}
# Now for titles
name.splits <- str_split(sapply(name.splits, "[", 2), " ")
titles <- sapply(name.splits, "[", 2)
unique(titles)
```
```{r}
# What's up with a title of 'the'?
data.combined[which(titles == "the"),]

# Re-map titles to be more exact
titles[titles %in% c("Dona.", "the")] <- "Lady."
titles[titles %in% c("Ms.", "Mlle.")] <- "Miss."
titles[titles == "Mme."] <- "Mrs."
titles[titles %in% c("Jonkheer.", "Don.")] <- "Sir."
titles[titles %in% c("Col.", "Capt.", "Major.")] <- "Officer"
table(titles)

```
```{r}
# Make title a factor
data.combined$new.title <- as.factor(titles)
```
```{r}
# Visualize new version of title
ggplot(data.combined[1:891,], aes(x = new.title, fill = Survived)) +
  geom_bar() +
  facet_wrap(~ Pclass) + 
  ggtitle("Surival Rates for new.title by pclass")

```
```{r}
# Collapse titles based on visual analysis
indexes <- which(data.combined$new.title == "Lady."|data.combined$new.title == "Dr.")
data.combined$new.title[indexes] <- "Mrs."

```
```{r}
indexes <- which(
            data.combined$new.title == "Rev." |
                 data.combined$new.title == "Sir." |
                 data.combined$new.title == "Officer")
data.combined$new.title[indexes] <- "Mr."
```
```{r}
# Visualize 
ggplot(data.combined[1:891,], aes(x = new.title, fill = Survived)) +
  geom_bar() +
  facet_wrap(~ Pclass) +
  ggtitle("Surival Rates for Collapsed new.title by pclass")

```
```{r}
# Grab features
features <- c("Pclass", "new.title", "family.size")
rpart.train.2 <- data.combined[1:891, features]

# Run CV and check out results
rpart.2.cv.1 <- rpart.cv(94622, rpart.train.2, rf.label, ctrl.3)
rpart.2.cv.1


```
```{r}
# Plot
prp(rpart.2.cv.1$finalModel, type = 0, extra = 1, under = TRUE)
```
```{r}
# Dive in on 1st class "Mr."
indexes.first.mr <- which(data.combined$new.title == "Mr." & data.combined$Pclass == "1")
first.mr.df <- data.combined[indexes.first.mr, ]
summary(first.mr.df)
```
```{r}
# One female?
first.mr.df[first.mr.df$Sex == "female",]

```
```{r}
# Update new.title feature
indexes <- which(data.combined$new.title == "Mr." & 
                 data.combined$sex == "female")
data.combined$new.title[indexes] <- "Mrs."

# Any other gender slip-ups?
length(which(data.combined$sex == "female" & 
             (data.combined$new.title == "Master." |
              data.combined$new.title == "Mr.")))

```
```{r}
# Refresh data frame
indexes.first.mr <- which(data.combined$new.title == "Mr." & data.combined$Pclass == "1")
first.mr.df <- data.combined[indexes.first.mr, ]

```
```{r}
# Take a look at some of the high fares
indexes <- which(data.combined$ticket == "PC 17755" |
                 data.combined$ticket == "PC 17611" |
                 data.combined$ticket == "113760")
View(data.combined[indexes,])
```
```{r}
# Visualize survival rates for 1st class "Mr." by fare
ggplot(first.mr.df, aes(x = Fare, fill = Survived)) +
  geom_density(alpha = 0.5) +
  ggtitle("1st Class 'Mr.' Survival Rates by fare")

```
```{r}

# Engineer features based on all the passengers with the same ticket
ticket.party.size <- rep(0, nrow(data.combined))
avg.Fare <- rep(0.0, nrow(data.combined))
tickets <- unique(data.combined$ticket)

for (i in 1:length(tickets)) {
  current.ticket <- tickets[i]
  party.indexes <- which(data.combined$ticket == current.ticket)
  current.avg.Fare <- data.combined[party.indexes[1], "Fare"] / length(party.indexes)
  
  for (k in 1:length(party.indexes)) {
    ticket.party.size[party.indexes[k]] <- length(party.indexes)
    avg.Fare[party.indexes[k]] <- current.avg.Fare
  }
}
```
```{r}
data.combined$ticket.party.size <- ticket.party.size
data.combined$avg.Fare <- avg.Fare

# Refresh 1st class "Mr." dataframe
first.mr.df <- data.combined[indexes.first.mr, ]
summary(first.mr.df)

```
```{r}
# Visualize new features
ggplot(first.mr.df[first.mr.df$Survived != "None",], aes(x = ticket.party.size, fill = Survived)) +
  geom_density(alpha = 0.5) +
  ggtitle("Survival Rates 1st Class 'Mr.' by ticket.party.size")
```
```{r}
ggplot(first.mr.df[first.mr.df$Survived != "None",], aes(x = avg.Fare, fill = Survived)) +
  geom_density(alpha = 0.5) +
  ggtitle("Survival Rates 1st Class 'Mr.' by avg.fare")
```
```{r}
# Hypothesis - ticket.party.size is highly correlated with avg.fare
summary(data.combined$avg.Fare)
```
```{r}
# One missing value, take a look
data.combined[is.na(data.combined$avg.Fare), ]

```
```{r}
# Get records for similar passengers and summarize avg.fares
indexes <- with(data.combined, which(Pclass == "3" & title == "Mr." & family.size == 1 &
                                     ticket != "3701"))
similar.na.passengers <- data.combined[indexes,]
summary(similar.na.passengers$avg.Fare)

```
```{r}
# Use median since close to mean and a little higher than mean
data.combined[is.na(avg.Fare), "avg.Fare"] <- 7.840

```
```{r}
# Leverage caret's preProcess function to normalize data
preproc.data.combined <- data.combined[, c("ticket.party.size", "avg.Fare")]
preProc <- preProcess(preproc.data.combined, method = c("center", "scale"))

postproc.data.combined <- predict(preProc, preproc.data.combined)

```
```{r}
# Hypothesis refuted for all data
cor(postproc.data.combined$ticket.party.size, postproc.data.combined$avg.Fare)

```
```{r}
# How about for just 1st class all-up?
indexes <- which(data.combined$Pclass == "1")
cor(postproc.data.combined$ticket.party.size[indexes], 
    postproc.data.combined$avg.Fare[indexes])
# Hypothesis refuted again

```
```{r}
# OK, let's see if our feature engineering has made any difference
features <- c("Pclass", "new.title", "family.size", "ticket.party.size", "avg.Fare")
rpart.train.3 <- data.combined[1:891, features]

```
```{r}
# Run CV and check out results
rpart.3.cv.1 <- rpart.cv(94622, rpart.train.3, rf.label, ctrl.3)
rpart.3.cv.1

```
```{r}
# Plot
prp(rpart.3.cv.1$finalModel, type = 0, extra = 1, under = TRUE)


```
 - Submitting, scoring, and some analysis.
```{r}

#
# Rpart scores 0.80383
#
# Subset our test records and features
test.submit.df <- data.combined[892:1309, features]

```
```{r}
# Make predictions
rpart.3.preds <- predict(rpart.3.cv.1$finalModel, test.submit.df, type = "class")
table(rpart.3.preds)
```
```{r}
# Write out a CSV file for submission to Kaggle
submit.df <- data.frame(PassengerId = rep(892:1309), Survived = rpart.3.preds)
```


```{r}
write.csv(submit.df, file = "RPART_SUB_20190731_1.csv", row.names = FALSE)

```
```{r}
#
# Random forest scores 0.80861
#
features <- c("Pclass", "new.title", "ticket.party.size", "avg.Fare")
rf.train.temp <- data.combined[1:891, features]
```
```{r}
library(randomForest)
set.seed(1234)
rf.temp <- randomForest(x = rf.train.temp, y = rf.label, ntree = 1000)
rf.temp
```
```{r}
test.submit.df <- data.combined[892:1309, features]

```
```{r}
# Make predictions
rf.preds <- predict(rf.temp, test.submit.df)
table(rf.preds)
```
```{r}
# Write out a CSV file for submission to Kaggle
submit.df <- data.frame(PassengerId = rep(892:1309), Survived = rf.preds)

write.csv(submit.df, file = "RF_SUB_20190731_1.csv", row.names = FALSE)

```
```{r}
#
# If we want to improve our model, a good place to start is focusing on where it
# gets things wrong!
#


# First, let's explore our collection of features using mutual information to
# gain some additional insight. Our intuition is that the plot of our tree
# should align well to the definition of mutual information.
#install.packages("infotheo")
library(infotheo)

mutinformation(rf.label, data.combined$Pclass[1:891])
mutinformation(rf.label, data.combined$Sex[1:891])
mutinformation(rf.label, data.combined$SibSp[1:891])
mutinformation(rf.label, data.combined$Parch[1:891])
mutinformation(rf.label, discretize(data.combined$Fare[1:891]))
mutinformation(rf.label, data.combined$Embarked[1:891])
mutinformation(rf.label, data.combined$title[1:891])
mutinformation(rf.label, data.combined$family.size[1:891])
mutinformation(rf.label, data.combined$ticket.first.char[1:891])
mutinformation(rf.label, data.combined$cabin.multiple[1:891])
mutinformation(rf.label, data.combined$new.title[1:891])
mutinformation(rf.label, data.combined$ticket.party.size[1:891])
mutinformation(rf.label, discretize(data.combined$avg.Fare[1:891]))

```
```{r}
# OK, now let's leverage the tsne algorithm to create a 2-D representation of our data 
# suitable for visualization starting with folks our model gets right very often - folks
# with titles other than 'Mr."
#install.packages("Rtsne")
library(Rtsne)
most.correct <- data.combined[data.combined$new.title != "Mr.",]
indexes <- which(most.correct$Survived != "None")

```
```{r}
# NOTE - Bug fix for original version. Rtsne needs a seed to ensure consistent
# output between runs.
set.seed(984357)
tsne.1 <- Rtsne(most.correct[, features], check_duplicates = FALSE)
ggplot(NULL, aes(x = tsne.1$Y[indexes, 1], y = tsne.1$Y[indexes, 2], 
                 color = most.correct$Survived[indexes])) +
  geom_point() +
  labs(color = "Survived") +
  ggtitle("tsne 2D Visualization of Features for new.title Other than 'Mr.'")

```
```{r}
# To get a baseline, let's use conditional mutual information on the tsne X and
# Y features for females and boys in 1st and 2nd class. The intuition here is that
# the combination of these features should be higher than any individual feature
# we looked at above.
condinformation(most.correct$Survived[indexes], discretize(tsne.1$Y[indexes,]))
```
```{r}
# As one more comparison, we can leverage conditional mutual information using
# the top two features used in our tree plot - new.title and pclass
condinformation(rf.label, data.combined[1:891, c("new.title", "Pclass")])

```
```{r}
# OK, now let's take a look at adult males since our model has the biggest 
# potential upside for improving (i.e., the tree predicts incorrectly for 86
# adult males). Let's visualize with tsne.
misters <- data.combined[data.combined$new.title == "Mr.",]
indexes <- which(misters$Survived != "None")

```
```{r}
tsne.2 <- Rtsne(misters[, features], check_duplicates = FALSE)
ggplot(NULL, aes(x = tsne.2$Y[indexes, 1], y = tsne.2$Y[indexes, 2], 
                 color = misters$Survived[indexes])) +
  geom_point() +
  labs(color = "Survived") +
  ggtitle("tsne 2D Visualization of Features for new.title of 'Mr.'")


```
```{r}
 # Now conditional mutual information for tsne features for adult males
condinformation(misters$Survived[indexes], discretize(tsne.2$Y[indexes,]))

```
```{r}
#
# Idea - How about creating tsne featues for all of the training data and
# using them in our model?
#
tsne.3 <- Rtsne(data.combined[, features], check_duplicates = FALSE)
ggplot(NULL, aes(x = tsne.3$Y[1:891, 1], y = tsne.3$Y[1:891, 2], 
                 color = data.combined$Survived[1:891])) +
  geom_point() +
  labs(color = "Survived") +
  ggtitle("tsne 2D Visualization of Features for all Training Data")

```
```{r}
# Now conditional mutual information for tsne features for all training
condinformation(data.combined$Survived[1:891], discretize(tsne.3$Y[1:891,]))

# Add the tsne features to our data frame for use in model building
data.combined$tsne.x <- tsne.3$Y[,1]
data.combined$tsne.y <- tsne.3$Y[,2]

```




 







