1 Preparation

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(rpart)
library(randomForest)

## Warning: package 'randomForest' was built under R version 3.4.3

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(e1071)

## Warning: package 'e1071' was built under R version 3.4.3

library(xgboost)

## Warning: package 'xgboost' was built under R version 3.4.4

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice

library(caret)

## Warning: package 'caret' was built under R version 3.4.3

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

library(stringr)

## Warning: package 'stringr' was built under R version 3.4.4

library(doSNOW)

## Warning: package 'doSNOW' was built under R version 3.4.3

## Loading required package: foreach

## Warning: package 'foreach' was built under R version 3.4.3

## Loading required package: iterators

## Warning: package 'iterators' was built under R version 3.4.3

## Loading required package: snow

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 3.4.4

The data is read with r and the train and test sets are combined and variables are given the appropriate data type. Next, we look for missing values. Tables are helpful as a backup. Although there are no NA in embarked, a couple are only spaces so this is a possible issue.

train <- read.csv("train.csv")
test  <- read.csv("test.csv")

# test data does not have 'Survived' variable; we will create one so we
# can combine the two data sets into one
test$Survived <- NA
titanic.full  <- rbind(train, test)
str(titanic.full)

## 'data.frame':    1309 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...

# make PClass and Survived a factor
titanic.full$Pclass <- as.factor(titanic.full$Pclass)
titanic.full$Survived <- as.factor(titanic.full$Survived)

# check for Missing Values
colSums(is.na(titanic.full))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0         418           0           0           0         263 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           1           0           0

table(titanic.full[, 'Embarked'])

## 
##       C   Q   S 
##   2 270 123 914

2 Advanced Imputation

2.1 Embarked

The two cases where there are no values will need to be explored.

titanic.full[titanic.full$Embarked == "",]

Now that we know ID 62 and 830 are missing, we can take a look at a ggplot to see which categorical assignment makes the most sense. Both passengers paid 80 for their fare and that is about the median value for those boarding from C as shown in the ggplot below. The red dashed line represents the 80 mark.

We can then assign both these values to C.

embark.na.rm <- titanic.full[1:891,] %>%
  filter(PassengerId != 62 & PassengerId != 830)

ggplot(embark.na.rm, aes(x = Embarked, y = Fare, fill = factor(Pclass))) +
  geom_boxplot() +
  geom_hline(aes(yintercept=80), 
             colour='red', linetype='dashed', lwd=2)

titanic.full$Embarked[c(62, 830)] <- 'C'
table(titanic.full[, 'Embarked'])

## 
##       C   Q   S 
##   0 272 123 914

2.2 Age

A linear model is created to fill in the missing 263 ages in the data set.

table(is.na(titanic.full$Age))

## 
## FALSE  TRUE 
##  1046   263

median(titanic.full$Age, na.rm = T)

## [1] 28

boxplot(titanic.full$Age)

boxplot.stats(titanic.full$Age)

## $stats
## [1]  0.17 21.00 28.00 39.00 66.00
## 
## $n
## [1] 1046
## 
## $conf
## [1] 27.12065 28.87935
## 
## $out
## [1] 71.0 70.5 71.0 80.0 70.0 70.0 74.0 67.0 76.0

top.qrt.age <- boxplot.stats(titanic.full$Age)$stats[5]
age.outlier.filter <- titanic.full$Age < top.qrt.age
titanic.full[age.outlier.filter,] #all of the rows that we want to include in the lm

age.equation = "Age ~ Pclass + Sex + Fare + SibSp + Parch + Embarked"

age.model <- lm(
  formula = age.equation,
  data = titanic.full[age.outlier.filter, ]
)

#shows all rows of observations with NA in age
age.row <- titanic.full[is.na(titanic.full$Age),
                        c("Pclass", "Sex", "Fare", "SibSp", "Parch", "Embarked")
                        ]

table(is.na(titanic.full$Age))

## 
## FALSE  TRUE 
##  1046   263

age.predictions <- predict(age.model, newdata = age.row)

titanic.full[is.na(titanic.full$Age), "Age" ] <- age.predictions

table(is.na(titanic.full$Age))

## 
## FALSE 
##  1309

Another approach to imputation of Age:

# create a dataset for rows that include a value for age

#data_Age_NA.rm <- titanic.full[!is.na(titanic.full$Age), ]

#Imputing Age with an rpart model. 

# Creating a decision tree for predicting the missing values of the Age variable

#age_model <- rpart(Age ~ SibSp + Parch + Pclass, data_Age_NA.rm)
#titanic.full$Age[is.na(titanic.full$Age)] <- predict(age_model, newdata = titanic.full[is.na(titanic.full$Age), ])

2.3 Fare

A linear model is created to fill in the missing Fare value.

table(is.na(titanic.full$Fare))

## 
## FALSE  TRUE 
##  1308     1

median(titanic.full$Fare, na.rm = TRUE)

## [1] 14.4542

boxplot(titanic.full$Fare)

top.qrt.fare <- boxplot.stats(titanic.full$Fare)$stats[5]
fare.outlier.filter <- titanic.full$Fare < top.qrt.fare
nonoutlier.fares.data <- titanic.full[fare.outlier.filter,]

fare.equation = "Fare ~ Pclass + Sex + Age + SibSp + Parch + Embarked"
Fare.model <- lm(
  formula = fare.equation,
  data = nonoutlier.fares.data
)
Fare.rows <- titanic.full[is.na(titanic.full$Fare), c("Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked")]

Fare.predictions <- predict(Fare.model, Fare.rows)

titanic.full[is.na(titanic.full$Fare), "Fare"] <- Fare.predictions
table(is.na(titanic.full$Fare))

## 
## FALSE 
##  1309

3 Feature Engineering

Now that the data is cleaned feature engineering can be built to use later on in modeling.

3.1 Women and Children

Titanic’s Certificate of Clearance delineated children from adults at 12. So, 12 years and older were considered adults. However, after running exploratory models with different ages, 14 is a better cutoff. Therefore, a dichotomous variable for children will likely be helpful. This also helps classify women as a variable as well to explore.

titanic.full$Child <- 0
titanic.full[titanic.full$Age < 14, "Child"] <- 1
titanic.full$Child <- as.factor(titanic.full$Child)

titanic.full$Woman <- 0
titanic.full[titanic.full$Age > 14 & titanic.full$Sex == "female", "Woman"] <- 1
titanic.full$Woman <- as.factor(titanic.full$Woman)

3.2 Titles

Extracting titles with regular expressions helps create title as a categorical variable worth exploring. Then the ggplot reveals four main titles (Master, Miss, Mr, and Mrs.)

titanic.full <- titanic.full %>% 
  mutate(Title = sub(".*,(.+\\.).*", "\\1", Name))

#The mutate command created white space which needed to be removed for later.
Title.ws.rm <- trimws(titanic.full$Title, which = "left")
titanic.full$Title <- Title.ws.rm

ggplot(titanic.full, aes(Title))+
  geom_bar()

It will be worth taking a closer look at the abnormal titles to determine if there is any meaning. Perhaps some titles are related to royalty and could have a better chance of survival. The variable SimpTitle inserted the main four titles for the large majority of observations, however for the abnormal titles, it is left blank. A new data set can be extracted by carving out all those observations with one of the four recognized main titles, leaving us with only the abnormal observations to examine.

The first ggplot shows simple survival results related to each abnormal total. I intentionally left NA’s as they are all from the test set. This allows us to see if each there will be any relevance to survival prediction. Two things to note: all four female titles survived, and all six males who were a Reverend perished. Given the prevalent odds that most males perished and most females survived this might just be noise. I could imagine females having an easier time surviving when they are not married and have no one else to put before them. The Reverends might have some logical explanation. Perhaps their morals caused them to sacrifice themselves for others. Or, perhaps they were all in the same location on the boat. There is only two Reverends to be predicted, but it might be worth testing the model on.

titanic.full <- titanic.full %>%
  mutate(SimpTitle = sub(".*?(Mr|Miss|Mrs|Master|$).*", "\\1", Name))

weirdos <- titanic.full[titanic.full$SimpTitle == "", c("PassengerId", "Survived", "Pclass", "Name","Sex", "Title","SimpTitle")]

ggplot(weirdos, aes(Title, fill = Survived))+
  geom_bar()

ggplot(weirdos, aes(Sex, fill = Survived))+
  geom_bar()

ggplot(weirdos, aes(Pclass, fill = Survived))+
  geom_bar()

This command will populate most of the values.

titanic.full <- titanic.full %>%
  mutate(SimpTitle = sub(".*?(Mr|Miss|Mrs|Master|$).*", "\\1", Name))

Next, we need to reassign the other titles to fit into one of the existing categories. For males, a vector grabbing their id numbers. Then it is passed in the second command to assign all those as males in SimpTitle. Since all the females left traveled alone, I will assume they are not married and classify them as “Miss”

m.weirdo_vec <- as.matrix(weirdos %>%
  filter(Sex == "male") %>%
  select(PassengerId))

titanic.full[titanic.full$PassengerId %in% m.weirdo_vec, "SimpTitle"] <- 'Mr' 

miss.weirdo_vec <- as.matrix(weirdos %>%
  filter(Sex == "female", 
         Title %in% c("Ms.","Mlle.","Mme.","Dona.", "the Countess.", "Dr.")) %>%
  select(PassengerId))

titanic.full[titanic.full$PassengerId %in% miss.weirdo_vec, "SimpTitle"] <- "Miss"

ggplot(titanic.full[1:891,], aes(SimpTitle, fill = Survived))+
  geom_bar()

3.3 Mother

Another venue to explore has to do with mothers. Did mothers have an advantage or disadvantage for survival? The way to figure this out is to hone in on women who are of the age of possible mothers which I estimated pretty arbitrarily with those who are not mothers and compare survival rates.

By the looks of it, if anything, being a mother is a bit of a detriment (especially in first class) but not by much. I still might keep it around to throw in the model just in case it adds something.

The ggplot reveals a slight detriment to being a mother. This makes some sense as they would put their kids on the boats first. It will be added to the model, although it might be more of a moderating variable related to FamSize.

titanic.full$Mother <- NA
for (i in 1:nrow(titanic.full)) {
  if (titanic.full$Sex[i] == "female" & titanic.full$Age[i] > 18 & titanic.full$Parch[i] > 0 & titanic.full$SibSp[i] == 1) {
    titanic.full$Mother[i] <- 1
  } else {
    titanic.full$Mother[i] <- 0
  }
}
titanic.full$Mother <- as.factor(titanic.full$Mother)

AgeKnown_w <- titanic.full[1:891,] %>%
  filter(Age > 17 & Age < 55, Sex == "female")

#add label on the legend here - see ggplot book. Also, fix scale if possible...
ggplot(AgeKnown_w, aes(Mother, fill = Survived))+
  geom_bar(position = "fill")+
  facet_wrap(~Pclass)

After inspecting the title for any more trends, it might be that there were prisoners on-board. I can’t say for sure, but several men (all over 19 years old) traveled in a party of one and carried a ticket with no price. Also, all of them embarked from S. Perhaps a nearby jail was returning convicts to the United States. Why not add this to our model and see if anything comes of it.

Prisoners <- titanic.full[1:891,] %>%
  filter(Fare == 0)

Prisoner_ID <- Prisoners[[1]]

titanic.full$Prisoner <- 0
titanic.full$Prisoner <- as.integer(titanic.full$Prisoner)
titanic.full[titanic.full$PassengerId %in% Prisoner_ID, "Prisoner"] <- 1

titanic.full$Prisoner <- as.factor(titanic.full$Prisoner)

3.4 Family Size

If families are traveling together, they were likely stuck together when the iceberg hit.

#Create a new variable - FamSize

titanic.full$FamSize <- titanic.full$SibSp + titanic.full$Parch +1

After careful inspection of Ticket variable, it looks apparent that FamSize isn’t the only worthwhile grouping, PartySize might be as well. Many tickets contain more passengers that are accounted in SibSp and Parch. It’s also possible that the Fare is a total for the ticket. This means that we will need to divide Fare by PartySize. It’s worth investigating.

ticket.party.size <- rep(0, nrow(titanic.full))
avg.fare <- rep(0.0, nrow(titanic.full))
tickets <- unique(titanic.full$Ticket)

for (i in 1:length(tickets)) {
  current.ticket <- tickets[i]
  party.indexes <- which(titanic.full$Ticket == current.ticket)
  current.avg.fare <- titanic.full[party.indexes[1], "Fare"] / length(party.indexes)

  for (k in 1:length(party.indexes)) {
    ticket.party.size[party.indexes[k]] <- length(party.indexes)
    avg.fare[party.indexes[k]] <- current.avg.fare
  }
}

titanic.full$PartySize <- ticket.party.size
titanic.full$AvgFare <- avg.fare

Now, we can look at how FamSize correlates with survival rates and compare this to our new variable, PartySize. Interesting enough, in general, if you are in a Family or Party of less than five but more than one, you are likely to survive. Otherwise, the odds of survival are severely reduced. One possible advantage PartySize might have is that it seems to be taking a lot of single FamSize survivors and spreading them across the PartySize they traveled with. This could be helpful to the model for not assuming PartySize of 1 will automatically lead to perishing.

#FamSize
ggplot(titanic.full[1:891,], aes(FamSize, fill = Survived)) +
  geom_bar(position = "dodge") +
  scale_x_continuous(breaks=c(1:11)) +
  labs(x = 'Family Size')

#PartySize
ggplot(titanic.full[1:891,], aes(PartySize, fill = Survived)) +
  geom_bar(position = "dodge") +
  scale_x_continuous(breaks=c(1:11)) +
  labs(x = 'PartySize')

4 Exploratory Modeling - Random Forest

4.1 Prepping the Data

RandomForest can’t run with character variables so they will need to be coerced into factors.

titanic.full$Title <- as.factor(titanic.full$Title)
titanic.full$SimpTitle <- as.factor(titanic.full$SimpTitle)

4.2 Women and Children First

As per protocol, the most obvious rudimentary model to start with must be to predict the women and children will survive and the men wont. Let’s see what the model would look like. This will provide a good standard to assess our final model on.

The results for running a RandomForest on Women and Children alone nets a 21.44 percent error rate in prediction on the training data set. This will act as a good standard to compare exploratory models against.

rf.train.1 <- titanic.full[1:891, c("Woman", "Child")]
rf.label <- as.factor(train$Survived)

set.seed(1234)
rf.1 <- randomForest(x = rf.train.1, y = rf.label, importance = TRUE, ntree = 1000)
rf.1

## 
## Call:
##  randomForest(x = rf.train.1, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 21.44%
## Confusion matrix:
##     0   1 class.error
## 0 449 100   0.1821494
## 1  91 251   0.2660819

varImpPlot(rf.1)

4.3 Title vs. SimpTitle

Since SimpTitle was created as a means to better group titles, it will be helpful to make sure it is more predictive.

rf.train.2 starts with the the original titles before re categorizing them. SimpTitle gives slightly better accuracy than Title.

rf.train.2 <- titanic.full[1:891, c("Title", "Woman", "Child")]
rf.label <- as.factor(train$Survived)

set.seed(1234)
rf.2 <- randomForest(x = rf.train.2, y = rf.label, importance = TRUE, ntree = 1000)
rf.2

## 
## Call:
##  randomForest(x = rf.train.2, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 21.44%
## Confusion matrix:
##     0   1 class.error
## 0 446 103   0.1876138
## 1  88 254   0.2573099

#Changing Title to SimpTitle

rf.train.3 <- titanic.full[1:891, c("SimpTitle", "Woman", "Child")]
rf.label <- as.factor(train$Survived)

set.seed(1234)
rf.3 <- randomForest(x = rf.train.3, y = rf.label, importance = TRUE, ntree = 1000)
rf.3

## 
## Call:
##  randomForest(x = rf.train.3, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 21.1%
## Confusion matrix:
##     0   1 class.error
## 0 451  98   0.1785064
## 1  90 252   0.2631579

Adding Title didn’t change the overall error rate from using the standard Women and Children model. By changing Title to SimpTitle we get a very slight increase in predictability.

4.4 Focus on Class

It’s no surprise adding Pclass to the model increases predictability of the train set.

rf.train.4 <- titanic.full[1:891, c("SimpTitle", "Woman", "Child", "Pclass")]
rf.label <- as.factor(train$Survived)

set.seed(1234)
rf.4 <- randomForest(x = rf.train.4, y = rf.label, importance = TRUE, ntree = 1000)
rf.4

## 
## Call:
##  randomForest(x = rf.train.4, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 19.87%
## Confusion matrix:
##     0   1 class.error
## 0 482  67   0.1220401
## 1 110 232   0.3216374

4.5 Focus on Age

Age is largely captured within the context of women and children but it actually helps add predictability to the model.

rf.train.5 <- titanic.full[1:891, c("SimpTitle", "Woman", "Child", "Pclass", "Age")]
rf.label <- as.factor(train$Survived)

set.seed(1234)
rf.5 <- randomForest(x = rf.train.5, y = rf.label, importance = TRUE, ntree = 1000)
rf.5

## 
## Call:
##  randomForest(x = rf.train.5, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 17.85%
## Confusion matrix:
##     0   1 class.error
## 0 493  56   0.1020036
## 1 103 239   0.3011696

4.6 Fare

Adding Fare helps the model as well. Also, by taking away Woman and Child, predictability improved.

rf.train.6 <- titanic.full[1:891, c("SimpTitle", "Woman", "Child", "Pclass", "Age", "Fare")]
rf.label <- as.factor(train$Survived)

set.seed(1234)
rf.6 <- randomForest(x = rf.train.6, y = rf.label, importance = TRUE, ntree = 1000)
rf.6

## 
## Call:
##  randomForest(x = rf.train.6, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 16.61%
## Confusion matrix:
##     0   1 class.error
## 0 502  47   0.0856102
## 1 101 241   0.2953216

# Taking out Woman and Child

rf.train.6 <- titanic.full[1:891, c("SimpTitle", "Pclass", "Age", "Fare")]
rf.label <- as.factor(train$Survived)

set.seed(1234)
rf.6 <- randomForest(x = rf.train.6, y = rf.label, importance = TRUE, ntree = 1000)
rf.6

## 
## Call:
##  randomForest(x = rf.train.6, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 14.81%
## Confusion matrix:
##     0   1 class.error
## 0 507  42  0.07650273
## 1  90 252  0.26315789

4.7 Finalizing Predictors

We also explored differentiating FamSize and PartySize. PartySize provided better prediction over FamSize. AvgFare did not change the resulting OOB, so Age is used instead. Rf.8 seems to be the best model to use moving forward.

rf.train.7 <- titanic.full[1:891, c("SimpTitle","FamSize", "Pclass", "Age", "Fare")]
rf.label <- as.factor(train$Survived)

set.seed(1234)
rf.7 <- randomForest(x = rf.train.7, y = rf.label, importance = TRUE, ntree = 1000)
rf.7

## 
## Call:
##  randomForest(x = rf.train.7, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 15.49%
## Confusion matrix:
##     0   1 class.error
## 0 504  45  0.08196721
## 1  93 249  0.27192982

rf.train.8 <- titanic.full[1:891, c("SimpTitle", "Age", "PartySize", "Pclass", "Fare")]
rf.label <- as.factor(train$Survived)

set.seed(1234)
rf.8 <- randomForest(x = rf.train.8, y = rf.label, importance = TRUE, ntree = 1000)
rf.8

## 
## Call:
##  randomForest(x = rf.train.8, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 14.81%
## Confusion matrix:
##     0   1 class.error
## 0 504  45  0.08196721
## 1  87 255  0.25438596

5 Writing Submission with RandomForest

RandomForest produced better predictive results than XGBOOST, at least according to the data tested on Kaggle’s public standings.

test.submit.df <- titanic.full[892:1309, c("SimpTitle", "Age", "PartySize", "Pclass", "Fare")]
test.submit.df

rf.8.preds <- predict(rf.8, test.submit.df)
table(rf.8.preds)

## rf.8.preds
##   0   1 
## 274 144

rf.predict <- data.frame(PassengerId = rep(892:1309), Survived = rf.8.preds)

write.csv(rf.predict, file = "RF_Predict.8.csv", row.names = FALSE)

# This gave a Kaggle public score of 77.03.

6 Cross Validation

set.seed(123)
cv.10.folds <- createMultiFolds(rf.label, k = 10, times = 10)

#check stratification of folds. CARET stratifies the folds automatically, but this shows that the original proportion of survived/failed is approximately equal in one of the 100 folds. This can be done with any number of the folds (fold 33 in this example)

table(rf.label)

## rf.label
##   0   1 
## 549 342

table(rf.label[cv.10.folds[[33]]])

## 
##   0   1 
## 494 308

ctrl.1 <- trainControl(method = "repeatedcv", number = 10, repeats = 10,
                       index = cv.10.folds)

#Now to use doSNOW. BE CAREFUL with the number. 6 takes a lot of CPU power so a typo can cause computer to overload. 

cl <- makeCluster(6, type = "SOCK")
registerDoSNOW(cl)

#set seed for reproducibiity and train

set.seed(234)
rf.9.cv.1 <- train(x =rf.train.8, y = rf.label, method = "rf", tuneLength = 3,
                   ntree = 1000, trControl = ctrl.1)

stopCluster(cl)

#Check results
rf.9.cv.1

## Random Forest 
## 
## 891 samples
##   5 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 801, 802, 802, 802, 803, 801, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.8492533  0.6731414
##   3     0.8427451  0.6618798
##   5     0.8242343  0.6250134
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

With ten folds, testing 90% of the train set to predict 10% might still lead to over-fitting. So the same process if repeated with 5 folds to see if that differs significantly.

set.seed(123)
cv.5.folds <- createMultiFolds(rf.label, k = 5, times = 10)

#check stratification of folds. CARET stratifies the folds automatically, but this shows that the original proportion of survived/failed is approximately equal in one of the 100 folds. This can be done with any number of the folds (fold 33 in this example)

ctrl.2 <- trainControl(method = "repeatedcv", number = 5, repeats = 10,
                       index = cv.5.folds)

#Now to use doSNOW. BE CAREFUL with the number. 6 takes a lot of CPU power so a typo can cause computer to overload. 

cl.2 <- makeCluster(6, type = "SOCK")
registerDoSNOW(cl)

#set seed for reproducibiity and train

set.seed(234)
rf.9.cv.2 <- train(x =rf.train.8, y = rf.label, method = "rf", tuneLength = 3,
                   ntree = 1000, trControl = ctrl.2)

stopCluster(cl)

#Check results
rf.9.cv.2

## Random Forest 
## 
## 891 samples
##   5 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 714, 713, 712, 713, 712, 713, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.8459059  0.6669259
##   3     0.8333284  0.6432557
##   5     0.8194265  0.6159449
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

With an mtry of 2, training with less data didn’t greatly reduce the accuracy. This provides some evidence that the model isn’t over-fitting. We will move to a three fold next.

Experts often say the proportion of test data vs. train data can be used as a basis for matching when setting folds. So with the test set being about 1/3 of the train set, 3 folds would be recommended using this rule of thumb.

set.seed(123)
cv.3.folds <- createMultiFolds(rf.label, k = 3, times = 10)

#check stratification of folds. CARET stratifies the folds automatically, but this shows that the original proportion of survived/failed is approximately equal in one of the 100 folds. This can be done with any number of the folds (fold 33 in this example)

ctrl.3 <- trainControl(method = "repeatedcv", number = 3, repeats = 10,
                       index = cv.3.folds)

#Now to use doSNOW. BE CAREFUL with the number. 6 takes a lot of CPU power so a typo can cause computer to overload. 

cl.3 <- makeCluster(6, type = "SOCK")
registerDoSNOW(cl)

#set seed for reproducibiity and train

set.seed(234)
rf.9.cv.3 <- train(x =rf.train.8, y = rf.label, method = "rf", tuneLength = 3,
                   ntree = 64, trControl = ctrl.3)

stopCluster(cl)

#Check results
rf.9.cv.3

## Random Forest 
## 
## 891 samples
##   5 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 10 times) 
## Summary of sample sizes: 594, 594, 594, 594, 594, 594, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.8410774  0.6548115
##   3     0.8299663  0.6345779
##   5     0.8180696  0.6118515
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

7 Investigating the Trees

Looking at individual trees to investigate further feature engineering possibilities.

rpart.cv <- function(seed, training, labels, ctrl) {
  cl <- makeCluster(6, type = "SOCK")
  registerDoSNOW(cl)
  
  set.seed(seed)
  rpart.cv <- train(x = training, y = labels, method = "rpart", tuneLength = 30,
                    trControl = ctrl)
  stopCluster(cl)
  
  return(rpart.cv)
}
  
  features <- c("Pclass", "Age", "Fare", "FamSize", "Sex")
  rpart.train.1 <- titanic.full[1:891, features]
  
rpart.1.cv.1 <- rpart.cv(94622, rpart.train.1, rf.label, ctrl.3)
rpart.1.cv.1

## CART 
## 
## 891 samples
##   5 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 10 times) 
## Summary of sample sizes: 594, 594, 594, 594, 594, 594, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.00000000  0.8127946  0.5946415
##   0.01532567  0.8095398  0.5813447
##   0.03065134  0.7940516  0.5490974
##   0.04597701  0.7848485  0.5366351
##   0.06130268  0.7848485  0.5366351
##   0.07662835  0.7867565  0.5421536
##   0.09195402  0.7867565  0.5421536
##   0.10727969  0.7867565  0.5421536
##   0.12260536  0.7867565  0.5421536
##   0.13793103  0.7867565  0.5421536
##   0.15325670  0.7867565  0.5421536
##   0.16858238  0.7867565  0.5421536
##   0.18390805  0.7867565  0.5421536
##   0.19923372  0.7867565  0.5421536
##   0.21455939  0.7867565  0.5421536
##   0.22988506  0.7867565  0.5421536
##   0.24521073  0.7867565  0.5421536
##   0.26053640  0.7867565  0.5421536
##   0.27586207  0.7867565  0.5421536
##   0.29118774  0.7867565  0.5421536
##   0.30651341  0.7867565  0.5421536
##   0.32183908  0.7867565  0.5421536
##   0.33716475  0.7867565  0.5421536
##   0.35249042  0.7867565  0.5421536
##   0.36781609  0.7867565  0.5421536
##   0.38314176  0.7793490  0.5204009
##   0.39846743  0.7721661  0.4992953
##   0.41379310  0.7721661  0.4992953
##   0.42911877  0.7285073  0.3645341
##   0.44444444  0.6811448  0.2159991
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.

prp(rpart.1.cv.1$finalModel, type = 0, extra = 1, under = TRUE)

The abundant combination of age parameters seems to indicate over fitting in the model. This explains the low 77.03 percent Kaggle score this earned. A more simple model will be needed. Let’s add SimpTitle back.

test.submit.df <- titanic.full[892:1309, c("SimpTitle", "Pclass", "Age", "Fare", "FamSize", "Sex")]
test.submit.df

rf.7.preds <- predict(rf.7, test.submit.df)
table(rf.7.preds)

## rf.7.preds
##   0   1 
## 277 141

rf.predict <- data.frame(PassengerId = rep(892:1309), Survived = rf.7.preds)

write.csv(rf.predict, file = "RF_Predict.csv", row.names = FALSE)


rpart.cv <- function(seed, training, labels, ctrl) {
  cl <- makeCluster(6, type = "SOCK")
  registerDoSNOW(cl)
  
  set.seed(seed)
  rpart.cv <- train(x = training, y = labels, method = "rpart", tuneLength = 30,
                    trControl = ctrl)
  stopCluster(cl)
  
  return(rpart.cv)
}
  
  features <- c("SimpTitle", "Pclass", "Age", "Fare", "FamSize", "Sex")
  rpart.train.1 <- titanic.full[1:891, features]
  
rpart.1.cv.1 <- rpart.cv(94622, rpart.train.1, rf.label, ctrl.3)
rpart.1.cv.1

## CART 
## 
## 891 samples
##   6 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 10 times) 
## Summary of sample sizes: 594, 594, 594, 594, 594, 594, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.00000000  0.8219978  0.6157609
##   0.01593063  0.8325477  0.6360169
##   0.03186126  0.8304153  0.6341271
##   0.04779189  0.8158249  0.6031977
##   0.06372252  0.7948373  0.5666157
##   0.07965316  0.7933782  0.5641667
##   0.09558379  0.7939394  0.5671504
##   0.11151442  0.7939394  0.5671504
##   0.12744505  0.7934905  0.5665049
##   0.14337568  0.7934905  0.5665049
##   0.15930631  0.7934905  0.5665049
##   0.17523694  0.7934905  0.5665049
##   0.19116757  0.7934905  0.5665049
##   0.20709821  0.7934905  0.5665049
##   0.22302884  0.7934905  0.5665049
##   0.23895947  0.7934905  0.5665049
##   0.25489010  0.7934905  0.5665049
##   0.27082073  0.7934905  0.5665049
##   0.28675136  0.7934905  0.5665049
##   0.30268199  0.7934905  0.5665049
##   0.31861262  0.7934905  0.5665049
##   0.33454325  0.7934905  0.5665049
##   0.35047389  0.7934905  0.5665049
##   0.36640452  0.7934905  0.5665049
##   0.38233515  0.7934905  0.5665049
##   0.39826578  0.7857464  0.5438313
##   0.41419641  0.7783389  0.5219249
##   0.43012704  0.7579125  0.4600631
##   0.44605767  0.7322110  0.3804362
##   0.46198830  0.6893378  0.2453079
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01593063.

prp(rpart.1.cv.1$finalModel, type = 0, extra = 1, under = TRUE)

This gives a more realistic tree path.

8 First Class Fit Men?

The following density graph reveals ages of about 18 - 42 for Mr. in First class have a better chance of surviving than perishing. It was worth exploring to see if there was any meaning underlying the following graph.

first.men <- titanic.full[1:891,] %>%
  filter(Pclass == 1 & Title == "Mr.")

FirstFitMen <- first.men %>%
  filter(Age > 24 & Age < 38)

table(FirstFitMen$Survived)

## 
##  0  1 
## 13 18

ggplot(first.men, aes(Age))+
  geom_density(aes(fill = Survived, alpha = .5))

titanic.full$FitMenFirst <- "NotFit"
titanic.full[titanic.full$Pclass == 1 & titanic.full$Sex == "male" & titanic.full$Age > 24 & titanic.full$Age < 38, "FitMenFirst"] <- "Fit"

titanic.full$FitMenFirst <- as.factor(titanic.full$FitMenFirst)

A dichotomous variable to represent first class males who are assumed to be “fit” due to their age is created. Now it can be added to our optimal randomForest prediction to see if it helps. Unfortunately, this still did not help the model as the OOB error estimate without FitMenFirst was 16.05.

rf.train.8 <- titanic.full[1:891, c("SimpTitle", "Pclass", "Age", "Fare", "FamSize", "Sex", "FitMenFirst")]
rf.label <- as.factor(train$Survived)

set.seed(1234)
rf.8 <- randomForest(x = rf.train.8, y = rf.label, importance = TRUE, ntree = 1000)
rf.8

## 
## Call:
##  randomForest(x = rf.train.8, y = rf.label, ntree = 1000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 16.84%
## Confusion matrix:
##     0   1 class.error
## 0 486  63   0.1147541
## 1  87 255   0.2543860

varImpPlot(rf.8)

9 Model Version 2 - XGBOOST

ONE HOT ENCODING

This process (using dummyvars) takes the categorical variables and re-codes them into multiple variables for each type of field within that variable with a binary assignment. This must be done for XGBOOST to work.

# Pick some predictor variables/features to use 
# Using the dummyVars function of the caret package for One Hot Encoding 
OHE_Titanic.full <- dummyVars(~ SimpTitle + Pclass + Sex, data = titanic.full, fullRank = FALSE)
OHE_titanic.full  <- predict(OHE_Titanic.full, newdata = titanic.full); head(OHE_titanic.full)

##   SimpTitle.Master SimpTitle.Miss SimpTitle.Mr SimpTitle.Mrs Pclass.1
## 1                0              0            1             0        0
## 2                0              0            0             1        1
## 3                0              1            0             0        0
## 4                0              0            0             1        1
## 5                0              0            1             0        0
## 6                0              0            1             0        0
##   Pclass.2 Pclass.3 Sex.female Sex.male
## 1        0        1          0        1
## 2        0        0          1        0
## 3        0        1          1        0
## 4        0        0          1        0
## 5        0        1          0        1
## 6        0        1          0        1

10 Creating the Predictor Set

# create the predictor set that you will use - needs to be a matrix for xgboost

#Predictor set is combining the columns below from titanic.full and the one hot encoded data set (data_train)
predictor_set <- cbind(titanic.full[, c('Age', 'Fare', 'PartySize')], OHE_titanic.full)

# The Predictor set is a data frame so it needs to be changed to a matrix for xgboost to work.
train_pred    <- as.matrix(predictor_set[1:891, ])
train_label   <- train[, "Survived"]; head(train_label)

## [1] 0 1 1 1 0 0

test_pred     <- predictor_set[892:1309, ]

11 Cross Validation

# Cross validation!
xgb_matrix   <- xgb.DMatrix(train_pred, label = train_label)
xgb_cv_mod_1 <- xgb.cv(data      = xgb_matrix, 
                       max_depth = 20,
                       eta       = 0.01,
                       nrounds   = 100,
                       nfold     = 10,
                       objective = 'binary:logistic',
                       metrics   = 'auc',
                       verbose   = 0)

# do more cross-validation with different parameter combinations to 
# see what happens to the AUC; also do cross-validation when adding/testing
# new features

12 Applying Model to Test Set

# After doing a cross-validation to tune the parameters, now we actually
# build the model on the full data set
xgb_mod <- xgboost(data        = xgb_matrix,
                   max_depth   = 20,
                   eta         = 0.01,
                   nrounds     = 100,
                   objective   = 'binary:logistic',
                   eval_metric = 'auc',
                   verbose     = 0)

# make predictions on the test data
xgb_matrix_test <- xgb.DMatrix(as.matrix(test_pred))
predict_probs   <- predict(xgb_mod, newdata = xgb_matrix_test)
predict_01      <- as.numeric(predict_probs > 0.5)
table(predict_01)

## predict_01
##   0   1 
## 270 148

# save out a file to upload to kaggle!
#file name is updated to reflect the current date
#this is super helpful when you are tweaking code and keeping track of multiple versions
write.csv(x         = data.frame(PassengerId = test$PassengerId,
                                 Survived    = predict_01),
          file      = paste0('XGBOOST_FIRST_TRY', 
                              Sys.Date(),  '.csv'),
          row.names = FALSE)

Titanic with randomForest and XGBOOST