setwd("~/Desktop/Titanic Dataset")
train <- read_csv("~/Desktop/Titanic Dataset/train.csv")
train<-train[,colnames(train)!="PassengerId"]
# Name feature engineering: extract each passenger's title from Name
# train$Name
F.E.Names <- matrix(data = NA, nrow = nrow(train), ncol = 1)
colnames(F.E.Names) <- "Title"
for (i in 1:nrow(train)) {
  if (grepl(".?Mr..?", train$Name[i])) {F.E.Names[i, 1] <- "Mr"}
  # "Mrs" also matches the loose "Mr" pattern above; because the checks run
  # in order, this later match overrides it
  if (grepl(".?Mrs..?", train$Name[i])) {F.E.Names[i, 1] <- "Mrs"}
  if (grepl(".?Miss..?", train$Name[i])) {F.E.Names[i, 1] <- "Miss"}
  if (grepl(".?Master..?", train$Name[i])) {F.E.Names[i, 1] <- "Master"}
  if (grepl(".?Dr..?", train$Name[i])) {F.E.Names[i, 1] <- "Dr"}
  if (grepl(".?Rev..?", train$Name[i])) {F.E.Names[i, 1] <- "Rev"}
  if (grepl(".?Don..?", train$Name[i])) {F.E.Names[i, 1] <- "Don"}
  if (grepl(".?Ms..?", train$Name[i])) {F.E.Names[i, 1] <- "Ms"}
  if (grepl(".?Mme..?", train$Name[i])) {F.E.Names[i, 1] <- "Mme"}
  if (grepl(".?Mlle..?", train$Name[i])) {F.E.Names[i, 1] <- "Mlle"}
  if (grepl(".?Col..?", train$Name[i])) {F.E.Names[i, 1] <- "Col"}
  if (grepl(".?Major..?", train$Name[i])) {F.E.Names[i, 1] <- "Major"}
  if (grepl(".?Jonkheer..?", train$Name[i])) {F.E.Names[i, 1] <- "Jonkheer"}
  if (grepl(".?Capt..?", train$Name[i])) {F.E.Names[i, 1] <- "Capt"}
  if (grepl(".?Countess..?", train$Name[i])) {F.E.Names[i, 1] <- "Countess"}
}
any(is.na(F.E.Names))  # check that every passenger received a title
## [1] FALSE
#F.E.Names
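For reference, a more compact alternative is to pull out the word before the first period in "Surname, Title. Given Names" with a single regular expression. This is only a sketch and is not equivalent to the loop above: it keeps rare titles such as "Lady" and "Sir" that the loose patterns above deliberately fold into "Mrs" and "Mr".
titles <- sub("^[^,]*,\\s*([^.]*)\\..*$", "\\1", train$Name)
# table(titles)  # inspect the extracted titles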
train <- cbind(train, F.E.Names)
train <- data.frame(train)
names(train)
## [1] "Survived" "Pclass" "Name" "Sex" "Age" "SibSp"
## [7] "Parch" "Ticket" "Fare" "Cabin" "Embarked" "Title"
# Convert the categorical columns to factors
train$Survived <- factor(train$Survived)
train$Pclass <- factor(train$Pclass)
train$Sex <- factor(train$Sex)
train$SibSp <- factor(train$SibSp)
train$Parch <- factor(train$Parch)
train$Embarked <- factor(train$Embarked)
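Before modeling, it is worth checking which columns contain missing values; rpart() can handle them through surrogate splits, but randomForest() later cannot. A quick check:
colSums(is.na(train))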
We use the entire training set for 5-fold cross-validation.
# Stratify the training set into 5 folds
set.seed(1)
folds <- createFolds(y = train$Survived, k = 5, list = FALSE)
train$fold <- folds
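Since createFolds() stratifies on the outcome, each fold should show roughly the same survival split; this can be verified directly (output omitted):
table(train$fold, train$Survived)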
train.set <- train
CV.error <- NULL
for (i in 1:5) {
  valid.data <- subset(train.set, fold == i)
  train.data <- subset(train.set, fold != i)
  treefit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked + Title,
                   data = train.data, method = "class",
                   control = rpart.control(minsplit = 1, cp = 0.004))
  tree.y <- valid.data$Survived
  tree.predy <- predict(treefit,
                        newdata = valid.data[, c("Pclass", "Sex", "Age", "SibSp",
                                                 "Fare", "Embarked", "Title")],
                        type = "class")
  ith.test.error <- mean(tree.y != tree.predy)
  # weight each fold's error by its share of the data
  CV.error <- c(CV.error, (nrow(valid.data) / nrow(train.set)) * ith.test.error)
}
sum(CV.error)  # weighted 5-fold CV error estimate
## [1] 0.1739618
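Because each fold's error is weighted by its share of the data and the folds partition the training set, the sum above is a pooled out-of-fold error rate; a quick sanity check on the weights:
sum(table(train.set$fold)) == nrow(train.set)  # folds partition the data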
# Repeat the 5-fold CV over 150 different random fold assignments to gauge
# the variability of the estimate
TREE.All <- NULL
for (j in 1:150) {
  set.seed(j)
  folds <- createFolds(y = train.set$Survived, k = 5, list = FALSE)
  train.set$fold <- folds
  CV.error <- NULL
  for (i in 1:5) {
    valid.data <- subset(train.set, fold == i)
    train.data <- subset(train.set, fold != i)
    treefit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked + Title,
                     data = train.data, method = "class",
                     control = rpart.control(minsplit = 1, cp = 0.004))
    tree.y <- valid.data$Survived
    tree.predy <- predict(treefit,
                          newdata = valid.data[, c("Pclass", "Sex", "Age", "SibSp",
                                                   "Fare", "Embarked", "Title")],
                          type = "class")
    ith.test.error <- mean(tree.y != tree.predy)
    CV.error <- c(CV.error, (nrow(valid.data) / nrow(train.set)) * ith.test.error)
  }
  TREE.All <- c(TREE.All, sum(CV.error))
}
TREE.All
## [1] 0.1739618 0.1739618 0.1750842 0.1683502 0.1806958 0.1829405 0.1829405
## [8] 0.1784512 0.1773288 0.1649832 0.1773288 0.1717172 0.1683502 0.1784512
## [15] 0.1694725 0.1773288 0.1728395 0.1649832 0.1818182 0.1638608 0.1818182
## [22] 0.1784512 0.1795735 0.1818182 0.1717172 0.1694725 0.1806958 0.1705948
## [29] 0.1705948 0.1784512 0.1784512 0.1739618 0.1784512 0.1750842 0.1795735
## [36] 0.1795735 0.1818182 0.1717172 0.1739618 0.1795735 0.1717172 0.1683502
## [43] 0.1728395 0.1705948 0.1784512 0.1728395 0.1649832 0.1885522 0.1717172
## [50] 0.1649832 0.1795735 0.1672278 0.1750842 0.1750842 0.1705948 0.1885522
## [57] 0.1863075 0.1762065 0.1728395 0.1896745 0.1604938 0.1784512 0.1728395
## [64] 0.1661055 0.1694725 0.1694725 0.1750842 0.1762065 0.1593715 0.1638608
## [71] 0.1728395 0.1728395 0.1694725 0.1683502 0.1717172 0.1739618 0.1728395
## [78] 0.1717172 0.1818182 0.1851852 0.1739618 0.1739618 0.1750842 0.1582492
## [85] 0.1672278 0.1851852 0.1582492 0.1694725 0.1750842 0.1638608 0.1717172
## [92] 0.1762065 0.1750842 0.1739618 0.1683502 0.1705948 0.1739618 0.1683502
## [99] 0.1604938 0.1649832 0.1593715 0.1694725 0.1818182 0.1661055 0.1649832
## [106] 0.1728395 0.1717172 0.1739618 0.1672278 0.1717172 0.1762065 0.1773288
## [113] 0.1750842 0.1694725 0.1773288 0.1773288 0.1784512 0.1806958 0.1717172
## [120] 0.1694725 0.1683502 0.1773288 0.1784512 0.1762065 0.1784512 0.1627385
## [127] 0.1829405 0.1784512 0.1694725 0.1784512 0.1750842 0.1784512 0.1806958
## [134] 0.1638608 0.1649832 0.1739618 0.1717172 0.1795735 0.1672278 0.1638608
## [141] 0.1604938 0.1571268 0.1582492 0.1705948 0.1784512 0.1818182 0.1795735
## [148] 0.1829405 0.1773288 0.1705948
TREE <- data.frame(Classifier = "TREE", CV.Error = TREE.All)
df <- rbind(TREE)  # rbind() is a no-op here but lets further classifiers be appended
ggplot(df, aes(x = Classifier, y = CV.Error)) +
  geom_boxplot(color = "green") +
  geom_jitter(alpha = 0.1) +
  ylab("CV Error Rate")
summary(TREE)
## Classifier CV.Error
## TREE:150 Min. :0.1571
## 1st Qu.:0.1695
## Median :0.1740
## Mean :0.1733
## 3rd Qu.:0.1785
## Max. :0.1897
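The spread across the 150 fold assignments can also be summarized numerically (output omitted):
sd(TREE.All)
range(TREE.All)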
# Fit the final tree on the full training set
treefit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked + Title,
                 data = train.set, method = "class",
                 control = rpart.control(minsplit = 1, cp = 0.004))
treefit
## n= 891
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 891 342 0 (0.61616162 0.38383838)
## 2) Title=Capt,Col,Don,Jonkheer,Mr,Rev 529 84 0 (0.84120983 0.15879017)
## 4) Fare< 26.125 393 39 0 (0.90076336 0.09923664) *
## 5) Fare>=26.125 136 45 0 (0.66911765 0.33088235)
## 10) Fare>=26.775 115 32 0 (0.72173913 0.27826087)
## 20) Pclass=2 14 0 0 (1.00000000 0.00000000) *
## 21) Pclass=1,3 101 32 0 (0.68316832 0.31683168)
## 42) Age>=49.5 20 3 0 (0.85000000 0.15000000) *
## 43) Age< 49.5 81 29 0 (0.64197531 0.35802469)
## 86) SibSp=3,4,5,8 7 0 0 (1.00000000 0.00000000) *
## 87) SibSp=0,1 74 29 0 (0.60810811 0.39189189)
## 174) Fare< 52.2771 34 9 0 (0.73529412 0.26470588) *
## 175) Fare>=52.2771 40 20 0 (0.50000000 0.50000000)
## 350) Fare>=77.00835 20 6 0 (0.70000000 0.30000000)
## 700) Fare< 86.2896 6 0 0 (1.00000000 0.00000000) *
## 701) Fare>=86.2896 14 6 0 (0.57142857 0.42857143)
## 1402) Fare>=99.9896 11 3 0 (0.72727273 0.27272727) *
## 1403) Fare< 99.9896 3 0 1 (0.00000000 1.00000000) *
## 351) Fare< 77.00835 20 6 1 (0.30000000 0.70000000) *
## 11) Fare< 26.775 21 8 1 (0.38095238 0.61904762)
## 22) Age>=53.5 5 0 0 (1.00000000 0.00000000) *
## 23) Age< 53.5 16 3 1 (0.18750000 0.81250000) *
## 3) Title=Countess,Dr,Major,Master,Miss,Mlle,Mme,Mrs,Ms 362 104 1 (0.28729282 0.71270718)
## 6) Pclass=3 173 83 0 (0.52023121 0.47976879)
## 12) Fare>=23.35 44 4 0 (0.90909091 0.09090909) *
## 13) Fare< 23.35 129 50 1 (0.38759690 0.61240310)
## 26) Age>=16.5 95 43 1 (0.45263158 0.54736842)
## 52) Fare>=7.8875 58 26 0 (0.55172414 0.44827586)
## 104) Fare< 14.8729 34 10 0 (0.70588235 0.29411765) *
## 105) Fare>=14.8729 24 8 1 (0.33333333 0.66666667)
## 210) Age< 21 2 0 0 (1.00000000 0.00000000) *
## 211) Age>=21 22 6 1 (0.27272727 0.72727273) *
## 53) Fare< 7.8875 37 11 1 (0.29729730 0.70270270)
## 106) Age>=28.25 3 0 0 (1.00000000 0.00000000) *
## 107) Age< 28.25 34 8 1 (0.23529412 0.76470588) *
## 27) Age< 16.5 34 7 1 (0.20588235 0.79411765)
## 54) SibSp=3 3 0 0 (1.00000000 0.00000000) *
## 55) SibSp=0,1,2 31 4 1 (0.12903226 0.87096774) *
## 7) Pclass=1,2 189 14 1 (0.07407407 0.92592593) *
rpart.plot(treefit, extra = 104, box.palette = "GnBu",
           branch.lty = 3, shadow.col = "gray", nn = TRUE)
plot(treefit, uniform = TRUE, margin = 0.1)
text(treefit, use.n = TRUE, cex = .55)
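With minsplit = 1 and cp = 0.004 the tree grows quite deep; rpart's built-in cross-validation table can be used to check whether a larger cp would justify pruning. A sketch:
printcp(treefit)  # cross-validated error (xerror) at each complexity value
plotcp(treefit)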
Recall that bagging is simply a special case of a random forest with m = p; with the five predictors used below, that means mtry = 5. Note that randomForest() cannot handle missing observations, which is why Age and Embarked (both of which contain missing values) are excluded from the predictor set.
set.seed(1)
# Bagging: mtry = p = 5, so all five predictors are candidates at every split
output.forest <- randomForest(Survived~Title+Pclass+Sex+SibSp+Fare, data = train, ntree=2500, importance=T, mtry=5)
print(output.forest)
##
## Call:
## randomForest(formula = Survived ~ Title + Pclass + Sex + SibSp + Fare, data = train, ntree = 2500, importance = T, mtry = 5)
## Type of random forest: classification
## Number of trees: 2500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 17.85%
## Confusion matrix:
## 0 1 class.error
## 0 475 74 0.1347905
## 1 85 257 0.2485380
plot(output.forest)
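The default plot() method draws three unlabeled error curves (OOB plus one per class); a legend can be added from the column names of the stored error rates, assuming the default matplot colors and line types:
legend("topright", legend = colnames(output.forest$err.rate), col = 1:3, lty = 1:3)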
set.seed(1)
# mtry = 1: a single randomly chosen predictor at each split
output.forest <- randomForest(Survived~Title+Pclass+Sex+SibSp+Fare, data = train, ntree=2500, importance=T, mtry=1)
print(output.forest)
##
## Call:
## randomForest(formula = Survived ~ Title + Pclass + Sex + SibSp + Fare, data = train, ntree = 2500, importance = T, mtry = 1)
## Type of random forest: classification
## Number of trees: 2500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 18.63%
## Confusion matrix:
## 0 1 class.error
## 0 478 71 0.1293260
## 1 95 247 0.2777778
plot(output.forest)
set.seed(1)
# No mtry given: the classification default is floor(sqrt(p)) = 2
output.forest <- randomForest(Survived~Title+Pclass+Sex+SibSp+Fare, data = train, ntree=2500, importance=T)
print(output.forest)
##
## Call:
## randomForest(formula = Survived ~ Title + Pclass + Sex + SibSp + Fare, data = train, ntree = 2500, importance = T)
## Type of random forest: classification
## Number of trees: 2500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 17.28%
## Confusion matrix:
## 0 1 class.error
## 0 488 61 0.1111111
## 1 93 249 0.2719298
plot(output.forest)
set.seed(1)
# mtry = 3: the lowest OOB error among the values tried here
output.forest <- randomForest(Survived~Title+Pclass+Sex+SibSp+Fare, data = train, ntree=2500, importance=T, mtry=3)
print(output.forest)
##
## Call:
## randomForest(formula = Survived ~ Title + Pclass + Sex + SibSp + Fare, data = train, ntree = 2500, importance = T, mtry = 3)
## Type of random forest: classification
## Number of trees: 2500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 16.39%
## Confusion matrix:
## 0 1 class.error
## 0 488 61 0.1111111
## 1 85 257 0.2485380
plot(output.forest)
set.seed(1)
output.forest <-randomForest(Survived~Title+Pclass+Sex+SibSp+Fare, data = train, ntree=2500, importance=T, mtry=4)
print(output.forest)
##
## Call:
## randomForest(formula = Survived ~ Title + Pclass + Sex + SibSp + Fare, data = train, ntree = 2500, importance = T, mtry = 4)
## Type of random forest: classification
## Number of trees: 2500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 17.28%
## Confusion matrix:
## 0 1 class.error
## 0 476 73 0.1329690
## 1 81 261 0.2368421
plot(output.forest)
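Every forest above was fit with importance = T, but the importance scores were never displayed. For the last fit, for example:
importance(output.forest)   # mean decrease in accuracy and in Gini per predictor
varImpPlot(output.forest)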