setwd("~/Desktop/Titanic Dataset")
train <- read_csv("~/Desktop/Titanic Dataset/train.csv")
train<-train[,colnames(train)!="PassengerId"]
# Name feature engineering: extract each passenger's title from Name
# train$Name
F.E.Names <- matrix(data = NA, nrow = nrow(train), ncol = 1)
colnames(F.E.Names) <- "Title"
for (i in 1:nrow(train)) {
  if (grepl(".?Mr..?", train$Name[i])) {F.E.Names[i, 1] <- "Mr"}
  # "Mrs" also matches the loose "Mr" pattern above; because the checks run
  # in order, this later match overrides it
  if (grepl(".?Mrs..?", train$Name[i])) {F.E.Names[i, 1] <- "Mrs"}
  if (grepl(".?Miss..?", train$Name[i])) {F.E.Names[i, 1] <- "Miss"}
  if (grepl(".?Master..?", train$Name[i])) {F.E.Names[i, 1] <- "Master"}
  if (grepl(".?Dr..?", train$Name[i])) {F.E.Names[i, 1] <- "Dr"}
  if (grepl(".?Rev..?", train$Name[i])) {F.E.Names[i, 1] <- "Rev"}
  if (grepl(".?Don..?", train$Name[i])) {F.E.Names[i, 1] <- "Don"}
  if (grepl(".?Ms..?", train$Name[i])) {F.E.Names[i, 1] <- "Ms"}
  if (grepl(".?Mme..?", train$Name[i])) {F.E.Names[i, 1] <- "Mme"}
  if (grepl(".?Mlle..?", train$Name[i])) {F.E.Names[i, 1] <- "Mlle"}
  if (grepl(".?Col..?", train$Name[i])) {F.E.Names[i, 1] <- "Col"}
  if (grepl(".?Major..?", train$Name[i])) {F.E.Names[i, 1] <- "Major"}
  if (grepl(".?Jonkheer..?", train$Name[i])) {F.E.Names[i, 1] <- "Jonkheer"}
  if (grepl(".?Capt..?", train$Name[i])) {F.E.Names[i, 1] <- "Capt"}
  if (grepl(".?Countess..?", train$Name[i])) {F.E.Names[i, 1] <- "Countess"}
}
any(is.na(F.E.Names))  # check that every passenger received a title
## [1] FALSE
#F.E.Names
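For reference, a more compact alternative is to pull out the word before the first period in "Surname, Title. Given Names" with a single regular expression. This is only a sketch and is not equivalent to the loop above: it keeps rare titles such as "Lady" and "Sir" that the loose patterns above deliberately fold into "Mrs" and "Mr".
titles <- sub("^[^,]*,\\s*([^.]*)\\..*$", "\\1", train$Name)
# table(titles)  # inspect the extracted titles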
train <- cbind(train, F.E.Names)
train <- data.frame(train)
names(train)
## [1] "Survived" "Pclass" "Name" "Sex" "Age" "SibSp"
## [7] "Parch" "Ticket" "Fare" "Cabin" "Embarked" "Title"
# Convert the categorical columns to factors
train$Survived <- factor(train$Survived)
train$Pclass <- factor(train$Pclass)
train$Sex <- factor(train$Sex)
train$SibSp <- factor(train$SibSp)
train$Parch <- factor(train$Parch)
train$Embarked <- factor(train$Embarked)
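Before modeling, it is worth checking which columns contain missing values; rpart() can handle them through surrogate splits, but randomForest() later cannot. A quick check:
colSums(is.na(train))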
We use the entire training set for 5-fold cross-validation.
# Stratify the training set into 5 folds
set.seed(1)
folds <- createFolds(y = train$Survived, k = 5, list = FALSE)
train$fold <- folds
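Since createFolds() stratifies on the outcome, each fold should show roughly the same survival split; this can be verified directly (output omitted):
table(train$fold, train$Survived)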
train.set <- train
CV.error <- NULL
for (i in 1:5) {
  valid.data <- subset(train.set, fold == i)
  train.data <- subset(train.set, fold != i)
  treefit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked + Title,
                   data = train.data, method = "class",
                   control = rpart.control(minsplit = 1, cp = 0.004))
  tree.y <- valid.data$Survived
  tree.predy <- predict(treefit,
                        newdata = valid.data[, c("Pclass", "Sex", "Age", "SibSp",
                                                 "Fare", "Embarked", "Title")],
                        type = "class")
  ith.test.error <- mean(tree.y != tree.predy)
  # weight each fold's error by its share of the data
  CV.error <- c(CV.error, (nrow(valid.data) / nrow(train.set)) * ith.test.error)
}
sum(CV.error)  # weighted 5-fold CV error estimate
## [1] 0.1739618
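Because each fold's error is weighted by its share of the data and the folds partition the training set, the sum above is a pooled out-of-fold error rate; a quick sanity check on the weights:
sum(table(train.set$fold)) == nrow(train.set)  # folds partition the data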
# Repeat the 5-fold CV over 150 different random fold assignments to gauge
# the variability of the estimate
TREE.All <- NULL
for (j in 1:150) {
  set.seed(j)
  folds <- createFolds(y = train.set$Survived, k = 5, list = FALSE)
  train.set$fold <- folds
  CV.error <- NULL
  for (i in 1:5) {
    valid.data <- subset(train.set, fold == i)
    train.data <- subset(train.set, fold != i)
    treefit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked + Title,
                     data = train.data, method = "class",
                     control = rpart.control(minsplit = 1, cp = 0.004))
    tree.y <- valid.data$Survived
    tree.predy <- predict(treefit,
                          newdata = valid.data[, c("Pclass", "Sex", "Age", "SibSp",
                                                   "Fare", "Embarked", "Title")],
                          type = "class")
    ith.test.error <- mean(tree.y != tree.predy)
    CV.error <- c(CV.error, (nrow(valid.data) / nrow(train.set)) * ith.test.error)
  }
  TREE.All <- c(TREE.All, sum(CV.error))
}
TREE.All
## [1] 0.1739618 0.1739618 0.1750842 0.1683502 0.1806958 0.1829405 0.1829405
## [8] 0.1784512 0.1773288 0.1649832 0.1773288 0.1717172 0.1683502 0.1784512
## [15] 0.1694725 0.1773288 0.1728395 0.1649832 0.1818182 0.1638608 0.1818182
## [22] 0.1784512 0.1795735 0.1818182 0.1717172 0.1694725 0.1806958 0.1705948
## [29] 0.1705948 0.1784512 0.1784512 0.1739618 0.1784512 0.1750842 0.1795735
## [36] 0.1795735 0.1818182 0.1717172 0.1739618 0.1795735 0.1717172 0.1683502
## [43] 0.1728395 0.1705948 0.1784512 0.1728395 0.1649832 0.1885522 0.1717172
## [50] 0.1649832 0.1795735 0.1672278 0.1750842 0.1750842 0.1705948 0.1885522
## [57] 0.1863075 0.1762065 0.1728395 0.1896745 0.1604938 0.1784512 0.1728395
## [64] 0.1661055 0.1694725 0.1694725 0.1750842 0.1762065 0.1593715 0.1638608
## [71] 0.1728395 0.1728395 0.1694725 0.1683502 0.1717172 0.1739618 0.1728395
## [78] 0.1717172 0.1818182 0.1851852 0.1739618 0.1739618 0.1750842 0.1582492
## [85] 0.1672278 0.1851852 0.1582492 0.1694725 0.1750842 0.1638608 0.1717172
## [92] 0.1762065 0.1750842 0.1739618 0.1683502 0.1705948 0.1739618 0.1683502
## [99] 0.1604938 0.1649832 0.1593715 0.1694725 0.1818182 0.1661055 0.1649832
## [106] 0.1728395 0.1717172 0.1739618 0.1672278 0.1717172 0.1762065 0.1773288
## [113] 0.1750842 0.1694725 0.1773288 0.1773288 0.1784512 0.1806958 0.1717172
## [120] 0.1694725 0.1683502 0.1773288 0.1784512 0.1762065 0.1784512 0.1627385
## [127] 0.1829405 0.1784512 0.1694725 0.1784512 0.1750842 0.1784512 0.1806958
## [134] 0.1638608 0.1649832 0.1739618 0.1717172 0.1795735 0.1672278 0.1638608
## [141] 0.1604938 0.1571268 0.1582492 0.1705948 0.1784512 0.1818182 0.1795735
## [148] 0.1829405 0.1773288 0.1705948
TREE <- data.frame(Classifier = "TREE", CV.Error = TREE.All)
df <- rbind(TREE)  # rbind() is a no-op here but lets further classifiers be appended
ggplot(df, aes(x = Classifier, y = CV.Error)) +
  geom_boxplot(color = "green") +
  geom_jitter(alpha = 0.1) +
  ylab("CV Error Rate")
summary(TREE)
## Classifier CV.Error
## TREE:150 Min. :0.1571
## 1st Qu.:0.1695
## Median :0.1740
## Mean :0.1733
## 3rd Qu.:0.1785
## Max. :0.1897
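The spread across the 150 fold assignments can also be summarized numerically (output omitted):
sd(TREE.All)
range(TREE.All)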
# Fit the final tree on the full training set
treefit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked + Title,
                 data = train.set, method = "class",
                 control = rpart.control(minsplit = 1, cp = 0.004))
treefit
## n= 891
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 891 342 0 (0.61616162 0.38383838)
## 2) Title=Capt,Col,Don,Jonkheer,Mr,Rev 529 84 0 (0.84120983 0.15879017)
## 4) Fare< 26.125 393 39 0 (0.90076336 0.09923664) *
## 5) Fare>=26.125 136 45 0 (0.66911765 0.33088235)
## 10) Fare>=26.775 115 32 0 (0.72173913 0.27826087)
## 20) Pclass=2 14 0 0 (1.00000000 0.00000000) *
## 21) Pclass=1,3 101 32 0 (0.68316832 0.31683168)
## 42) Age>=49.5 20 3 0 (0.85000000 0.15000000) *
## 43) Age< 49.5 81 29 0 (0.64197531 0.35802469)
## 86) SibSp=3,4,5,8 7 0 0 (1.00000000 0.00000000) *
## 87) SibSp=0,1 74 29 0 (0.60810811 0.39189189)
## 174) Fare< 52.2771 34 9 0 (0.73529412 0.26470588) *
## 175) Fare>=52.2771 40 20 0 (0.50000000 0.50000000)
## 350) Fare>=77.00835 20 6 0 (0.70000000 0.30000000)
## 700) Fare< 86.2896 6 0 0 (1.00000000 0.00000000) *
## 701) Fare>=86.2896 14 6 0 (0.57142857 0.42857143)
## 1402) Fare>=99.9896 11 3 0 (0.72727273 0.27272727) *
## 1403) Fare< 99.9896 3 0 1 (0.00000000 1.00000000) *
## 351) Fare< 77.00835 20 6 1 (0.30000000 0.70000000) *
## 11) Fare< 26.775 21 8 1 (0.38095238 0.61904762)
## 22) Age>=53.5 5 0 0 (1.00000000 0.00000000) *
## 23) Age< 53.5 16 3 1 (0.18750000 0.81250000) *
## 3) Title=Countess,Dr,Major,Master,Miss,Mlle,Mme,Mrs,Ms 362 104 1 (0.28729282 0.71270718)
## 6) Pclass=3 173 83 0 (0.52023121 0.47976879)
## 12) Fare>=23.35 44 4 0 (0.90909091 0.09090909) *
## 13) Fare< 23.35 129 50 1 (0.38759690 0.61240310)
## 26) Age>=16.5 95 43 1 (0.45263158 0.54736842)
## 52) Fare>=7.8875 58 26 0 (0.55172414 0.44827586)
## 104) Fare< 14.8729 34 10 0 (0.70588235 0.29411765) *
## 105) Fare>=14.8729 24 8 1 (0.33333333 0.66666667)
## 210) Age< 21 2 0 0 (1.00000000 0.00000000) *
## 211) Age>=21 22 6 1 (0.27272727 0.72727273) *
## 53) Fare< 7.8875 37 11 1 (0.29729730 0.70270270)
## 106) Age>=28.25 3 0 0 (1.00000000 0.00000000) *
## 107) Age< 28.25 34 8 1 (0.23529412 0.76470588) *
## 27) Age< 16.5 34 7 1 (0.20588235 0.79411765)
## 54) SibSp=3 3 0 0 (1.00000000 0.00000000) *
## 55) SibSp=0,1,2 31 4 1 (0.12903226 0.87096774) *
## 7) Pclass=1,2 189 14 1 (0.07407407 0.92592593) *
rpart.plot(treefit, extra = 104, box.palette = "GnBu",
           branch.lty = 3, shadow.col = "gray", nn = TRUE)
plot(treefit, uniform = TRUE, margin = 0.1)
text(treefit, use.n = TRUE, cex = .55)
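With minsplit = 1 and cp = 0.004 the tree grows quite deep; rpart's built-in cross-validation table can be used to check whether a larger cp would justify pruning. A sketch:
printcp(treefit)  # cross-validated error (xerror) at each complexity value
plotcp(treefit)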
Recall that bagging is simply a special case of a random forest with m = p; with the five predictors used below, that means mtry = 5. Note that randomForest() cannot handle missing observations, which is why Age and Embarked (both of which contain missing values) are excluded from the predictor set.
set.seed(1)
# Bagging: mtry = p = 5, so all five predictors are candidates at every split
output.forest <- randomForest(Survived~Title+Pclass+Sex+SibSp+Fare, data = train, ntree=2500, importance=T, mtry=5)
print(output.forest)
##
## Call:
## randomForest(formula = Survived ~ Title + Pclass + Sex + SibSp + Fare, data = train, ntree = 2500, importance = T, mtry = 5)
## Type of random forest: classification
## Number of trees: 2500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 17.85%
## Confusion matrix:
## 0 1 class.error
## 0 475 74 0.1347905
## 1 85 257 0.2485380
plot(output.forest)
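The default plot() method draws three unlabeled error curves (OOB plus one per class); a legend can be added from the column names of the stored error rates, assuming the default matplot colors and line types:
legend("topright", legend = colnames(output.forest$err.rate), col = 1:3, lty = 1:3)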
set.seed(1)
# mtry = 1: a single randomly chosen predictor at each split
output.forest <- randomForest(Survived~Title+Pclass+Sex+SibSp+Fare, data = train, ntree=2500, importance=T, mtry=1)
print(output.forest)
##
## Call:
## randomForest(formula = Survived ~ Title + Pclass + Sex + SibSp + Fare, data = train, ntree = 2500, importance = T, mtry = 1)
## Type of random forest: classification
## Number of trees: 2500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 18.63%
## Confusion matrix:
## 0 1 class.error
## 0 478 71 0.1293260
## 1 95 247 0.2777778
plot(output.forest)
set.seed(1)
# No mtry given: the classification default is floor(sqrt(p)) = 2
output.forest <- randomForest(Survived~Title+Pclass+Sex+SibSp+Fare, data = train, ntree=2500, importance=T)
print(output.forest)
##
## Call:
## randomForest(formula = Survived ~ Title + Pclass + Sex + SibSp + Fare, data = train, ntree = 2500, importance = T)
## Type of random forest: classification
## Number of trees: 2500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 17.28%
## Confusion matrix:
## 0 1 class.error
## 0 488 61 0.1111111
## 1 93 249 0.2719298
plot(output.forest)
set.seed(1)
# mtry = 3: the lowest OOB error among the values tried here
output.forest <- randomForest(Survived~Title+Pclass+Sex+SibSp+Fare, data = train, ntree=2500, importance=T, mtry=3)
print(output.forest)
##
## Call:
## randomForest(formula = Survived ~ Title + Pclass + Sex + SibSp + Fare, data = train, ntree = 2500, importance = T, mtry = 3)
## Type of random forest: classification
## Number of trees: 2500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 16.39%
## Confusion matrix:
## 0 1 class.error
## 0 488 61 0.1111111
## 1 85 257 0.2485380
plot(output.forest)
set.seed(1)
output.forest <-randomForest(Survived~Title+Pclass+Sex+SibSp+Fare, data = train, ntree=2500, importance=T, mtry=4)
print(output.forest)
##
## Call:
## randomForest(formula = Survived ~ Title + Pclass + Sex + SibSp + Fare, data = train, ntree = 2500, importance = T, mtry = 4)
## Type of random forest: classification
## Number of trees: 2500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 17.28%
## Confusion matrix:
## 0 1 class.error
## 0 476 73 0.1329690
## 1 81 261 0.2368421
plot(output.forest)
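Every forest above was fit with importance = T, but the importance scores were never displayed. For the last fit, for example:
importance(output.forest)   # mean decrease in accuracy and in Gini per predictor
varImpPlot(output.forest)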