1 Set my working directory, read in the training set, and check for missing values.

setwd("~/Desktop/Titanic Dataset")

train <- read_csv("~/Desktop/Titanic Dataset/train.csv")
names(train)
##  [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
##  [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
## [11] "Cabin"       "Embarked"
sum(is.na(train$PassengerId)) # PassengerId should be discarded
## [1] 0
sum(is.na(train$Survived))
## [1] 0
sum(is.na(train$Pclass))
## [1] 0
sum(is.na(train$Name)) # Name should be discarded
## [1] 0
sum(is.na(train$Sex))
## [1] 0
sum(is.na(train$Age)) # There are 177 missing ages
## [1] 177
sum(is.na(train$SibSp))
## [1] 0
sum(is.na(train$Parch))
## [1] 0
sum(is.na(train$Ticket))
## [1] 0
sum(is.na(train$Fare))
## [1] 0
sum(is.na(train$Cabin)) # There are 687 missing observations
## [1] 687
sum(is.na(train$Embarked)) # There are 2 missing observations
## [1] 2
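
The per-column checks above can be collapsed into one call that returns the same counts:

colSums(is.na(train))
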
train<-data.frame(train)
train<-train[,c("Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked")]

train$Survived<-factor(train$Survived)
train$Pclass<-factor(train$Pclass)
train$Sex<-factor(train$Sex)
train$SibSp<-factor(train$SibSp)
train$Parch<-factor(train$Parch)
train$Embarked<-factor(train$Embarked)
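
The six conversions can also be done in one pass; a compact equivalent:

cat_vars <- c("Survived", "Pclass", "Sex", "SibSp", "Parch", "Embarked")
train[cat_vars] <- lapply(train[cat_vars], factor)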

summary(train)
##  Survived Pclass      Sex           Age        SibSp   Parch  
##  0:549    1:216   female:314   Min.   : 0.42   0:608   0:678  
##  1:342    2:184   male  :577   1st Qu.:20.12   1:209   1:118  
##           3:491                Median :28.00   2: 28   2: 80  
##                                Mean   :29.70   3: 16   3:  5  
##                                3rd Qu.:38.00   4: 18   4:  4  
##                                Max.   :80.00   5:  5   5:  5  
##                                NA's   :177     8:  7   6:  1  
##     Ticket               Fare           Cabin           Embarked  
##  Length:891         Min.   :  0.00   Length:891         C   :168  
##  Class :character   1st Qu.:  7.91   Class :character   Q   : 77  
##  Mode  :character   Median : 14.45   Mode  :character   S   :644  
##                     Mean   : 32.20                      NA's:  2  
##                     3rd Qu.: 31.00                                
##                     Max.   :512.33                                
## 

Using the entire train data for cross validation.

# Stratify the training set into 5 folds

set.seed(1)

folds <- createFolds(y=factor(train$Survived), k = 5, list = FALSE)

train$fold <- folds

train.set<-train
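
createFolds stratifies on the outcome, so each fold should carry roughly the same survival split; a quick cross-tabulation verifies this:

table(train.set$Survived, train.set$fold)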

2 A classification tree on the selected variables

CV.error<-NULL

for (i in 1:5) {

  valid.data <- subset(train.set, fold == i)

  train.data <- subset(train.set, fold != i)

  # Fit a classification tree on the four training folds
  treefit<-rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked, data = train.data, method = "class", control = rpart.control(minsplit = 1, cp = 0.004))

  tree.y<-valid.data$Survived

  tree.predy<-predict(treefit, newdata = valid.data[, c("Pclass", "Sex", "Age", "SibSp", "Fare", "Embarked")], type ="class")

  # Misclassification rate on the held-out fold
  ith.test.error<- mean(tree.y!=tree.predy)

  # Weight each fold's error by its share of the rows
  CV.error<-c(CV.error,(nrow(valid.data)/nrow(train.set))*ith.test.error)

}

sum(CV.error)
## [1] 0.1773288
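
Because each fold's error is weighted by its share of the rows, sum(CV.error) equals the pooled misclassification rate over all held-out predictions. A quick sketch confirming the identity:

misclassified <- 0
for (i in 1:5) {
  valid.data <- subset(train.set, fold == i)
  fit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked, data = subset(train.set, fold != i), method = "class", control = rpart.control(minsplit = 1, cp = 0.004))
  misclassified <- misclassified + sum(predict(fit, newdata = valid.data, type = "class") != valid.data$Survived)
}
misclassified / nrow(train.set) # identical to sum(CV.error)
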
tree_fit_bayes<-function(MinSplit, ComplexityParameter){

  CV.error<-NULL

  for (i in 1:5) {

    valid.data <- subset(train.set, fold == i)

    train.data <- subset(train.set, fold != i)

    treefit<-rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked, data = train.data, method = "class", control = rpart.control(minsplit = MinSplit, cp = ComplexityParameter))

    tree.y<-valid.data$Survived

    tree.predy<-predict(treefit, newdata = valid.data[, c("Pclass", "Sex", "Age", "SibSp", "Fare", "Embarked")], type ="class")

    ith.test.error<- mean(tree.y!=tree.predy)

    CV.error<-c(CV.error,(nrow(valid.data)/nrow(train.set))*ith.test.error)

  }

  # BayesianOptimization maximizes the Score, so return the negative CV error;
  # the returned list needs the components Score and Pred
  list(Score = -sum(CV.error), Pred = 0)

}
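
As a sanity check, calling the objective at the baseline settings used above should return the same error, negated:

tree_fit_bayes(MinSplit = 1, ComplexityParameter = 0.004) # Score should be -0.1773288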

set.seed(1)

OPT_Res<- BayesianOptimization(tree_fit_bayes, bounds= list(MinSplit = c(1L, 10L),
                                                           ComplexityParameter = c(0.0001, 0.05)),
                               init_grid_dt = NULL, init_points = 10, 
                               n_iter = 5, acq = "ucb", kappa =2.576,
                               eps=0, verbose = TRUE)
## elapsed = 0.09   Round = 1   MinSplit = 3.0000   ComplexityParameter = 0.0104    Value = -0.1807 
## elapsed = 0.09   Round = 2   MinSplit = 4.0000   ComplexityParameter = 0.0089    Value = -0.1897 
## elapsed = 0.08   Round = 3   MinSplit = 6.0000   ComplexityParameter = 0.0344    Value = -0.2121 
## elapsed = 0.08   Round = 4   MinSplit = 9.0000   ComplexityParameter = 0.0193    Value = -0.1897 
## elapsed = 0.07   Round = 5   MinSplit = 3.0000   ComplexityParameter = 0.0385    Value = -0.2132 
## elapsed = 0.08   Round = 6   MinSplit = 9.0000   ComplexityParameter = 0.0249    Value = -0.1897 
## elapsed = 0.07   Round = 7   MinSplit = 10.0000  ComplexityParameter = 0.0359    Value = -0.2121 
## elapsed = 0.07   Round = 8   MinSplit = 7.0000   ComplexityParameter = 0.0496    Value = -0.2132 
## elapsed = 0.08   Round = 9   MinSplit = 7.0000   ComplexityParameter = 0.0191    Value = -0.1897 
## elapsed = 0.07   Round = 10  MinSplit = 2.0000   ComplexityParameter = 0.0389    Value = -0.2132 
## elapsed = 0.09   Round = 11  MinSplit = 10.0000  ComplexityParameter = 0.0133    Value = -0.1886 
## elapsed = 0.10   Round = 12  MinSplit = 10.0000  ComplexityParameter = 0.0001    Value = -0.1863 
## elapsed = 0.11   Round = 13  MinSplit = 9.0000   ComplexityParameter = 0.0028    Value = -0.1829 
## elapsed = 0.08   Round = 14  MinSplit = 1.0000   ComplexityParameter = 0.0283    Value = -0.2054 
## elapsed = 0.07   Round = 15  MinSplit = 10.0000  ComplexityParameter = 0.0448    Value = -0.2132 
## 
##  Best Parameters Found: 
## Round = 1    MinSplit = 3.0000   ComplexityParameter = 0.0104    Value = -0.1807
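
The tuned values can also be read off the returned object rather than the console log:

OPT_Res$Best_Par   # named vector with MinSplit and ComplexityParameter
OPT_Res$Best_Value # the best Score, i.e. minus the CV error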

2.1 Check the stability across fold assignments

TREE.All<-NULL

for(j in 1:150){
  
  set.seed(j)
  
  folds <- createFolds(y=factor(train.set$Survived), k = 5, list = FALSE)
  
  train.set$fold <- folds
  
  
  # Tree with the tuned hyperparameters (minsplit = 3, cp = 0.0104), the best model found above.
    
  CV.error<-NULL 
  
  for (i in 1:5) { 
    
    valid.data <- subset(train.set, fold == i)
    
    train.data <- subset(train.set, fold != i) 
    
    treefit<-rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked, data = train.data, method = "class", control = rpart.control(minsplit = 3, cp = 0.0104))
    
    tree.y<-valid.data$Survived
    
    tree.predy<-predict(treefit, newdata = valid.data[, c("Pclass", "Sex", "Age", "SibSp", "Fare", "Embarked")], type ="class")
    
    ith.test.error<- mean(tree.y!=tree.predy) 
    
    CV.error<-c(CV.error,(nrow(valid.data)/nrow(train.set))*ith.test.error)  
    
  }
  
  TREE.All<-c(TREE.All, sum(CV.error))
  
}

TREE.All 
##   [1] 0.1806958 0.1829405 0.1986532 0.1784512 0.1784512 0.1964085 0.1952862
##   [8] 0.1863075 0.1907969 0.1896745 0.1975309 0.1919192 0.1863075 0.2053872
##  [15] 0.1874299 0.1896745 0.1907969 0.1829405 0.1818182 0.1874299 0.1885522
##  [22] 0.1919192 0.2042649 0.1907969 0.1952862 0.1840629 0.1997755 0.1829405
##  [29] 0.1829405 0.1851852 0.1964085 0.1762065 0.1829405 0.1874299 0.1975309
##  [36] 0.1851852 0.1806958 0.1952862 0.1885522 0.1863075 0.1840629 0.1851852
##  [43] 0.1964085 0.1863075 0.1851852 0.1930415 0.1919192 0.1919192 0.1863075
##  [50] 0.1863075 0.1930415 0.1885522 0.1784512 0.1851852 0.1896745 0.1964085
##  [57] 0.1930415 0.2031425 0.1863075 0.2020202 0.1885522 0.1874299 0.1885522
##  [64] 0.1885522 0.1851852 0.1829405 0.1919192 0.1851852 0.1840629 0.1964085
##  [71] 0.1773288 0.1840629 0.1907969 0.1818182 0.1919192 0.1762065 0.1986532
##  [78] 0.1863075 0.1818182 0.2087542 0.1919192 0.1907969 0.1863075 0.1784512
##  [85] 0.2042649 0.1874299 0.1863075 0.1818182 0.1885522 0.1885522 0.1851852
##  [92] 0.1851852 0.1986532 0.1851852 0.1896745 0.1874299 0.1840629 0.1919192
##  [99] 0.1840629 0.1851852 0.1840629 0.1986532 0.1930415 0.1840629 0.1874299
## [106] 0.1829405 0.1941639 0.1806958 0.1806958 0.1975309 0.1863075 0.1952862
## [113] 0.1874299 0.1829405 0.1863075 0.1840629 0.1806958 0.1818182 0.1840629
## [120] 0.1784512 0.1851852 0.2031425 0.1919192 0.1863075 0.1874299 0.1930415
## [127] 0.1907969 0.1964085 0.1885522 0.1851852 0.1863075 0.1840629 0.1885522
## [134] 0.1863075 0.2031425 0.1952862 0.1762065 0.1952862 0.1952862 0.1818182
## [141] 0.1829405 0.1818182 0.1863075 0.1907969 0.1907969 0.1896745 0.1896745
## [148] 0.2008979 0.1885522 0.1930415
TREE<-data.frame(Classifier="TREE", CV.Error=TREE.All)

df<-rbind(TREE) # only one classifier here; rbind() keeps the pattern for comparing several models

ggplot(df, aes(x=Classifier, y=CV.Error)) + geom_boxplot(color="green") + geom_jitter(alpha=0.1) + ylab("CV Error Rate")

summary(TREE)
##  Classifier    CV.Error     
##  TREE:150   Min.   :0.1762  
##             1st Qu.:0.1841  
##             Median :0.1874  
##             Mean   :0.1887  
##             3rd Qu.:0.1919  
##             Max.   :0.2088
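
The standard deviation across the 150 fold reshuffles quantifies the stability directly:

sd(TREE.All)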

3 Prediction

Re-read the raw data, since the working copy of train picked up a fold column during cross validation, and prepare the test set the same way.

train <- read_csv("~/Desktop/Titanic Dataset/train.csv")

train<-data.frame(train)

train<-train[,c("Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked")]

train$Survived<-factor(train$Survived)
train$Pclass<-factor(train$Pclass)
train$Sex<-factor(train$Sex)
train$SibSp<-factor(train$SibSp)
train$Parch<-factor(train$Parch)
train$Embarked<-factor(train$Embarked)


test <- read_csv("~/Desktop/Titanic Dataset/test.csv")

test<-data.frame(test)

test<-test[,c("Pclass", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked")]

test$Pclass<-factor(test$Pclass)
test$Sex<-factor(test$Sex)
test$SibSp<-factor(test$SibSp)
test$Parch<-factor(test$Parch)
test$Embarked<-factor(test$Embarked)
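
One caveat: predict.rpart requires the test factors to use levels seen in training. Here the levels happen to line up, but aligning them explicitly is safer (a sketch; unseen levels become NA, which rpart handles through surrogate splits):

for (v in c("Pclass", "Sex", "SibSp", "Embarked")) {
  test[[v]] <- factor(test[[v]], levels = levels(train[[v]]))
}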

treefit<-rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked, data = train, method = "class", control = rpart.control(minsplit = 3, cp = 0.0104))

tree.predy<-predict(treefit, newdata = test[, c("Pclass", "Sex", "Age", "SibSp", "Fare", "Embarked")], type ="class")

treefit
## n= 891 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 891 342 0 (0.61616162 0.38383838)  
##    2) Sex=male 577 109 0 (0.81109185 0.18890815)  
##      4) Age>=6.5 553  93 0 (0.83182640 0.16817360) *
##      5) Age< 6.5 24   8 1 (0.33333333 0.66666667)  
##       10) SibSp=3,4,5 9   1 0 (0.88888889 0.11111111) *
##       11) SibSp=0,1,2 15   0 1 (0.00000000 1.00000000) *
##    3) Sex=female 314  81 1 (0.25796178 0.74203822)  
##      6) Pclass=3 144  72 0 (0.50000000 0.50000000)  
##       12) Fare>=23.35 27   3 0 (0.88888889 0.11111111) *
##       13) Fare< 23.35 117  48 1 (0.41025641 0.58974359) *
##      7) Pclass=1,2 170   9 1 (0.05294118 0.94705882) *
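
The text dump above is hard to scan; the rpart.plot package (not loaded earlier, install it if needed) draws the same fitted tree graphically:

# install.packages("rpart.plot")
rpart.plot::rpart.plot(treefit)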

4 Write the CSV file

submission<-data.frame(PassengerId = seq(892, 1309), Survived = tree.predy)

head(submission)
##   PassengerId Survived
## 1         892        0
## 2         893        1
## 3         894        0
## 4         895        0
## 5         896        1
## 6         897        0
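
Finally, write the predictions to disk. The file name submission.csv is my choice; Kaggle only requires the PassengerId and Survived columns:

write.csv(submission, "submission.csv", row.names = FALSE)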