setwd("~/Desktop/Titanic Dataset")
train <- read_csv("~/Desktop/Titanic Dataset/train.csv")
names(train)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked"
sum(is.na(train$PassengerId)) # No missing values; PassengerId is only a row identifier and will be discarded
## [1] 0
sum(is.na(train$Survived))
## [1] 0
sum(is.na(train$Pclass))
## [1] 0
sum(is.na(train$Name)) # No missing values; Name will be discarded as well
## [1] 0
sum(is.na(train$Sex))
## [1] 0
sum(is.na(train$Age)) # There are 177 missing ages
## [1] 177
sum(is.na(train$SibSp))
## [1] 0
sum(is.na(train$Parch))
## [1] 0
sum(is.na(train$Ticket))
## [1] 0
sum(is.na(train$Fare))
## [1] 0
sum(is.na(train$Cabin)) # There are 687 missing observations
## [1] 687
sum(is.na(train$Embarked)) # There are 2 missing observations
## [1] 2
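All twelve columns can also be audited in one pass; a compact base-R alternative to the column-by-column checks above:
colSums(is.na(train)) # missing-value count for every column at once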
train<-data.frame(train[,c("Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked")])
train$Survived<-factor(train$Survived)
train$Pclass<-factor(train$Pclass)
train$Sex<-factor(train$Sex)
train$SibSp<-factor(train$SibSp)
train$Parch<-factor(train$Parch)
train$Embarked<-factor(train$Embarked)
summary(train)
##  Survived Pclass      Sex           Age         SibSp   Parch
##  0:549    1:216   female:314   Min.   : 0.42   0:608   0:678
##  1:342    2:184   male  :577   1st Qu.:20.12   1:209   1:118
##           3:491                Median :28.00   2: 28   2: 80
##                                Mean   :29.70   3: 16   3:  5
##                                3rd Qu.:38.00   4: 18   4:  4
##                                Max.   :80.00   5:  5   5:  5
##                                NA's   :177     8:  7   6:  1
##     Ticket               Fare            Cabin            Embarked
##  Length:891         Min.   :  0.00   Length:891         C   :168
##  Class :character   1st Qu.:  7.91   Class :character   Q   : 77
##  Mode  :character   Median : 14.45   Mode  :character   S   :644
##                     Mean   : 32.20                      NA's:  2
##                     3rd Qu.: 31.00
##                     Max.   :512.33
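The six factor() calls above can also be collapsed into one statement; a small base-R sketch (cat.vars is just an illustrative helper name):
cat.vars<-c("Survived", "Pclass", "Sex", "SibSp", "Parch", "Embarked") # illustrative name for the categorical columns
train[cat.vars]<-lapply(train[cat.vars], factor) # convert them all at once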
Using the entire train data for cross-validation:
# Stratify the training set into 5 folds
set.seed(1)
folds <- createFolds(y=factor(train$Survived), k = 5, list = FALSE)
train$fold <- folds
train.set<-train
CV.error<-NULL
for (i in 1:5) {
  valid.data <- subset(train.set, fold == i)
  train.data <- subset(train.set, fold != i)
  treefit<-rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked,
                 data = train.data, method = "class",
                 control = rpart.control(minsplit = 1, cp = 0.004))
  tree.y<-valid.data$Survived
  tree.predy<-predict(treefit, newdata = valid.data[, c("Pclass", "Sex", "Age", "SibSp", "Fare", "Embarked")], type = "class")
  ith.test.error<-mean(tree.y!=tree.predy) # misclassification rate on fold i
  CV.error<-c(CV.error, (nrow(valid.data)/nrow(train.set))*ith.test.error) # weight by fold size
}
sum(CV.error)
## [1] 0.1773288
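Because the five folds partition the data, weighting each fold's error rate by nrow(valid.data)/nrow(train.set) and summing is the same as pooling: each fold contributes (n_i/n) * (errors_i/n_i) = errors_i/n, so the sum is total errors over n. A sanity-check sketch of that equivalence, retraining the same five trees:
total.misclassified<-0
for (i in 1:5) {
  valid.data <- subset(train.set, fold == i)
  train.data <- subset(train.set, fold != i)
  treefit<-rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked,
                 data = train.data, method = "class",
                 control = rpart.control(minsplit = 1, cp = 0.004))
  pred<-predict(treefit, newdata = valid.data, type = "class")
  total.misclassified<-total.misclassified + sum(pred != valid.data$Survived)
}
total.misclassified/nrow(train.set) # matches sum(CV.error) above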
tree_fit_bayes<-function(MinSplit, ComplexityParameter){
  CV.error<-NULL
  for (i in 1:5) {
    valid.data <- subset(train.set, fold == i)
    train.data <- subset(train.set, fold != i)
    treefit<-rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked,
                   data = train.data, method = "class",
                   control = rpart.control(minsplit = MinSplit, cp = ComplexityParameter))
    tree.y<-valid.data$Survived
    tree.predy<-predict(treefit, newdata = valid.data[, c("Pclass", "Sex", "Age", "SibSp", "Fare", "Embarked")], type = "class")
    ith.test.error<-mean(tree.y!=tree.predy)
    CV.error<-c(CV.error, (nrow(valid.data)/nrow(train.set))*ith.test.error)
  }
  # BayesianOptimization maximizes Score, so return the negated CV error
  list(Score=-sum(CV.error), pred=0)
}
set.seed(1)
OPT_Res<-BayesianOptimization(tree_fit_bayes,
                              bounds = list(MinSplit = c(1L, 10L),
                                            ComplexityParameter = c(0.0001, 0.05)),
                              init_grid_dt = NULL, init_points = 10,
                              n_iter = 5, acq = "ucb", kappa = 2.576,
                              eps = 0, verbose = TRUE)
## elapsed = 0.09 Round = 1 MinSplit = 3.0000 ComplexityParameter = 0.0104 Value = -0.1807
## elapsed = 0.09 Round = 2 MinSplit = 4.0000 ComplexityParameter = 0.0089 Value = -0.1897
## elapsed = 0.08 Round = 3 MinSplit = 6.0000 ComplexityParameter = 0.0344 Value = -0.2121
## elapsed = 0.08 Round = 4 MinSplit = 9.0000 ComplexityParameter = 0.0193 Value = -0.1897
## elapsed = 0.07 Round = 5 MinSplit = 3.0000 ComplexityParameter = 0.0385 Value = -0.2132
## elapsed = 0.08 Round = 6 MinSplit = 9.0000 ComplexityParameter = 0.0249 Value = -0.1897
## elapsed = 0.07 Round = 7 MinSplit = 10.0000 ComplexityParameter = 0.0359 Value = -0.2121
## elapsed = 0.07 Round = 8 MinSplit = 7.0000 ComplexityParameter = 0.0496 Value = -0.2132
## elapsed = 0.08 Round = 9 MinSplit = 7.0000 ComplexityParameter = 0.0191 Value = -0.1897
## elapsed = 0.07 Round = 10 MinSplit = 2.0000 ComplexityParameter = 0.0389 Value = -0.2132
## elapsed = 0.09 Round = 11 MinSplit = 10.0000 ComplexityParameter = 0.0133 Value = -0.1886
## elapsed = 0.10 Round = 12 MinSplit = 10.0000 ComplexityParameter = 0.0001 Value = -0.1863
## elapsed = 0.11 Round = 13 MinSplit = 9.0000 ComplexityParameter = 0.0028 Value = -0.1829
## elapsed = 0.08 Round = 14 MinSplit = 1.0000 ComplexityParameter = 0.0283 Value = -0.2054
## elapsed = 0.07 Round = 15 MinSplit = 10.0000 ComplexityParameter = 0.0448 Value = -0.2132
##
## Best Parameters Found:
## Round = 1 MinSplit = 3.0000 ComplexityParameter = 0.0104 Value = -0.1807
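The winning configuration is also stored on the returned object, so it does not have to be read off the log; Best_Par and Best_Value are fields of the list rBayesianOptimization returns:
OPT_Res$Best_Par   # named vector: MinSplit = 3, ComplexityParameter = 0.0104
OPT_Res$Best_Value # -0.1807, i.e. a weighted CV error of about 18.1%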
TREE.All<-NULL
for (j in 1:150) {
  set.seed(j)
  folds <- createFolds(y = factor(train.set$Survived), k = 5, list = FALSE)
  train.set$fold <- folds
  # Re-run 5-fold CV with the tuned tree parameters found above: minsplit = 3, cp = 0.0104
  CV.error<-NULL
  for (i in 1:5) {
    valid.data <- subset(train.set, fold == i)
    train.data <- subset(train.set, fold != i)
    treefit<-rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked,
                   data = train.data, method = "class",
                   control = rpart.control(minsplit = 3, cp = 0.0104))
    tree.y<-valid.data$Survived
    tree.predy<-predict(treefit, newdata = valid.data[, c("Pclass", "Sex", "Age", "SibSp", "Fare", "Embarked")], type = "class")
    ith.test.error<-mean(tree.y!=tree.predy)
    CV.error<-c(CV.error, (nrow(valid.data)/nrow(train.set))*ith.test.error)
  }
  TREE.All<-c(TREE.All, sum(CV.error))
}
TREE.All
## [1] 0.1806958 0.1829405 0.1986532 0.1784512 0.1784512 0.1964085 0.1952862
## [8] 0.1863075 0.1907969 0.1896745 0.1975309 0.1919192 0.1863075 0.2053872
## [15] 0.1874299 0.1896745 0.1907969 0.1829405 0.1818182 0.1874299 0.1885522
## [22] 0.1919192 0.2042649 0.1907969 0.1952862 0.1840629 0.1997755 0.1829405
## [29] 0.1829405 0.1851852 0.1964085 0.1762065 0.1829405 0.1874299 0.1975309
## [36] 0.1851852 0.1806958 0.1952862 0.1885522 0.1863075 0.1840629 0.1851852
## [43] 0.1964085 0.1863075 0.1851852 0.1930415 0.1919192 0.1919192 0.1863075
## [50] 0.1863075 0.1930415 0.1885522 0.1784512 0.1851852 0.1896745 0.1964085
## [57] 0.1930415 0.2031425 0.1863075 0.2020202 0.1885522 0.1874299 0.1885522
## [64] 0.1885522 0.1851852 0.1829405 0.1919192 0.1851852 0.1840629 0.1964085
## [71] 0.1773288 0.1840629 0.1907969 0.1818182 0.1919192 0.1762065 0.1986532
## [78] 0.1863075 0.1818182 0.2087542 0.1919192 0.1907969 0.1863075 0.1784512
## [85] 0.2042649 0.1874299 0.1863075 0.1818182 0.1885522 0.1885522 0.1851852
## [92] 0.1851852 0.1986532 0.1851852 0.1896745 0.1874299 0.1840629 0.1919192
## [99] 0.1840629 0.1851852 0.1840629 0.1986532 0.1930415 0.1840629 0.1874299
## [106] 0.1829405 0.1941639 0.1806958 0.1806958 0.1975309 0.1863075 0.1952862
## [113] 0.1874299 0.1829405 0.1863075 0.1840629 0.1806958 0.1818182 0.1840629
## [120] 0.1784512 0.1851852 0.2031425 0.1919192 0.1863075 0.1874299 0.1930415
## [127] 0.1907969 0.1964085 0.1885522 0.1851852 0.1863075 0.1840629 0.1885522
## [134] 0.1863075 0.2031425 0.1952862 0.1762065 0.1952862 0.1952862 0.1818182
## [141] 0.1829405 0.1818182 0.1863075 0.1907969 0.1907969 0.1896745 0.1896745
## [148] 0.2008979 0.1885522 0.1930415
TREE<-data.frame(Classifier="TREE", CV.Error=TREE.All)
df<-TREE # rbind() further classifiers' results here when comparing models
ggplot(df, aes(x=Classifier, y=CV.Error)) + geom_boxplot(color="green") + geom_jitter(alpha=0.1) + ylab("CV Error Rate")
summary(TREE)
##  Classifier    CV.Error
##  TREE:150   Min.   :0.1762
##             1st Qu.:0.1841
##             Median :0.1874
##             Mean   :0.1887
##             3rd Qu.:0.1919
##             Max.   :0.2088
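The spread of the estimate across fold assignments can also be quantified directly; a quick sketch:
sd(TREE.All)    # standard deviation of the CV error over the 150 random splits
range(TREE.All) # best and worst splits observed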
train <- read_csv("train.csv")
train<-data.frame(train[,c("Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked")])
train$Survived<-factor(train$Survived)
train$Pclass<-factor(train$Pclass)
train$Sex<-factor(train$Sex)
train$SibSp<-factor(train$SibSp)
train$Parch<-factor(train$Parch)
train$Embarked<-factor(train$Embarked)
test <- read_csv("test.csv")
test<-data.frame(test[,c("Pclass", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked")])
test$Pclass<-factor(test$Pclass)
test$Sex<-factor(test$Sex)
test$SibSp<-factor(test$SibSp)
test$Parch<-factor(test$Parch)
test$Embarked<-factor(test$Embarked)
treefit<-rpart(Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked,
               data = train, method = "class",
               control = rpart.control(minsplit = 3, cp = 0.0104))
tree.predy<-predict(treefit, newdata = test[, c("Pclass", "Sex", "Age", "SibSp", "Fare", "Embarked")], type = "class")
treefit
## n= 891
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 891 342 0 (0.61616162 0.38383838)
## 2) Sex=male 577 109 0 (0.81109185 0.18890815)
## 4) Age>=6.5 553 93 0 (0.83182640 0.16817360) *
## 5) Age< 6.5 24 8 1 (0.33333333 0.66666667)
## 10) SibSp=3,4,5 9 1 0 (0.88888889 0.11111111) *
## 11) SibSp=0,1,2 15 0 1 (0.00000000 1.00000000) *
## 3) Sex=female 314 81 1 (0.25796178 0.74203822)
## 6) Pclass=3 144 72 0 (0.50000000 0.50000000)
## 12) Fare>=23.35 27 3 0 (0.88888889 0.11111111) *
## 13) Fare< 23.35 117 48 1 (0.41025641 0.58974359) *
## 7) Pclass=1,2 170 9 1 (0.05294118 0.94705882) *
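The text dump maps directly onto a diagram; base rpart can draw it without extra packages (the rpart.plot package gives prettier output if installed):
plot(treefit, uniform = TRUE, margin = 0.1) # tree layout
text(treefit, use.n = TRUE, cex = 0.8)      # label splits and show class counts at the leaves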
submission<-data.frame(PassengerId = seq(892, 1309), Survived = tree.predy) # test.csv PassengerIds run 892 to 1309
head(submission) # first rows of the 418-row submission
##   PassengerId Survived
## 1         892        0
## 2         893        1
## 3         894        0
## 4         895        0
## 5         896        1
## 6         897        0
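To get a file Kaggle will accept, one more line suffices (readr's write_csv; the file name is arbitrary):
write_csv(submission, "submission.csv") # writes the PassengerId,Survived header plus 418 rows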