train <- read_csv("/Users/apple/Desktop/A04/titanic_train.csv") %>%
mutate(Test_Data = 0)
## Rows: 891 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (7): PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv("/Users/apple/Desktop/A04/titanic_test.csv") %>%
mutate(Test_Data = 1)
## Rows: 418 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (6): PassengerId, Pclass, Age, SibSp, Parch, Fare
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- read_csv("/Users/apple/Desktop/A04/titanic3.csv")
## Rows: 1309 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): name, sex, ticket, cabin, embarked, boat, home.dest
## dbl (7): pclass, survived, age, sibsp, parch, fare, body
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(train)
## tibble [891 × 13] (S3: tbl_df/tbl/data.frame)
## $ PassengerId: num [1:891] 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : num [1:891] 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : num [1:891] 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr [1:891] "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr [1:891] "male" "female" "female" "female" ...
## $ Age : num [1:891] 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : num [1:891] 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : num [1:891] 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr [1:891] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num [1:891] 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr [1:891] NA "C85" NA "C123" ...
## $ Embarked : chr [1:891] "S" "C" "S" "S" ...
## $ Test_Data : num [1:891] 0 0 0 0 0 0 0 0 0 0 ...
str(test)
## tibble [418 × 12] (S3: tbl_df/tbl/data.frame)
## $ PassengerId: num [1:418] 892 893 894 895 896 897 898 899 900 901 ...
## $ Pclass : num [1:418] 3 3 2 3 3 3 3 2 3 3 ...
## $ Name : chr [1:418] "Kelly, Mr. James" "Wilkes, Mrs. James (Ellen Needs)" "Myles, Mr. Thomas Francis" "Wirz, Mr. Albert" ...
## $ Sex : chr [1:418] "male" "female" "male" "male" ...
## $ Age : num [1:418] 34.5 47 62 27 22 14 30 26 18 21 ...
## $ SibSp : num [1:418] 0 1 0 0 1 0 0 1 0 2 ...
## $ Parch : num [1:418] 0 0 0 0 1 0 0 1 0 0 ...
## $ Ticket : chr [1:418] "330911" "363272" "240276" "315154" ...
## $ Fare : num [1:418] 7.83 7 9.69 8.66 12.29 ...
## $ Cabin : chr [1:418] NA NA NA NA ...
## $ Embarked : chr [1:418] "Q" "S" "Q" "S" ...
## $ Test_Data : num [1:418] 1 1 1 1 1 1 1 1 1 1 ...
We can see that the training dataset has 891 records and each record has 12 attributes. The testing dataset has 418 records records and each record has 11 attributes.
# Add a "Survived" attribute to the test dataset to allow for combining with train dataset
test <- data.frame(test[1], Survived = rep("NA", nrow(test)), test[ , 2:ncol(test)])
# Combine data sets. Append test.survived to train
data <- rbind(train, test)
summary(data)
## PassengerId Survived Pclass Name
## Min. : 1 Length:1309 Min. :1.000 Length:1309
## 1st Qu.: 328 Class :character 1st Qu.:2.000 Class :character
## Median : 655 Mode :character Median :3.000 Mode :character
## Mean : 655 Mean :2.295
## 3rd Qu.: 982 3rd Qu.:3.000
## Max. :1309 Max. :3.000
##
## Sex Age SibSp Parch
## Length:1309 Min. : 0.17 Min. :0.0000 Min. :0.000
## Class :character 1st Qu.:21.00 1st Qu.:0.0000 1st Qu.:0.000
## Mode :character Median :28.00 Median :0.0000 Median :0.000
## Mean :29.88 Mean :0.4989 Mean :0.385
## 3rd Qu.:39.00 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :80.00 Max. :8.0000 Max. :9.000
## NA's :263
## Ticket Fare Cabin Embarked
## Length:1309 Min. : 0.000 Length:1309 Length:1309
## Class :character 1st Qu.: 7.896 Class :character Class :character
## Mode :character Median : 14.454 Mode :character Mode :character
## Mean : 33.295
## 3rd Qu.: 31.275
## Max. :512.329
## NA's :1
## Test_Data
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3193
## 3rd Qu.:1.0000
## Max. :1.0000
##
# Exam PassengerID
length(data$PassengerId)
## [1] 1309
length(unique(data$PassengerId))
## [1] 1309
It proves the PassengerID has no missing value and duplication.However, I will remove this variable from the final model since the relationship between the survive rate and names does not exist.
# Exam Survived
data$Survived <- as.factor(data$Survived)
table(data$Survived, dnn = "Number of Survived in the Data")
## Number of Survived in the Data
## 0 1 NA
## 549 342 418
prop.table(table(as.factor(train$Survived), dnn = "Survive and death ratio in the Train"))
## Survive and death ratio in the Train
## 0 1
## 0.6161616 0.3838384
we know the survive rate in the train dataset is 61.62%. This rate should be maintained in the test too.
# Examine Pclass value,
# It should be factor rather than int.
data$Pclass <- as.factor(data$Pclass)
# Distribution across classes into a table
table(data$Pclass, dnn = "Pclass values in the Data")
## Pclass values in the Data
## 1 2 3
## 323 277 709
# Plot the death distribution across classes with Train data
ggplot(train, aes(x = Pclass, fill = factor(Survived))) +
geom_bar(width = 0.3) +
xlab("Pclass") +
ylab("Total Count") +
labs(fill = "Survived")
# Here the survival chances of a passenger who is in class-1 are higher
than who is a class-2 and class-3.
# Examine Name
data$Name <- as.character(data$Name)
dup.names <- data[which(duplicated(data$Name)), "Name"]
dup.names
We can see there are two duplicated names. However, we cannot use Name as a predictor because it has no generalization.
# check Sex
# plot Survived over Sex on train
ggplot(data[1:891,], aes(x = Sex, fill = Survived)) +
geom_bar(width = 0.3) +
xlab("Sex") +
ylab("Total Count") +
labs(fill = "Survived")
# This graph shows that the male death rate is higher than the
female’s.
# Examine Age
summary(data$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.17 21.00 28.00 29.88 39.00 80.00 263
# plot Survived on age group using train dataset
ggplot(data[1:891,], aes(x = Age, fill = Survived)) +
geom_histogram(binwidth = 10) +
xlab("Age") +
ylab("Total Count")
## Warning: Removed 177 rows containing non-finite values (stat_bin).
# We can see here is a large number of missing value.
# Examine SibSp
summary(data$SibSp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4989 1.0000 8.0000
data$SibSp_f <- as.factor(data$SibSp)
summary(data$SibSp_f)
## 0 1 2 3 4 5 8
## 891 319 42 20 22 6 9
# Examine Parch
summary(data$Parch)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.385 0.000 9.000
# Treat it as a factor, so we know the value distribution
data$Parch_f <- as.factor(data$Parch)
summary(data$Parch_f)
## 0 1 2 3 4 5 6 9
## 1002 170 113 8 6 6 2 2
# Plot on the survive on SibSp
ggplot(data[1:891,], aes(x = SibSp_f, fill = Survived)) +
geom_bar(width = 0.5) +
xlab("SibSp_f") +
ylab("Total Count") +
labs(fill = "Survived")
#It seems that passenger who have two companies tend to have a better survival rate.
# Examine Ticket
data$Ticket_f <- as.factor(data$Ticket)
summary(data$Ticket_f)
## CA. 2343 1601 CA 2144 3101295
## 11 8 8 7
## 347077 347082 PC 17608 S.O.C. 14879
## 7 7 7 7
## 113781 19950 347088 382652
## 6 6 6 6
## 113503 16966 220845 349909
## 5 5 5 5
## 4133 PC 17757 W./C. 6608 113760
## 5 5 5 4
## 12749 17421 230136 24160
## 4 4 4 4
## 2666 36928 C.A. 2315 C.A. 33112
## 4 4 4 4
## C.A. 34651 LINE PC 17483 PC 17755
## 4 4 4 4
## PC 17760 SC/Paris 2123 W./C. 6607 110152
## 4 4 4 3
## 110413 11767 13502 19877
## 3 3 3 3
## 19928 230080 239853 248727
## 3 3 3 3
## 248738 26360 2650 2653
## 3 3 3 3
## 2661 2662 2668 2678
## 3 3 3 3
## 28220 29103 29106 29750
## 3 3 3 3
## 315153 33638 345773 347080
## 3 3 3 3
## 347742 35273 363291 367226
## 3 3 3 3
## 370129 371110 A/4 48871 A/5. 851
## 3 3 3 3
## C 4001 C.A. 2673 C.A. 31029 C.A. 31921
## 3 3 3 3
## C.A. 37671 F.C.C. 13529 PC 17558 PC 17569
## 3 3 3 3
## PC 17572 PC 17582 PC 17756 PC 17758
## 3 3 3 3
## PC 17761 PP 9549 S.C./PARIS 2079 SOTON/O.Q. 3101315
## 3 3 3 3
## 110465 110813 111361 112058
## 2 2 2 2
## 112378 113059 113505 113509
## 2 2 2 2
## 113572 113773 113776 113789
## 2 2 2 2
## 113796 113798 113803 (Other)
## 2 2 2 947
# Plot on the survive on Ticket
ggplot(data[1:891,], aes(x = Ticket_f, fill = Survived)) +
geom_bar() +
xlab("Ticket Number") +
ylab("Total Count") +
labs(fill = "Survived")
# Ticket has no missing value, but, there is no obvious relation with
the survive rate.
# Examine Fare
summary(data$Fare)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 7.896 14.454 33.295 31.275 512.329 1
# plot fare relation with survive
ggplot(data[1:891,], aes(x = Fare, fill = Survived)) +
geom_histogram(binwidth = 5) +
xlab("Fare") +
ylab("Total Count") +
ylim(0,50) +
labs(fill = "Survived")
## Warning: Removed 6 rows containing missing values (geom_bar).
Here is one missing value. It is not clear about the prediction power of the Fare.
# Replacing na with the mean value
data[is.na(data$Fare), ]
data$Fare[is.na(data$Fare)] <- mean(data$Fare, na.rm = TRUE)
# Examine Cabin
data$Cabin <- as.factor(data$Cabin)
summary(data$Cabin)
## C23 C25 C27 B57 B59 B63 B66 G6 B96 B98 C22 C26
## 6 5 5 4 4
## C78 D F2 F33 F4
## 4 4 4 4 4
## A34 B51 B53 B55 B58 B60 C101 E101
## 3 3 3 3 3
## E34 B18 B20 B22 B28
## 3 2 2 2 2
## B35 B41 B45 B49 B5
## 2 2 2 2 2
## B69 B71 B77 B78 C106
## 2 2 2 2 2
## C116 C123 C124 C125 C126
## 2 2 2 2 2
## C2 C31 C32 C46 C52
## 2 2 2 2 2
## C54 C55 C57 C6 C62 C64 C65
## 2 2 2 2 2
## C68 C7 C80 C83 C85
## 2 2 2 2 2
## C86 C89 C92 C93 D10 D12
## 2 2 2 2 2
## D15 D17 D19 D20 D21
## 2 2 2 2 2
## D26 D28 D30 D33 D35
## 2 2 2 2 2
## D36 D37 E121 E24 E25
## 2 2 2 2 2
## E31 E33 E44 E46 E50
## 2 2 2 2 2
## E67 E8 F G63 F G73 A10
## 2 2 2 2 1
## A11 A14 A16 A18 A19
## 1 1 1 1 1
## A20 A21 A23 A24 A26
## 1 1 1 1 1
## A29 A31 A32 A36 A5
## 1 1 1 1 1
## A6 A7 A9 (Other) NA's
## 1 1 1 88 1014
str(data$Cabin)
## Factor w/ 186 levels "A10","A11","A14",..: NA 107 NA 71 NA NA 164 NA NA NA ...
# Take a look at just the first char as a factor and add to data as a new attribute
data$cabinfirstchar<- as.factor(substr(data$Cabin, 1, 1))
# first cabin letter survival plot
ggplot(data[1:891,], aes(x = cabinfirstchar, fill = Survived)) +
geom_bar() +
xlab("First Cabin Letter") +
ylab("Total Count") +
ylim(0,750) +
labs(fill = "Survived")
Here we can see that Cabin has a large number of missing value. It does not make sense to predict the cabin numbers with too little information. So Cabin will not be used in the final model.
# Examine Embarked
data$Embarked <- as.factor(data$Embarked)
summary(data$Embarked)
## C Q S NA's
## 270 123 914 2
# Plot data distribution and the survival rate
ggplot(data[1:891,], aes(x = Embarked, fill = Survived)) +
geom_bar(width=0.5) +
xlab("Embarked") +
ylab("Total Count") +
labs(fill = "Survived")
# Replacing NA with "C"
data[is.na(data$Embarked), ]
data$Embarked[is.na(data$Embarked)] <- "C"
summary(data$Parch)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.385 0.000 9.000
# Plot on the survive on Parch
ggplot(data[1:891,], aes(x = Parch, fill = Survived)) +
geom_bar(width = 0.5) +
xlab("Parch") +
ylab("Total Count") +
labs(fill = "Survived")
The plot shows us that it is definitely have an impact on survival.
set.seed(123)
rf_model <- randomForest(factor(Survived) ~ Pclass + Sex + Fare + SibSp + Parch + Ticket, data = train)
rf_model
##
## Call:
## randomForest(formula = factor(Survived) ~ Pclass + Sex + Fare + SibSp + Parch + Ticket, data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 18.97%
## Confusion matrix:
## 0 1 class.error
## 0 490 59 0.1074681
## 1 110 232 0.3216374
prediction <- predict(rf_model, test)
prediction
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 0 1 0 0 1 0 1 1 1 0 0 0 1 0 1 1
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
## 0 0 0 1 0 0 1 0 1 0 1 0 1 0 0 0
## 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 1 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0
## 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
## 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1
## 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 0 1 1 0 0 1 1 0 1 0 1 0 0 1 0 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
## 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0
## 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
## 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1
## 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
## 1 1 1 0 0 0 0 1 1 0 1 0 0 1 0 1
## 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 145 146 147 148 149 150 151 152 <NA> 154 155 156 157 158 159 160
## 0 0 0 0 0 1 1 0 <NA> 1 1 0 1 1 0 1
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
## 1 1 1 0 0 1 0 0 1 0 0 0 0 0 1 1
## 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
## 1 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0
## 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
## 0 0 0 0 1 1 0 0 1 1 0 1 0 0 1 0
## 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
## 1 0 0 0 0 1 1 0 1 0 1 0 1 0 1 0
## 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
## 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1
## 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256
## 1 1 1 0 0 0 1 1 1 1 1 0 0 0 0 0
## 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272
## 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0
## 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
## 1 0 0 1 0 0 0 0 1 0 1 1 1 0 0 1
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304
## 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0
## 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320
## 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0
## 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
## 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0
## 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352
## 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0
## 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368
## 0 1 1 0 1 0 0 0 0 1 1 0 1 0 0 1
## 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384
## 1 0 0 1 0 0 1 1 1 0 0 1 0 0 0 1
## 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400
## 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0
## 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416
## 1 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0
## 417 418
## 0 1
## Levels: 0 1
# Error
plot(rf_model, ylim=c(0,0.36), main = 'RF_MODEL')
legend('topright', colnames(rf_model$err.rate), col=1:3, fill=1:3)
Notice that the default parameter ‘mtry = 2’ and ntree = 500. It means the number of variables tried at each split is now 2 and the number of trees that can be built is 500. The model’s estimated OOB error rate is 18.97%. So the overall accuracy of the model has reached 81.03%.