Loading the libraries
library(ggplot2)
library(dplyr)
library(randomForest)
library(caret)
Loading the data
# Work from the project folder so the relative CSV names below resolve.
setwd("C:/Users/tit0v/R/titanic-kaggle-project")

# Shared reader: comma-separated file with a header row; empty and
# single-space cells become NA, and text columns are read as factors.
read_titanic_csv <- function(path) {
  read.table(
    file = path,
    header = TRUE,
    sep = ",",
    na.strings = c("", " "),
    stringsAsFactors = TRUE
  )
}

titanic.train <- read_titanic_csv("train.csv")
titanic.test <- read_titanic_csv("test.csv")
Creating an extra column to help split the dataset again after binding
# Flag each row with its origin so the combined data can be split back later.
# TRUE/FALSE are used instead of T/F, which are ordinary variables that can be
# reassigned.
titanic.train$istrain <- TRUE
titanic.test$istrain <- FALSE
Adding the Survived column to the test set
# The test file has no outcome; add an all-NA Survived column so both data
# frames share the same set of columns before they are bound together.
titanic.test[["Survived"]] <- NA
Binding the train and test sets
# Stack the training rows on top of the test rows in one data frame.
titanic.full <- rbind(titanic.train, titanic.test)
Categorical casting
# Inspect the column types of the combined data.
str(titanic.full)
## 'data.frame': 1309 obs. of 13 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 186 levels "A10","A14","A16",..: NA 82 NA 56 NA NA 130 NA NA NA ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
## $ istrain : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
# Pclass, Parch and SibSp are read as integers; convert all three to factors
# so they are handled as categorical variables.
factor_cols <- c("Pclass", "Parch", "SibSp")
titanic.full[factor_cols] <- lapply(titanic.full[factor_cols], as.factor)
Removing the Cabin column
# Assigning NULL deletes the Cabin column from the combined data.
titanic.full[["Cabin"]] <- NULL
Number of NA values per column
# Count the missing values in every column once, then show the full table.
na.values.df <- vapply(
  titanic.full,
  function(column) sum(is.na(column)),
  integer(1)
)
na.values.df
## PassengerId Survived Pclass Name Sex Age
## 0 418 0 0 0 263
## SibSp Parch Ticket Fare Embarked istrain
## 0 0 0 1 2 0
# Keep only the columns that actually contain missing values.
na.values.df[na.values.df > 0]
## Survived Age Fare Embarked
## 418 263 1 2
Imputing the missing Embarked values with "S"
# Fill the missing Embarked entries with the existing level "S".
titanic.full$Embarked[is.na(titanic.full$Embarked)] <- "S"
# Levels available for Embarked (checked on the training data).
levels(titanic.train$Embarked)
## [1] "C" "Q" "S"
# Distribution of Embarked after the fill.
table(titanic.full$Embarked)
##
## C Q S
## 270 123 916
Imputing the missing Fare value with the median
# Median fare over the rows where Fare is known. It is computed once and
# reused for the fill below; na.rm uses TRUE rather than the reassignable T.
fare_median <- median(titanic.full$Fare, na.rm = TRUE)
fare_median
## [1] 14.4542
# Show the row(s) whose Fare is missing before filling them in.
titanic.full[is.na(titanic.full$Fare), ]
## PassengerId Survived Pclass Name Sex Age SibSp Parch
## 1044 1044 NA 3 Storey, Mr. Thomas male 60.5 0 0
## Ticket Fare Embarked istrain
## 1044 3701 NA S FALSE
titanic.full$Fare[is.na(titanic.full$Fare)] <- fare_median
Imputing the missing Age values with linear regression
# Upper whisker end of the Age boxplot (fifth element of boxplot.stats()$stats);
# ages above it are left out of the data used to fit the imputation model.
top_quant <- boxplot.stats(titanic.full$Age)$stats[5]
# Keep only rows with a known Age at or below that limit. The explicit
# !is.na() guard matters: `NA <= top_quant` is NA, and indexing rows with NA
# inserts all-NA rows into the result.
age_known <- !is.na(titanic.full$Age)
agee <- titanic.full[age_known & titanic.full$Age <= top_quant, ]
# Rows whose Age has to be predicted.
na.age <- titanic.full[!age_known, ]
lm_model <- lm(Age ~ Sex + SibSp + Pclass, data = agee)
age_pred <- predict(lm_model, newdata = na.age[c("Sex", "SibSp", "Pclass")])
# Fill the gaps with the predictions, rounded to whole numbers.
titanic.full$Age[!age_known] <- round(age_pred)
Splitting the data and loading y_test
# Split the combined data back into its two parts using the istrain flag.
# The flag is already logical, so it is used directly instead of being
# compared with T/F (which are reassignable variables).
titanic.train <- titanic.full[titanic.full$istrain, ]
titanic.test <- titanic.full[!titanic.full$istrain, ]
# Cast the outcome to a factor so the models below treat it as a class label.
titanic.train$Survived <- as.factor(titanic.train$Survived)
# NOTE(review): no visible code creates the Title column; presumably it is
# engineered in a step that is not shown here -- confirm it exists before
# these casts run.
titanic.train$Title <- as.factor(titanic.train$Title)
titanic.test$Title <- as.factor(titanic.test$Title)
str(titanic.train)
## 'data.frame': 891 obs. of 13 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 27 54 2 27 14 ...
## $ SibSp : Factor w/ 7 levels "0","1","2","3",..: 2 2 1 2 1 1 1 4 1 2 ...
## $ Parch : Factor w/ 8 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 3 1 ...
## $ Ticket : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
## $ istrain : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ Title : Factor w/ 5 levels "high class female",..: 4 5 3 5 4 4 4 2 5 5 ...
# Load the labels that the test-set predictions are compared with.
# NOTE(review): gender_submission.csv is presumably Kaggle's sample submission
# rather than the true outcomes -- confirm before reading the test metrics as
# real accuracy.
y_test <- read.table(
  file = "gender_submission.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
Random Forest
Fitting the model. As the plot shows, the OOB error no longer changes noticeably after 100 trees.
# Fix the RNG seed so the forest (and the models fitted after it in the same
# session) can be reproduced from run to run.
set.seed(42)
# Baseline forest with the package-default number of trees.
# NOTE(review): nodesize is derived from the size of the *test* set; confirm
# this is intended rather than nrow(titanic.train).
rf_model <- randomForest(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
  data = titanic.train,
  nodesize = 0.01 * nrow(titanic.test)
)
# Plot the fitted forest to see how the error evolves as trees are added.
plot(rf_model)

# Plain 10-fold cross-validation. The former `repeats = 3` argument was
# dropped: method = "cv" ignores it and only emitted a warning.
tcontrol <- trainControl(method = "cv", number = 10)
# Candidate mtry values: 2, 4, ..., 12.
tunegrid <- expand.grid(mtry = seq(2, 12, 2))
# Tune mtry for a 100-tree random forest on the training data.
modelRF <- train(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
  data = titanic.train,
  method = "rf",
  ntree = 100,
  trControl = tcontrol,
  tuneGrid = tunegrid
)
print(modelRF)
## Random Forest
##
## 891 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 802, 802, 802, 801, 802, 802, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8081523 0.5863965
## 4 0.8260799 0.6197813
## 6 0.8339326 0.6375999
## 8 0.8350562 0.6411427
## 10 0.8361548 0.6464815
## 12 0.8339326 0.6414916
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.
# Plot the tuning results stored in modelRF (one point per candidate mtry).
plot(modelRF)

So we are going to use 100 trees and mtry = 10 (10 candidate features per split).
Random Forest
# Predictors used by the formula below; also used to select newdata columns.
rf_predictors <- c(
  "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "Title"
)
# The tuned mtry (10) is larger than the 8 predictors the formula supplies,
# which made randomForest() warn and reset it to the valid range. Clamp it to
# the predictor count explicitly so the value in use is visible and the
# warning disappears.
rf2_model <- randomForest(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
  data = titanic.train,
  ntree = 100,
  mtry = min(10, length(rf_predictors)),
  nodesize = 0.01 * nrow(titanic.test)
)
pred_test <- predict(rf2_model, newdata = titanic.test[rf_predictors])
# Score the training rows with the same model as the test rows. The original
# used rf_model here, so the two confusion matrices described different
# models. These are predictions on the rows the model was fitted to.
pred_train <- predict(rf2_model, newdata = titanic.train[rf_predictors])
Confusion Matrix
# Confusion matrix of the test-set predictions against the labels in y_test.
confusionMatrix(
  data = pred_test,
  reference = as.factor(y_test$Survived)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 235 34
## 1 31 118
##
## Accuracy : 0.8445
## 95% CI : (0.8061, 0.8779)
## No Information Rate : 0.6364
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6626
##
## Mcnemar's Test P-Value : 0.8041
##
## Sensitivity : 0.8835
## Specificity : 0.7763
## Pos Pred Value : 0.8736
## Neg Pred Value : 0.7919
## Prevalence : 0.6364
## Detection Rate : 0.5622
## Detection Prevalence : 0.6435
## Balanced Accuracy : 0.8299
##
## 'Positive' Class : 0
##
# Confusion matrix of the training-set predictions against the observed
# Survived values.
confusionMatrix(
  data = pred_train,
  reference = as.factor(titanic.train$Survived)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 526 70
## 1 23 272
##
## Accuracy : 0.8956
## 95% CI : (0.8737, 0.9149)
## No Information Rate : 0.6162
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7735
##
## Mcnemar's Test P-Value : 1.842e-06
##
## Sensitivity : 0.9581
## Specificity : 0.7953
## Pos Pred Value : 0.8826
## Neg Pred Value : 0.9220
## Prevalence : 0.6162
## Detection Rate : 0.5903
## Detection Prevalence : 0.6689
## Balanced Accuracy : 0.8767
##
## 'Positive' Class : 0
##
Cross validation
# Repeated 10-fold cross-validation (10 repeats) at the single value mtry = 10.
tcontrol2 <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
modelRF2 <- train(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
  data = titanic.train,
  method = "rf",
  ntree = 100,
  trControl = tcontrol2,
  tuneGrid = expand.grid(.mtry = c(10))
)
# Resampled accuracy and kappa (with their standard deviations).
modelRF2[["results"]]
## mtry Accuracy Kappa AccuracySD KappaSD
## 1 10 0.8330141 0.639339 0.03352578 0.0741429
# Build the submission table: one row per test passenger with its prediction.
PassengerId <- titanic.test$PassengerId
output.df <- data.frame(PassengerId = PassengerId, Survived = pred_test)
# row.names = FALSE (not the reassignable shorthand F) keeps R's row index
# out of the written file.
write.csv(output.df, file = "kaggle_rf.csv", row.names = FALSE)