Loading the libraries

library(ggplot2)
library(dplyr)
library(randomForest)
library(caret)

Loading the data

# Work from the project folder so the relative CSV paths below resolve.
# NOTE(review): this absolute path is machine-specific; confirm it matches the
# local checkout before running.
setwd("C:/Users/tit0v/R/titanic-kaggle-project")

# Read both files with identical parsing rules: comma-separated, header row,
# strings converted to factors, empty or single-space fields treated as NA.
titanic.train <- read.table(
  file = "train.csv",
  header = TRUE,
  sep = ",",
  na.strings = c("", " "),
  stringsAsFactors = TRUE
)
titanic.test <- read.table(
  file = "test.csv",
  header = TRUE,
  sep = ",",
  na.strings = c("", " "),
  stringsAsFactors = TRUE
)

Creating an extra column to help split the dataset again after binding

# Tag every row with its origin so the combined data can be separated into
# train and test again later. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
titanic.train$istrain <- TRUE
titanic.test$istrain <- FALSE

Adding the survived column for test set

# Give the test set an all-NA Survived column so it has the same columns as
# the training set.
titanic.test[["Survived"]] <- NA

binding the train and test set

# Stack the training rows on top of the test rows in one data frame.
titanic.full <- rbind(
  titanic.train,
  titanic.test
)

Categorical casting

# Print the column types of the combined data before any casting.
str(titanic.full)
## 'data.frame':    1309 obs. of  13 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 186 levels "A10","A14","A16",..: NA 82 NA 56 NA NA 130 NA NA NA ...
##  $ Embarked   : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
##  $ istrain    : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
# Treat passenger class and the two family-count columns as categorical
# variables rather than integers.
titanic.full[c("Pclass", "Parch", "SibSp")] <- lapply(
  titanic.full[c("Pclass", "Parch", "SibSp")],
  as.factor
)

removing cabin column

# Drop the Cabin column entirely.
titanic.full[["Cabin"]] <- NULL

Number of NA

# Count the missing values in every column. vapply() pins the result to exactly
# one integer per column, whereas sapply()'s return type depends on its input.
vapply(titanic.full, function(x) sum(is.na(x)), integer(1))
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0         418           0           0           0         263 
##       SibSp       Parch      Ticket        Fare    Embarked     istrain 
##           0           0           0           1           2           0
na.values.df <- vapply(titanic.full, function(x) sum(is.na(x)), integer(1))
# Show only the columns that actually contain missing values.
na.values.df[na.values.df > 0]
## Survived      Age     Fare Embarked 
##      418      263        1        2

removing Embarked NA

# Fill the missing embarkation ports with "S".
titanic.full$Embarked[is.na(titanic.full$Embarked)] <- "S"
# NOTE(review): this inspects the levels of titanic.train, not titanic.full.
levels(titanic.train$Embarked)
## [1] "C" "Q" "S"
# Port frequencies after the fill.
table(titanic.full$Embarked)
## 
##   C   Q   S 
## 270 123 916

Removing NA from Fare

# Median fare over the combined data, ignoring the missing value. TRUE is
# spelled out because T is an ordinary, reassignable variable.
median(titanic.full$Fare, na.rm = TRUE)
## [1] 14.4542
# Show the row(s) whose fare is missing.
titanic.full[is.na(titanic.full$Fare), ]
##      PassengerId Survived Pclass               Name  Sex  Age SibSp Parch
## 1044        1044       NA      3 Storey, Mr. Thomas male 60.5     0     0
##      Ticket Fare Embarked istrain
## 1044   3701   NA        S   FALSE
# Replace the missing fare with that median.
titanic.full$Fare[is.na(titanic.full$Fare)] <- median(
  titanic.full$Fare,
  na.rm = TRUE
)

removing NA from Age with Linear Regression

# Impute missing ages with a linear model of Age on Sex, SibSp and Pclass,
# fitted on passengers whose age is known and does not exceed the upper
# boxplot whisker (the fifth element of boxplot.stats()$stats).
age_missing <- is.na(titanic.full$Age)
top_quant <- boxplot.stats(titanic.full$Age)$stats[5]
# Exclude NA ages explicitly: a logical row index containing NA would insert
# all-NA rows into the training frame (lm() drops them, so the fit is the same,
# but the frame itself would be polluted).
agee <- titanic.full[!age_missing & titanic.full$Age <= top_quant, ]
na.age <- titanic.full[age_missing, ]
lm_model <- lm(Age ~ Sex + SibSp + Pclass, data = agee)

# Predict an age for every row that lacks one and store it rounded to a whole
# number of years.
age_pred <- predict(lm_model, newdata = na.age[c("Sex", "SibSp", "Pclass")])
titanic.full$Age[age_missing] <- round(age_pred)

Extracting the Title from Names

# Extract a title from every passenger name in one vectorised pass instead of
# a 1:nrow() loop that assigns into the data frame row by row.
# The pattern matches a comma, one or more whitespace-separated words, and a
# period (e.g. ", Mr."). The title is the second space-separated token of that
# match, so a multi-word title keeps only its first word (hence the "the"
# level in the output below).
passenger_names <- as.character(titanic.full$Name)
title_matches <- regmatches(
  passenger_names,
  regexec(",(\\s+\\w+)+\\.", passenger_names)
)
titanic.full$Title <- vapply(
  title_matches,
  function(parts) unlist(strsplit(parts, " "))[2],
  character(1)
)
# List the distinct titles that were found.
levels(as.factor(titanic.full$Title))
##  [1] "Capt."     "Col."      "Don."      "Dona."     "Dr."       "Jonkheer."
##  [7] "Lady."     "Major."    "Master."   "Miss."     "Mlle."     "Mme."     
## [13] "Mr."       "Mrs."      "Ms."       "Rev."      "Sir."      "the"
# Collapse the rarer titles into two grouped categories; titles not listed
# here keep their own value.
# NOTE(review): "Master." is grouped with the male titles below - confirm that
# this is intended.
male_group_titles <- c(
  "Don.", "Rev.", "Dr.", "Major.", "Master.", "Sir.", "Col.", "Capt.",
  "Jonkheer."
)
female_group_titles <- c("Dona.", "Lady.", "Mlle.", "Mme.", "the", "Ms.")

in_male_group <- titanic.full$Title %in% male_group_titles
titanic.full$Title[in_male_group] <- "High class male"

in_female_group <- titanic.full$Title %in% female_group_titles
titanic.full$Title[in_female_group] <- "high class female"

Split the data and loading the y_test

# Split the combined data back into train and test using the istrain flag.
# The logical column is used directly instead of comparing it with T / F,
# which are reassignable variables.
is_train_row <- titanic.full$istrain
titanic.train <- titanic.full[is_train_row, ]
titanic.test <- titanic.full[!is_train_row, ]

# The outcome is a two-class factor for classification.
titanic.train$Survived <- as.factor(titanic.train$Survived)

# Build both Title factors from one shared level set taken from the combined
# data. Converting each subset on its own would give mismatched levels whenever
# a title occurs in only one of them.
title_levels <- levels(as.factor(titanic.full$Title))
titanic.train$Title <- factor(titanic.train$Title, levels = title_levels)
titanic.test$Title <- factor(titanic.test$Title, levels = title_levels)
str(titanic.train)
## 'data.frame':    891 obs. of  13 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Pclass     : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 27 54 2 27 14 ...
##  $ SibSp      : Factor w/ 7 levels "0","1","2","3",..: 2 2 1 2 1 1 1 4 1 2 ...
##  $ Parch      : Factor w/ 8 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 3 1 ...
##  $ Ticket     : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Embarked   : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
##  $ istrain    : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ Title      : Factor w/ 5 levels "high class female",..: 4 5 3 5 4 4 4 2 5 5 ...
# Load the reference Survived values used later to score the test predictions.
y_test <- read.table(
  file = "gender_submission.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)

Random Forest

Fitting the model. As the plot shows, the OOB error no longer changes noticeably after about 100 trees.

# Fit a baseline random forest with the package defaults for ntree and mtry.
# The seed makes the bootstrap samples, and therefore the fitted forest,
# reproducible between runs.
# NOTE(review): nodesize is derived from the number of TEST rows
# (0.01 * nrow(titanic.test)); confirm the training-set size was not intended.
set.seed(2024)
rf_model <- randomForest(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
  data = titanic.train,
  nodesize = 0.01 * nrow(titanic.test)
)
# Plot the error rates against the number of trees grown.
plot(rf_model)

# Tune mtry with plain 10-fold cross-validation. The former `repeats = 3`
# argument was dropped: caret ignores it for method = "cv" and only emitted
# "`repeats` has no meaning for this resampling method", so the resampling
# scheme is unchanged.
tcontrol <- trainControl(method = "cv", number = 10)
# Candidate mtry values: 2, 4, ..., 12.
tunegrid <- expand.grid(mtry = seq(2, 12, 2))
# Seed the fold assignment and the forests so the tuning table is reproducible.
set.seed(2024)
modelRF <- train(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
  data = titanic.train,
  method = "rf",
  ntree = 100,
  trControl = tcontrol,
  tuneGrid = tunegrid
)
print(modelRF)
## Random Forest 
## 
## 891 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 802, 802, 802, 801, 802, 802, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8081523  0.5863965
##    4    0.8260799  0.6197813
##    6    0.8339326  0.6375999
##    8    0.8350562  0.6411427
##   10    0.8361548  0.6464815
##   12    0.8339326  0.6414916
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.
# Visualise the tuning results stored in modelRF.
plot(modelRF)

So we will use 100 trees and mtry = 10 (10 candidate features per split).

Random Forest

# Predictor columns used by the final forest.
rf_predictors <- c(
  "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "Title"
)

# Final forest with 100 trees. The requested mtry of 10 exceeds the 8 predictor
# columns that randomForest() sees through the formula interface, which made it
# warn "invalid mtry: reset to within valid range". Capping it at the predictor
# count gives the same effective value without the warning.
# NOTE(review): nodesize is derived from the number of TEST rows; confirm the
# training-set size was not intended.
set.seed(2024)
rf2_model <- randomForest(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
  data = titanic.train,
  ntree = 100,
  mtry = min(10, length(rf_predictors)),
  nodesize = 0.01 * nrow(titanic.test)
)

# Predict with the SAME model on both sets so the train and test confusion
# matrices are comparable (the train predictions previously came from the
# earlier rf_model).
pred_test <- predict(rf2_model, newdata = titanic.test[rf_predictors])
pred_train <- predict(rf2_model, newdata = titanic.train[rf_predictors])

Confusion Matrix

# Compare the test-set predictions with the reference labels in y_test.
# NOTE(review): y_test is read from gender_submission.csv; confirm that file
# holds true outcomes rather than a sample submission before trusting this
# accuracy.
confusionMatrix(
  data = pred_test,
  reference = as.factor(y_test$Survived)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 235  34
##          1  31 118
##                                           
##                Accuracy : 0.8445          
##                  95% CI : (0.8061, 0.8779)
##     No Information Rate : 0.6364          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6626          
##                                           
##  Mcnemar's Test P-Value : 0.8041          
##                                           
##             Sensitivity : 0.8835          
##             Specificity : 0.7763          
##          Pos Pred Value : 0.8736          
##          Neg Pred Value : 0.7919          
##              Prevalence : 0.6364          
##          Detection Rate : 0.5622          
##    Detection Prevalence : 0.6435          
##       Balanced Accuracy : 0.8299          
##                                           
##        'Positive' Class : 0               
## 
# Compare the predictions on the training rows with their known Survived
# labels.
confusionMatrix(
  data = pred_train,
  reference = as.factor(titanic.train$Survived)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 526  70
##          1  23 272
##                                           
##                Accuracy : 0.8956          
##                  95% CI : (0.8737, 0.9149)
##     No Information Rate : 0.6162          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7735          
##                                           
##  Mcnemar's Test P-Value : 1.842e-06       
##                                           
##             Sensitivity : 0.9581          
##             Specificity : 0.7953          
##          Pos Pred Value : 0.8826          
##          Neg Pred Value : 0.9220          
##              Prevalence : 0.6162          
##          Detection Rate : 0.5903          
##    Detection Prevalence : 0.6689          
##       Balanced Accuracy : 0.8767          
##                                           
##        'Positive' Class : 0               
## 

Cross validation

# Estimate the accuracy of the chosen setting (mtry fixed at 10, 100 trees)
# with 10-fold cross-validation repeated 10 times.
tcontrol2 <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
# Seed the fold assignment and the forests so the reported accuracy is
# reproducible between runs.
set.seed(2024)
modelRF2 <- train(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
  data = titanic.train,
  method = "rf",
  ntree = 100,
  trControl = tcontrol2,
  tuneGrid = expand.grid(.mtry = c(10))
)
# Resampled accuracy and kappa with their standard deviations.
modelRF2$results
##   mtry  Accuracy    Kappa AccuracySD   KappaSD
## 1   10 0.8330141 0.639339 0.03352578 0.0741429
# Assemble the submission table: one row per test passenger with the predicted
# Survived class.
PassengerId <- titanic.test$PassengerId
output.df <- as.data.frame(PassengerId)
output.df$Survived <- pred_test

# Write the table without row names. FALSE is spelled out because F is an
# ordinary, reassignable variable.
write.csv(output.df, file = "kaggle_rf.csv", row.names = FALSE)