Titanic Project

Loading the libraries

library(ggplot2)
library(dplyr)
library(randomForest)
library(caret)

Loading the data

# Work from the project folder; the relative file names used in this
# script are resolved against it.
setwd("C:/Users/tit0v/R/titanic-kaggle-project")

# Reads one comma-separated file with a header row, turning strings into
# factors and treating empty or single-space fields as NA.
read_titanic_csv <- function(path) {
  read.table(
    path,
    sep = ',',
    stringsAsFactors = TRUE,
    header = TRUE,
    na.strings = c("", " ")
  )
}

titanic.train <- read_titanic_csv("train.csv")
titanic.test <- read_titanic_csv("test.csv")

Creating an extra column to help split the dataset again after binding

# Mark where each row came from so the combined frame can be split back
# into train and test later. TRUE/FALSE are spelled out because the
# shorthands T and F are ordinary variables that can be reassigned.
titanic.train$istrain <- TRUE
titanic.test$istrain <- FALSE

Adding the Survived column to the test set

# Give the test frame an all-NA Survived column so its columns line up
# with the training frame when the two are row-bound.
titanic.test[["Survived"]] <- NA

Binding the train and test sets

# Stack train on top of test; rbind matches data-frame columns by name.
titanic.full <- rbind(titanic.train, titanic.test)

Categorical casting

# Inspect the column types of the combined frame before recoding them.
str(titanic.full)

## 'data.frame':    1309 obs. of  13 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 186 levels "A10","A14","A16",..: NA 82 NA 56 NA NA 130 NA NA NA ...
##  $ Embarked   : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
##  $ istrain    : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...

# These three columns are read in as integers; recode each as a factor.
factor_cols <- c("Pclass", "Parch", "SibSp")
titanic.full[factor_cols] <- lapply(titanic.full[factor_cols], as.factor)

Removing the Cabin column

# Drop the Cabin column from the combined frame.
titanic.full[["Cabin"]] <- NULL

Number of NA

# Count the missing values in every column (one integer per column).
vapply(titanic.full, function(col) sum(is.na(col)), integer(1))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0         418           0           0           0         263 
##       SibSp       Parch      Ticket        Fare    Embarked     istrain 
##           0           0           0           1           2           0

# Per-column NA counts, then show only the columns that have any.
na.values.df <- vapply(titanic.full, function(col) sum(is.na(col)), integer(1))
na.values.df[na.values.df > 0]

## Survived      Age     Fare Embarked 
##      418      263        1        2

Replacing the missing Embarked values

# Fill the missing ports of embarkation with "S".
titanic.full$Embarked[is.na(titanic.full$Embarked)] <- "S"
# Check the levels on the combined frame that was just modified,
# not on the pre-merge training frame.
levels(titanic.full$Embarked)

## [1] "C" "Q" "S"

# Frequency of each port after the fill-in.
table(titanic.full$Embarked)

## 
##   C   Q   S 
## 270 123 916

Imputing the missing Fare value with the median

# Median fare over the rows where Fare is known.
median(titanic.full$Fare, na.rm = TRUE)

## [1] 14.4542

# Show the row(s) whose Fare is missing.
titanic.full[is.na(titanic.full$Fare), ]

##      PassengerId Survived Pclass               Name  Sex  Age SibSp Parch
## 1044        1044       NA      3 Storey, Mr. Thomas male 60.5     0     0
##      Ticket Fare Embarked istrain
## 1044   3701   NA        S   FALSE

# Replace the missing fare(s) with the median of the known fares.
fare_missing <- is.na(titanic.full$Fare)
titanic.full$Fare[fare_missing] <- median(titanic.full$Fare, na.rm = TRUE)

Imputing the missing Age values with linear regression

# Impute missing ages with a linear model on Sex, SibSp and Pclass.

# Upper whisker end of the Age boxplot; ages above it are left out of the fit.
top_quant <- boxplot.stats(titanic.full$Age)$stats[5]

age_missing <- is.na(titanic.full$Age)

# Training rows: known age at or below the whisker. The explicit
# !age_missing guard matters: `Age <= top_quant` is NA for missing ages,
# and indexing rows with NA yields all-NA phantom rows.
agee <- titanic.full[!age_missing & titanic.full$Age <= top_quant, ]
na.age <- titanic.full[age_missing, ]

lm_model <- lm(Age ~ Sex + SibSp + Pclass, data = agee)

age_pred <- predict(lm_model, newdata = na.age[c("Sex", "SibSp", "Pclass")])

# Write the rounded predictions straight into the column.
titanic.full$Age[age_missing] <- round(age_pred)

Extracting the Title from Names

# Pull the title out of each name. The pattern matches from the comma up
# to the first period (", Mr.", ", the Countess."); the title kept is the
# first space-separated word after the comma, so a multi-word title such
# as "the Countess." yields "the".
title_pattern <- ",(\\s+\\w+)+\\."
name_chr <- as.character(titanic.full$Name)
title_matches <- regmatches(name_chr, regexec(title_pattern, name_chr))

# One pass over the match list instead of a 1:nrow loop that rewrites the
# data frame on every iteration. A name with no match gives NA rather
# than an error.
titanic.full$Title <- vapply(
  title_matches,
  function(m) strsplit(m[1], " ")[[1]][2],
  character(1)
)
levels(as.factor(titanic.full$Title))

##  [1] "Capt."     "Col."      "Don."      "Dona."     "Dr."       "Jonkheer."
##  [7] "Lady."     "Major."    "Master."   "Miss."     "Mlle."     "Mme."     
## [13] "Mr."       "Mrs."      "Ms."       "Rev."      "Sir."      "the"

# Collapse the rarer titles into two buckets; "Mr.", "Mrs." and "Miss."
# are left as they are.
# NOTE(review): "Master." is grouped with the high-class male titles;
# confirm that this is intended.
male_titles <- c(
  "Don.", "Rev.", "Dr.", "Major.", "Master.",
  "Sir.", "Col.", "Capt.", "Jonkheer."
)
female_titles <- c("Dona.", "Lady.", "Mlle.", "Mme.", "the", "Ms.")

titanic.full$Title[titanic.full$Title %in% male_titles] <- "High class male"
titanic.full$Title[titanic.full$Title %in% female_titles] <- "high class female"

Splitting the data and loading y_test

# Take the Title levels from the combined data so train and test get the
# same factor levels. Factoring each subset on its own can produce
# different level sets, which breaks predict() on the test frame.
title_levels <- sort(unique(titanic.full$Title))

# istrain is already logical, so it is used directly as the row filter.
titanic.train <- titanic.full[titanic.full$istrain, ]
titanic.test <- titanic.full[!titanic.full$istrain, ]

# The outcome must be a factor for classification.
titanic.train$Survived <- as.factor(titanic.train$Survived)
titanic.train$Title <- factor(titanic.train$Title, levels = title_levels)
titanic.test$Title <- factor(titanic.test$Title, levels = title_levels)
str(titanic.train)

## 'data.frame':    891 obs. of  13 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Pclass     : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 27 54 2 27 14 ...
##  $ SibSp      : Factor w/ 7 levels "0","1","2","3",..: 2 2 1 2 1 1 1 4 1 2 ...
##  $ Parch      : Factor w/ 8 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 3 1 ...
##  $ Ticket     : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Embarked   : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
##  $ istrain    : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ Title      : Factor w/ 5 levels "high class female",..: 4 5 3 5 4 4 4 2 5 5 ...

# Labels that the test-set predictions are compared against further down.
# NOTE(review): gender_submission.csv looks like Kaggle's sample
# submission rather than the true outcomes; confirm before treating the
# resulting accuracy as a real test score.
y_test <- read.table(
  "gender_submission.csv",
  sep = ',',
  stringsAsFactors = TRUE,
  header = TRUE
)

Random Forest

Fitting the model. As the plot shows, the OOB error barely changes after 100 trees.

# Fix the RNG seed so the forest, and the resampling that follows it,
# give the same results on every run.
set.seed(2020)

# First forest with the default number of trees, used to see how the
# error develops as trees are added.
# NOTE(review): nodesize is derived from the number of TEST rows;
# presumably the training row count was meant. Left unchanged; confirm.
rf_model <- randomForest(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
  data = titanic.train,
  nodesize = 0.01 * nrow(titanic.test)
)
plot(rf_model)

# Plain 10-fold cross-validation. `repeats` is only meaningful for
# method = "repeatedcv"; with "cv" it was ignored and raised a warning,
# so it is no longer passed.
tcontrol <- trainControl(method = "cv", number = 10)

# Candidate mtry values: 2, 4, ..., 12.
tunegrid <- expand.grid(mtry = seq(2, 12, 2))

# Cross-validated search over mtry, with 100 trees per forest.
modelRF <- train(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
  ntree = 100,
  data = titanic.train,
  method = "rf",
  trControl = tcontrol,
  tuneGrid = tunegrid
)
print(modelRF)

## Random Forest 
## 
## 891 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 802, 802, 802, 801, 802, 802, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8081523  0.5863965
##    4    0.8260799  0.6197813
##    6    0.8339326  0.6375999
##    8    0.8350562  0.6411427
##   10    0.8361548  0.6464815
##   12    0.8339326  0.6414916
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.

# Plot the resampling results of the mtry search.
plot(modelRF)

So we are going to choose 100 trees and 10 features (mtry = 10)

Random Forest

# Final forest with 100 trees. The formula has 8 predictors, so the
# previously requested mtry = 10 was out of range: randomForest warned and
# reset it to 8. The effective value is now written out explicitly, which
# gives the same model without the warning.
# NOTE(review): mtry = 8 means every predictor is tried at each split, and
# nodesize is derived from the number of TEST rows; confirm both.
rf2_model <- randomForest(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
  ntree = 100,
  mtry = 8,
  data = titanic.train,
  nodesize = 0.01 * nrow(titanic.test)
)

# Predict with the final model (rf2_model) on both sets so the train and
# test confusion matrices describe the same model. pred_train was
# previously taken from the first forest, rf_model.
predictor_cols <- c(
  "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "Title"
)
pred_test <- predict(rf2_model, newdata = titanic.test[predictor_cols])
pred_train <- predict(rf2_model, newdata = titanic.train[predictor_cols])

Confusion Matrix

# Test-set predictions against the labels loaded into y_test.
confusionMatrix(data = pred_test, reference = as.factor(y_test$Survived))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 235  34
##          1  31 118
##                                           
##                Accuracy : 0.8445          
##                  95% CI : (0.8061, 0.8779)
##     No Information Rate : 0.6364          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6626          
##                                           
##  Mcnemar's Test P-Value : 0.8041          
##                                           
##             Sensitivity : 0.8835          
##             Specificity : 0.7763          
##          Pos Pred Value : 0.8736          
##          Neg Pred Value : 0.7919          
##              Prevalence : 0.6364          
##          Detection Rate : 0.5622          
##    Detection Prevalence : 0.6435          
##       Balanced Accuracy : 0.8299          
##                                           
##        'Positive' Class : 0               
##

# Training-set predictions against the known training outcomes.
confusionMatrix(
  data = pred_train,
  reference = as.factor(titanic.train$Survived)
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 526  70
##          1  23 272
##                                           
##                Accuracy : 0.8956          
##                  95% CI : (0.8737, 0.9149)
##     No Information Rate : 0.6162          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7735          
##                                           
##  Mcnemar's Test P-Value : 1.842e-06       
##                                           
##             Sensitivity : 0.9581          
##             Specificity : 0.7953          
##          Pos Pred Value : 0.8826          
##          Neg Pred Value : 0.9220          
##              Prevalence : 0.6162          
##          Detection Rate : 0.5903          
##    Detection Prevalence : 0.6689          
##       Balanced Accuracy : 0.8767          
##                                           
##        'Positive' Class : 0               
##

Cross validation

# 10-fold cross-validation repeated 10 times, at the single chosen mtry.
tcontrol2 <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10
)
modelRF2 <- train(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
  ntree = 100,
  data = titanic.train,
  method = "rf",
  trControl = tcontrol2,
  tuneGrid = expand.grid(.mtry = c(10))
)
modelRF2$results

##   mtry  Accuracy    Kappa AccuracySD   KappaSD
## 1   10 0.8330141 0.639339 0.03352578 0.0741429

# Build the two-column submission (PassengerId, Survived) from the
# test-set predictions and write it without row names.
PassengerId <- titanic.test$PassengerId
output.df <- data.frame(PassengerId = PassengerId)
output.df[["Survived"]] <- pred_test

write.csv(output.df, file = "kaggle_rf.csv", row.names = FALSE)

Titanic Project

Stavros Oikonomou

22/3/2020

Loading the libraries

Loading the data

Creating an extra column to help split the dataset again after binding

Adding the survived column for test set

binding the train and test set

Categorical casting

removing cabin column

Number of NA

removing Embarked NA

Removing NA from Fare

removing NA from Age with Linear Regression

Extracting the Title from Names

Splitting the data and loading y_test

Random Forest

Fitting the model. As the plot shows, the OOB error barely changes after 100 trees.

So we are going to choose 100 trees and 10 features (mtry = 10)

Random Forest

Confusion Matrix

Cross validation