Introduction

Dataset taken from kaggle: (https://www.kaggle.com/c/titanic)

Goal: It is your job to predict if a passenger survived the sinking of the Titanic or not.

For each PassengerId in the test set, you must predict a 0 or 1 value for the Survived variable.

Metric: Your score is the percentage of passengers you correctly predict. This is known simply as “accuracy”.

Variables:

2 datasets are provided: train and test. Test is without the “survived” information. We will start by doing an EDA on the train dataset.

EDA

test <- read.csv("C:/Users/Ndee/Downloads/all/test.csv", stringsAsFactors=FALSE)
train <- read.csv("C:/Users/Ndee/Downloads/all/train.csv", stringsAsFactors=FALSE)
summary(train)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 
eda_train <- na.omit(train) # remove the NA for the EDA 

# chart
hist(eda_train$Age)

hist(eda_train$Fare)

table(eda_train$Survived, eda_train$Sex) # female had a better probablity to survived
##    
##     female male
##   0     64  360
##   1    197   93
table(eda_train$Survived, eda_train$Embarked) # interestingly people from Cherbourg survived above 50%
##    
##           C   Q   S
##   0   0  51  20 353
##   1   2  79   8 201
boxplot(Fare~Embarked,data=eda_train, main="Ticket fare and Port of embarkment",
   xlab="Port of Embarkation", ylab="Passenger fare") 

# People who embarked 
table(eda_train$Pclass, eda_train$Embarked)
##    
##           C   Q   S
##   1   2  74   2 108
##   2   0  15   2 156
##   3   0  41  24 290
# There is a higher ratio of class 1 among passengers, around 57%
pairs(eda_train[, c(2,3,6,7,8,10)])

boxplot(Age~Pclass,data=eda_train, main="Age per Ticket Class",
   xlab="Ticket Class", ylab="Age") 

hist(eda_train$SibSp)

hist(eda_train$Parch)

rm(eda_train)

Cleaning

~ 20% of Age variable is missing:

# variable class change
#train$Survived <- as.numeric(train$Survived)
train$Pclass <- as.character(train$Pclass)

# Dealing with NA

## column Embarked: 
train$Embarked[train$Embarked == ""] <- "S"

## missing age

mr_match <- grepl("Mr.", x = train$Name, fixed = TRUE)
miss_match <- grepl("Miss.", x = train$Name, fixed = TRUE)
mrs_match <- grepl("Mrs.", x = train$Name, fixed = TRUE)
master_match <- grepl("Master", x = train$Name, fixed = TRUE)
rev_match <- grepl("Rev", x = train$Name, fixed = TRUE)
doc_match <- grepl("Dr.", x = train$Name, fixed = TRUE)
# 864 out of 891 (so 97%) had one of those titles
# We use median because the density show the Mr. age is right skewed 
train$Age[mr_match == TRUE & is.na(train$Age)] <- median(na.omit(train$Age[mr_match == TRUE]))

train$Age[miss_match == TRUE & is.na(train$Age)] <-   median(na.omit(train$Age[miss_match == TRUE]))

train$Age[mrs_match == TRUE & is.na(train$Age)] <-   median(na.omit(train$Age[mrs_match == TRUE]))

train$Age[master_match == TRUE & is.na(train$Age)] <- median(na.omit(train$Age[master_match == TRUE]))

train$Age[rev_match == TRUE & is.na(train$Age)] <- median(na.omit(train$Age[rev_match == TRUE]))

train$Age[doc_match == TRUE & is.na(train$Age)] <- median(na.omit(train$Age[doc_match == TRUE]))


# transform discrete variables into new columns
# Embarked
train$EmbarkedS <- ifelse(train$Embarked == "S", train$EmbarkedS <- 1, train$EmbarkedS <- 0)
train$EmbarkedC <- ifelse(train$Embarked == "C", train$EmbarkedC <- 1, train$EmbarkedC <- 0)
train$EmbarkedQ <- ifelse(train$Embarked == "Q", train$EmbarkedQ <- 1, train$EmbarkedQ <- 0)

# Pclass
train$Pclass1 <- ifelse(train$Pclass == "1", train$Pclass1 <- 1, train$Pclass1 <- 0)
train$Pclass2 <- ifelse(train$Pclass == "2", train$Pclass2 <- 1, train$Pclass2 <- 0)
train$Pclass3 <- ifelse(train$Pclass == "3", train$Pclass3 <- 1, train$Pclass3 <- 0)

# Sex
train$SexF <- ifelse(train$Sex == "female", train$SexF <- 1, train$SexF <- 0)
train$SexM <- ifelse(train$Sex == "male", train$SexM <- 1, train$SexM <- 0)

# remove Name, PassegengerID, Cabin, Ticket, Sex, Pclass, Embarked
train <- train[, -c(1,3,4,5,11,12,9)]

# to avoid singularities, we need to remove one class per category
# let`s remove EmbarkedS, Pclass1, SexM
train <- train[, -c(6,9,13)]

Simple algorythm and accuracy result

We will divide the “train” dataset that we were given into 60% for training, 20% for cross validation and 20% for final testing.

# 60% training & 20% test and 20% cross validation randomly sampled
set.seed(99)
ds <- train[sample(1:nrow(train)), ]

d_train <- ds[1:535, ]
d_test <- ds[536:713, ]
d_cv <- ds[714:891, ]

rm(train)

# simple algorythm
glm1 <- glm(family = "binomial", Survived ~ ., data = d_train)
summary(glm1)
## 
## Call:
## glm(formula = Survived ~ ., family = "binomial", data = d_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6641  -0.5300  -0.3874   0.5679   2.5059  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.891560   0.540498   1.650  0.09904 .  
## Age         -0.045277   0.010841  -4.177 2.96e-05 ***
## SibSp       -0.390311   0.144286  -2.705  0.00683 ** 
## Parch       -0.035114   0.165662  -0.212  0.83214    
## Fare         0.001469   0.003120   0.471  0.63769    
## EmbarkedC    0.107349   0.322681   0.333  0.73938    
## EmbarkedQ    0.167319   0.466062   0.359  0.71959    
## Pclass2     -0.771282   0.397049  -1.943  0.05207 .  
## Pclass3     -2.006578   0.399185  -5.027 4.99e-07 ***
## SexF         2.956503   0.268880  10.996  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 698.46  on 534  degrees of freedom
## Residual deviance: 445.79  on 525  degrees of freedom
## AIC: 465.79
## 
## Number of Fisher Scoring iterations: 5
# Pclass3, SexF, Age and SibSp look promising with a low p value

# Checking the Accuracy
pred1 <- predict(glm1, d_cv, type = "response")
result1 <- ifelse(pred1 > 0.5, result1 <- 1, result1 <- 0)

print(table(result1, d_cv$Survived))
##        
## result1  0  1
##       0 97 24
##       1 12 45
print(sum((result1 - d_cv$Survived)^2) / 178) # error rate
## [1] 0.2022472
print((97+45)/178) # accuracy: true positive + true negative divided by total examples 
## [1] 0.7977528

79.8% accuracy on the cross validation set

Variable selection

We will use the regsubsets function to select the key variables

reg_glm2 <- regsubsets(Survived ~., d_train)
reg_sum <- summary(reg_glm2)
print(reg_sum)
## Subset selection object
## Call: regsubsets.formula(Survived ~ ., d_train)
## 9 Variables  (and intercept)
##           Forced in Forced out
## Age           FALSE      FALSE
## SibSp         FALSE      FALSE
## Parch         FALSE      FALSE
## Fare          FALSE      FALSE
## EmbarkedC     FALSE      FALSE
## EmbarkedQ     FALSE      FALSE
## Pclass2       FALSE      FALSE
## Pclass3       FALSE      FALSE
## SexF          FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          Age SibSp Parch Fare EmbarkedC EmbarkedQ Pclass2 Pclass3 SexF
## 1  ( 1 ) " " " "   " "   " "  " "       " "       " "     " "     "*" 
## 2  ( 1 ) " " " "   " "   " "  " "       " "       " "     "*"     "*" 
## 3  ( 1 ) "*" " "   " "   " "  " "       " "       " "     "*"     "*" 
## 4  ( 1 ) "*" "*"   " "   " "  " "       " "       " "     "*"     "*" 
## 5  ( 1 ) "*" "*"   " "   " "  " "       " "       "*"     "*"     "*" 
## 6  ( 1 ) "*" "*"   " "   "*"  " "       " "       "*"     "*"     "*" 
## 7  ( 1 ) "*" "*"   " "   "*"  " "       "*"       "*"     "*"     "*" 
## 8  ( 1 ) "*" "*"   " "   "*"  "*"       "*"       "*"     "*"     "*"
par(mfrow= c(2,2))
plot(reg_sum$rss, xlab = "number of variables", ylab = "RSS", type = "b")
plot(reg_sum$adjr2, xlab = "Number of variables", ylab = "Adjusted Rsqr", type = "b")
which.max(reg_sum$adjr2)
## [1] 5
points(5, reg_sum$adjr2[5], col= "red", cex= 2, pch= 20)

plot(reg_sum$cp, xlab = "Number of variables", ylab = "cp", type = "b")
which.min(reg_sum$cp)
## [1] 5
points(5, reg_sum$cp[5], col= "red", cex= 2, pch= 20)

plot(reg_sum$bic, xlab = "Number of variables", ylab = "BIC", type = "b")
which.min(reg_sum$bic)
## [1] 5
points(5, reg_sum$bic[5], col= "red", cex= 2, pch= 20)

rm(reg_glm2)

# taking 5 variable into account
glm2 <- glm(family = "binomial", Survived ~ SibSp + Age + Pclass2 + Pclass3 + SexF, data = d_train)

summary(glm2)
## 
## Call:
## glm(formula = Survived ~ SibSp + Age + Pclass2 + Pclass3 + SexF, 
##     family = "binomial", data = d_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6754  -0.5325  -0.3920   0.5656   2.5581  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.06612    0.45758   2.330  0.01981 *  
## SibSp       -0.39900    0.13610  -2.932  0.00337 ** 
## Age         -0.04612    0.01071  -4.305 1.67e-05 ***
## Pclass2     -0.89437    0.34350  -2.604  0.00922 ** 
## Pclass3     -2.11780    0.32099  -6.598 4.17e-11 ***
## SexF         2.97569    0.25882  11.497  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 698.46  on 534  degrees of freedom
## Residual deviance: 446.29  on 529  degrees of freedom
## AIC: 458.29
## 
## Number of Fisher Scoring iterations: 5
# Checking the Accuracy
pred2 <- predict(glm2, d_cv, type = "response")
result2 <- ifelse(pred2 > 0.5, result2 <- 1, result2 <- 0)

print(table(result2, d_cv$Survived))
##        
## result2  0  1
##       0 97 24
##       1 12 45
print(sum((result2 - d_cv$Survived)^2) / 178) # error rate
## [1] 0.2022472
print((97+45)/178) # accuracy: true positive + true negative divided by total examples
## [1] 0.7977528
# the exact same result

Although we have the same accuracy result, we gain in interpretatibility

Choosing the threshold

We used so far 0.5 as the threshold for logistic regression. We will optimize this using accuracy (true positive + true negative divided by the total examples)

error_cv <- rep(0, 100)
predict_cv <- predict(glm2, d_cv, type = "response")

for (i in 1:100){
  
  predict <- ifelse(predict_cv > i*0.01, predict <- 1, predict <- 0)
  error_cv[i] <- sum((predict - d_cv$Survived)^2) / 178

}

error_cv <- data.frame(threshold = seq(0.01,1,0.01), value = error_cv)
plot(y= error_cv$value, x= error_cv$threshold, xlab = "Threshold", ylab = "Error rate", main = "Choosing the right threshold")

print(which.min(error_cv$value)) # 0.59
## [1] 59
# accuracy
result3 <- ifelse(predict_cv > 0.59, result3 <- 1, result3 <- 0)

print(table(result3, d_cv$Survived))
##        
## result3   0   1
##       0 101  25
##       1   8  44
print(sum((result3 - d_cv$Survived)^2) / 178) # error rate
## [1] 0.1853933
print((101+44)/178) # accuracy
## [1] 0.8146067
rm(error_cv, predict_cv, result3)

It improves the overall accuracy to 81% but the share of survivor which has been correctly guessed is not really good

Learning curve

Learning curve on the CV set. We will evaluate the training error on the first i training example and perform the error rate on the ENTIRE test set.

We will calculate the error rate with the misclassification error as follow: it is 1 if I am wrong 0 if it is well predicted

m <- nrow(d_train)

error_train <- vector(mode = "numeric", length = m-20)
error_cv <- vector(mode = "numeric", length = m-20)

for (i in 20:m){
  
  glm3 <- glm(family = "binomial", Survived ~ SibSp + Age + Pclass2 + Pclass3 + SexF, data = d_train[1:i, ])
  
  # error on train
  predict_train <- predict(glm3, d_train[1:i, ], type = "response")
  predict_train <- ifelse(predict_train > 0.59, predict_train <- 1, 
                          predict_train <- 0)
  error_train[i-1] <- sum((predict_train - d_train$Survived[1:i])^2) / i
  # sum all errors (each error counting as 1) divided by the number of cases
  
  # error on cv
  predict_cv <- predict(glm3, d_cv, type = "response")
  predict_cv <- ifelse(predict_cv > 0.59, predict_cv <- 1, 
                          predict_cv <- 0)
  error_cv[i-1] <- sum((predict_cv - d_cv$Survived)^2) / 178  
  # the error cv is done on the whole set
}

# plot and interpretation
learning_curve <- data.frame(error_train, error_cv, i= 1:length(error_cv)+20) 
learning_curve <- gather(learning_curve, category, value, -i)


ggplot(learning_curve, aes(x= i, y= value, colour= category, group= category))+
  geom_line(size= 1.5)+
  theme_bw()+
  ggtitle("Learning curve on the cross validation set")

The learning curve seems to indicate that we suffer from high bias. As we get more and more training examples, the cross validation error is not going down much.

Potential solutions include to get new features and / or to add polynomial features.

Although not presented here, I did try to create a new feature based on what I could see as a trend among the wrong predictions. However it didn’t improve the cross validation accuracy. Moreover, polynomial features on the Age variable was not successful neither. Therefore, I decided to move forward and check the accuracy on the test set before submitting the final test set to kaggle.

Test set accuracy

# Checking the Accuracy
pred3 <- predict(glm2, d_test, type = "response")
result3 <- ifelse(pred3 > 0.59, result3 <- 1, result3 <- 0)

print(table(result3, d_test$Survived))
##        
## result3  0  1
##       0 85 33
##       1 12 48
print(sum((result3 - d_test$Survived)^2) / 178) # error rate
## [1] 0.252809
print((85+48)/178) # accuracy: true positive + true negative divided by total examples
## [1] 0.747191

The test set accuracy is a bit lower, with 74.7% of accuracy.

Preparing the final test set

# variable class change
test$Pclass <- as.character(test$Pclass)

# Dealing with NA
## column Embarked: 
test$Embarked[test$Embarked == ""] <- "S"

## missing age
mr_match <- grepl("Mr.", x = test$Name, fixed = TRUE)
miss_match <- grepl("Miss.", x = test$Name, fixed = TRUE)
mrs_match <- grepl("Mrs.", x = test$Name, fixed = TRUE)
master_match <- grepl("Master", x = test$Name, fixed = TRUE)
rev_match <- grepl("Rev", x = test$Name, fixed = TRUE)
doc_match <- grepl("Dr.", x = test$Name, fixed = TRUE)
# We use median because the density show the Mr. age is right skewed 
test$Age[mr_match == TRUE & is.na(test$Age)] <-  median(na.omit(test$Age[mr_match == TRUE]))

test$Age[miss_match == TRUE & is.na(test$Age)] <-  median(na.omit(test$Age[miss_match == TRUE]))

test$Age[mrs_match == TRUE & is.na(test$Age)] <- median(na.omit(test$Age[mrs_match == TRUE]))

test$Age[master_match == TRUE & is.na(test$Age)] <-  median(na.omit(test$Age[master_match == TRUE]))

test$Age[rev_match == TRUE & is.na(test$Age)] <- median(na.omit(test$Age[rev_match == TRUE]))

test$Age[doc_match == TRUE & is.na(test$Age)] <- median(na.omit(test$Age[doc_match == TRUE]))

# row 980 is without age and she is a Ms. So we will add the median of mrs
test$Age[test$PassengerId == 980] <- 36.5

# transform discrete variables into new columns

# Embarked
test$EmbarkedS <- ifelse(test$Embarked == "S", test$EmbarkedS <- 1, test$EmbarkedS <- 0)
test$EmbarkedC <- ifelse(test$Embarked == "C", test$EmbarkedC <- 1, test$EmbarkedC <- 0)
test$EmbarkedQ <- ifelse(test$Embarked == "Q", test$EmbarkedQ <- 1, test$EmbarkedQ <- 0)

# Pclass
test$Pclass1 <- ifelse(test$Pclass == "1", test$Pclass1 <- 1, test$Pclass1 <- 0)
test$Pclass2 <- ifelse(test$Pclass == "2", test$Pclass2 <- 1, test$Pclass2 <- 0)
test$Pclass3 <- ifelse(test$Pclass == "3", test$Pclass3 <- 1, test$Pclass3 <- 0)

# Sex
test$SexF <- ifelse(test$Sex == "female", test$SexF <- 1, test$SexF <- 0)
test$SexM <- ifelse(test$Sex == "male", test$SexM <- 1, test$SexM <- 0)

# remove columns
test <- test[, -c(2,3,4,10,11,12,8,15,19)]

Final file preparation for Kaggle

# Checking the Accuracy
pred_final <- predict(glm2, test, type = "response")
result_final <- ifelse(pred_final > 0.59, result_final <- 1, result_final <- 0)

titanic_test <- data.frame(PassengerId <- test$PassengerId,
                           Survived <- result_final)
colnames(titanic_test) <- c("PassengerId", "Survived")

# write.csv(titanic_test, "titanic_test.csv", row.names=FALSE)

The final result given by kaggle is 0.78947 accuracy