Dataset taken from kaggle: (https://www.kaggle.com/c/titanic)
Goal: It is your job to predict if a passenger survived the sinking of the Titanic or not.
For each PassengerId in the test set, you must predict a 0 or 1 value for the Survived variable.
Metric: Your score is the percentage of passengers you correctly predict. This is known simply as “accuracy”.
Variables:
2 datasets are provided: train and test. Test is without the “survived” information. We will start by doing an EDA on the train dataset.
test <- read.csv("C:/Users/Ndee/Downloads/all/test.csv", stringsAsFactors=FALSE)
train <- read.csv("C:/Users/Ndee/Downloads/all/train.csv", stringsAsFactors=FALSE)
summary(train)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
eda_train <- na.omit(train) # remove the NA for the EDA
# chart
hist(eda_train$Age)
hist(eda_train$Fare)
table(eda_train$Survived, eda_train$Sex) # female had a better probablity to survived
##
## female male
## 0 64 360
## 1 197 93
table(eda_train$Survived, eda_train$Embarked) # interestingly people from Cherbourg survived above 50%
##
## C Q S
## 0 0 51 20 353
## 1 2 79 8 201
boxplot(Fare~Embarked,data=eda_train, main="Ticket fare and Port of embarkment",
xlab="Port of Embarkation", ylab="Passenger fare")
# People who embarked
table(eda_train$Pclass, eda_train$Embarked)
##
## C Q S
## 1 2 74 2 108
## 2 0 15 2 156
## 3 0 41 24 290
# There is a higher ratio of class 1 among passengers, around 57%
pairs(eda_train[, c(2,3,6,7,8,10)])
boxplot(Age~Pclass,data=eda_train, main="Age per Ticket Class",
xlab="Ticket Class", ylab="Age")
hist(eda_train$SibSp)
hist(eda_train$Parch)
rm(eda_train)
~ 20% of Age variable is missing:
We will fill the NA information with the median age of individuals from the same categories. We will check the name and use the median of other Ms. / Ms / Mr etc…
drop Name column
# variable class change
#train$Survived <- as.numeric(train$Survived)
train$Pclass <- as.character(train$Pclass)
# Dealing with NA
## column Embarked:
train$Embarked[train$Embarked == ""] <- "S"
## missing age
mr_match <- grepl("Mr.", x = train$Name, fixed = TRUE)
miss_match <- grepl("Miss.", x = train$Name, fixed = TRUE)
mrs_match <- grepl("Mrs.", x = train$Name, fixed = TRUE)
master_match <- grepl("Master", x = train$Name, fixed = TRUE)
rev_match <- grepl("Rev", x = train$Name, fixed = TRUE)
doc_match <- grepl("Dr.", x = train$Name, fixed = TRUE)
# 864 out of 891 (so 97%) had one of those titles
# We use median because the density show the Mr. age is right skewed
train$Age[mr_match == TRUE & is.na(train$Age)] <- median(na.omit(train$Age[mr_match == TRUE]))
train$Age[miss_match == TRUE & is.na(train$Age)] <- median(na.omit(train$Age[miss_match == TRUE]))
train$Age[mrs_match == TRUE & is.na(train$Age)] <- median(na.omit(train$Age[mrs_match == TRUE]))
train$Age[master_match == TRUE & is.na(train$Age)] <- median(na.omit(train$Age[master_match == TRUE]))
train$Age[rev_match == TRUE & is.na(train$Age)] <- median(na.omit(train$Age[rev_match == TRUE]))
train$Age[doc_match == TRUE & is.na(train$Age)] <- median(na.omit(train$Age[doc_match == TRUE]))
# transform discrete variables into new columns
# Embarked
train$EmbarkedS <- ifelse(train$Embarked == "S", train$EmbarkedS <- 1, train$EmbarkedS <- 0)
train$EmbarkedC <- ifelse(train$Embarked == "C", train$EmbarkedC <- 1, train$EmbarkedC <- 0)
train$EmbarkedQ <- ifelse(train$Embarked == "Q", train$EmbarkedQ <- 1, train$EmbarkedQ <- 0)
# Pclass
train$Pclass1 <- ifelse(train$Pclass == "1", train$Pclass1 <- 1, train$Pclass1 <- 0)
train$Pclass2 <- ifelse(train$Pclass == "2", train$Pclass2 <- 1, train$Pclass2 <- 0)
train$Pclass3 <- ifelse(train$Pclass == "3", train$Pclass3 <- 1, train$Pclass3 <- 0)
# Sex
train$SexF <- ifelse(train$Sex == "female", train$SexF <- 1, train$SexF <- 0)
train$SexM <- ifelse(train$Sex == "male", train$SexM <- 1, train$SexM <- 0)
# remove Name, PassegengerID, Cabin, Ticket, Sex, Pclass, Embarked
train <- train[, -c(1,3,4,5,11,12,9)]
# to avoid singularities, we need to remove one class per category
# let`s remove EmbarkedS, Pclass1, SexM
train <- train[, -c(6,9,13)]
We will divide the “train” dataset that we were given into 60% for training, 20% for cross validation and 20% for final testing.
# 60% training & 20% test and 20% cross validation randomly sampled
set.seed(99)
ds <- train[sample(1:nrow(train)), ]
d_train <- ds[1:535, ]
d_test <- ds[536:713, ]
d_cv <- ds[714:891, ]
rm(train)
# simple algorythm
glm1 <- glm(family = "binomial", Survived ~ ., data = d_train)
summary(glm1)
##
## Call:
## glm(formula = Survived ~ ., family = "binomial", data = d_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6641 -0.5300 -0.3874 0.5679 2.5059
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.891560 0.540498 1.650 0.09904 .
## Age -0.045277 0.010841 -4.177 2.96e-05 ***
## SibSp -0.390311 0.144286 -2.705 0.00683 **
## Parch -0.035114 0.165662 -0.212 0.83214
## Fare 0.001469 0.003120 0.471 0.63769
## EmbarkedC 0.107349 0.322681 0.333 0.73938
## EmbarkedQ 0.167319 0.466062 0.359 0.71959
## Pclass2 -0.771282 0.397049 -1.943 0.05207 .
## Pclass3 -2.006578 0.399185 -5.027 4.99e-07 ***
## SexF 2.956503 0.268880 10.996 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 698.46 on 534 degrees of freedom
## Residual deviance: 445.79 on 525 degrees of freedom
## AIC: 465.79
##
## Number of Fisher Scoring iterations: 5
# Pclass3, SexF, Age and SibSp look promising with a low p value
# Checking the Accuracy
pred1 <- predict(glm1, d_cv, type = "response")
result1 <- ifelse(pred1 > 0.5, result1 <- 1, result1 <- 0)
print(table(result1, d_cv$Survived))
##
## result1 0 1
## 0 97 24
## 1 12 45
print(sum((result1 - d_cv$Survived)^2) / 178) # error rate
## [1] 0.2022472
print((97+45)/178) # accuracy: true positive + true negative divided by total examples
## [1] 0.7977528
79.8% accuracy on the cross validation set
We will use the regsubsets function to select the key variables
reg_glm2 <- regsubsets(Survived ~., d_train)
reg_sum <- summary(reg_glm2)
print(reg_sum)
## Subset selection object
## Call: regsubsets.formula(Survived ~ ., d_train)
## 9 Variables (and intercept)
## Forced in Forced out
## Age FALSE FALSE
## SibSp FALSE FALSE
## Parch FALSE FALSE
## Fare FALSE FALSE
## EmbarkedC FALSE FALSE
## EmbarkedQ FALSE FALSE
## Pclass2 FALSE FALSE
## Pclass3 FALSE FALSE
## SexF FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## Age SibSp Parch Fare EmbarkedC EmbarkedQ Pclass2 Pclass3 SexF
## 1 ( 1 ) " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " " " " " " " "*" "*"
## 3 ( 1 ) "*" " " " " " " " " " " " " "*" "*"
## 4 ( 1 ) "*" "*" " " " " " " " " " " "*" "*"
## 5 ( 1 ) "*" "*" " " " " " " " " "*" "*" "*"
## 6 ( 1 ) "*" "*" " " "*" " " " " "*" "*" "*"
## 7 ( 1 ) "*" "*" " " "*" " " "*" "*" "*" "*"
## 8 ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" "*"
par(mfrow= c(2,2))
plot(reg_sum$rss, xlab = "number of variables", ylab = "RSS", type = "b")
plot(reg_sum$adjr2, xlab = "Number of variables", ylab = "Adjusted Rsqr", type = "b")
which.max(reg_sum$adjr2)
## [1] 5
points(5, reg_sum$adjr2[5], col= "red", cex= 2, pch= 20)
plot(reg_sum$cp, xlab = "Number of variables", ylab = "cp", type = "b")
which.min(reg_sum$cp)
## [1] 5
points(5, reg_sum$cp[5], col= "red", cex= 2, pch= 20)
plot(reg_sum$bic, xlab = "Number of variables", ylab = "BIC", type = "b")
which.min(reg_sum$bic)
## [1] 5
points(5, reg_sum$bic[5], col= "red", cex= 2, pch= 20)
rm(reg_glm2)
# taking 5 variable into account
glm2 <- glm(family = "binomial", Survived ~ SibSp + Age + Pclass2 + Pclass3 + SexF, data = d_train)
summary(glm2)
##
## Call:
## glm(formula = Survived ~ SibSp + Age + Pclass2 + Pclass3 + SexF,
## family = "binomial", data = d_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6754 -0.5325 -0.3920 0.5656 2.5581
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.06612 0.45758 2.330 0.01981 *
## SibSp -0.39900 0.13610 -2.932 0.00337 **
## Age -0.04612 0.01071 -4.305 1.67e-05 ***
## Pclass2 -0.89437 0.34350 -2.604 0.00922 **
## Pclass3 -2.11780 0.32099 -6.598 4.17e-11 ***
## SexF 2.97569 0.25882 11.497 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 698.46 on 534 degrees of freedom
## Residual deviance: 446.29 on 529 degrees of freedom
## AIC: 458.29
##
## Number of Fisher Scoring iterations: 5
# Checking the Accuracy
pred2 <- predict(glm2, d_cv, type = "response")
result2 <- ifelse(pred2 > 0.5, result2 <- 1, result2 <- 0)
print(table(result2, d_cv$Survived))
##
## result2 0 1
## 0 97 24
## 1 12 45
print(sum((result2 - d_cv$Survived)^2) / 178) # error rate
## [1] 0.2022472
print((97+45)/178) # accuracy: true positive + true negative divided by total examples
## [1] 0.7977528
# the exact same result
Although we have the same accuracy result, we gain in interpretatibility
We used so far 0.5 as the threshold for logistic regression. We will optimize this using accuracy (true positive + true negative divided by the total examples)
error_cv <- rep(0, 100)
predict_cv <- predict(glm2, d_cv, type = "response")
for (i in 1:100){
predict <- ifelse(predict_cv > i*0.01, predict <- 1, predict <- 0)
error_cv[i] <- sum((predict - d_cv$Survived)^2) / 178
}
error_cv <- data.frame(threshold = seq(0.01,1,0.01), value = error_cv)
plot(y= error_cv$value, x= error_cv$threshold, xlab = "Threshold", ylab = "Error rate", main = "Choosing the right threshold")
print(which.min(error_cv$value)) # 0.59
## [1] 59
# accuracy
result3 <- ifelse(predict_cv > 0.59, result3 <- 1, result3 <- 0)
print(table(result3, d_cv$Survived))
##
## result3 0 1
## 0 101 25
## 1 8 44
print(sum((result3 - d_cv$Survived)^2) / 178) # error rate
## [1] 0.1853933
print((101+44)/178) # accuracy
## [1] 0.8146067
rm(error_cv, predict_cv, result3)
It improves the overall accuracy to 81% but the share of survivor which has been correctly guessed is not really good
Learning curve on the CV set. We will evaluate the training error on the first i training example and perform the error rate on the ENTIRE test set.
We will calculate the error rate with the misclassification error as follow: it is 1 if I am wrong 0 if it is well predicted
m <- nrow(d_train)
error_train <- vector(mode = "numeric", length = m-20)
error_cv <- vector(mode = "numeric", length = m-20)
for (i in 20:m){
glm3 <- glm(family = "binomial", Survived ~ SibSp + Age + Pclass2 + Pclass3 + SexF, data = d_train[1:i, ])
# error on train
predict_train <- predict(glm3, d_train[1:i, ], type = "response")
predict_train <- ifelse(predict_train > 0.59, predict_train <- 1,
predict_train <- 0)
error_train[i-1] <- sum((predict_train - d_train$Survived[1:i])^2) / i
# sum all errors (each error counting as 1) divided by the number of cases
# error on cv
predict_cv <- predict(glm3, d_cv, type = "response")
predict_cv <- ifelse(predict_cv > 0.59, predict_cv <- 1,
predict_cv <- 0)
error_cv[i-1] <- sum((predict_cv - d_cv$Survived)^2) / 178
# the error cv is done on the whole set
}
# plot and interpretation
learning_curve <- data.frame(error_train, error_cv, i= 1:length(error_cv)+20)
learning_curve <- gather(learning_curve, category, value, -i)
ggplot(learning_curve, aes(x= i, y= value, colour= category, group= category))+
geom_line(size= 1.5)+
theme_bw()+
ggtitle("Learning curve on the cross validation set")
The learning curve seems to indicate that we suffer from high bias. As we get more and more training examples, the cross validation error is not going down much.
Potential solutions include to get new features and / or to add polynomial features.
Although not presented here, I did try to create a new feature based on what I could see as a trend among the wrong predictions. However it didn’t improve the cross validation accuracy. Moreover, polynomial features on the Age variable was not successful neither. Therefore, I decided to move forward and check the accuracy on the test set before submitting the final test set to kaggle.
# Checking the Accuracy
pred3 <- predict(glm2, d_test, type = "response")
result3 <- ifelse(pred3 > 0.59, result3 <- 1, result3 <- 0)
print(table(result3, d_test$Survived))
##
## result3 0 1
## 0 85 33
## 1 12 48
print(sum((result3 - d_test$Survived)^2) / 178) # error rate
## [1] 0.252809
print((85+48)/178) # accuracy: true positive + true negative divided by total examples
## [1] 0.747191
The test set accuracy is a bit lower, with 74.7% of accuracy.
# variable class change
test$Pclass <- as.character(test$Pclass)
# Dealing with NA
## column Embarked:
test$Embarked[test$Embarked == ""] <- "S"
## missing age
mr_match <- grepl("Mr.", x = test$Name, fixed = TRUE)
miss_match <- grepl("Miss.", x = test$Name, fixed = TRUE)
mrs_match <- grepl("Mrs.", x = test$Name, fixed = TRUE)
master_match <- grepl("Master", x = test$Name, fixed = TRUE)
rev_match <- grepl("Rev", x = test$Name, fixed = TRUE)
doc_match <- grepl("Dr.", x = test$Name, fixed = TRUE)
# We use median because the density show the Mr. age is right skewed
test$Age[mr_match == TRUE & is.na(test$Age)] <- median(na.omit(test$Age[mr_match == TRUE]))
test$Age[miss_match == TRUE & is.na(test$Age)] <- median(na.omit(test$Age[miss_match == TRUE]))
test$Age[mrs_match == TRUE & is.na(test$Age)] <- median(na.omit(test$Age[mrs_match == TRUE]))
test$Age[master_match == TRUE & is.na(test$Age)] <- median(na.omit(test$Age[master_match == TRUE]))
test$Age[rev_match == TRUE & is.na(test$Age)] <- median(na.omit(test$Age[rev_match == TRUE]))
test$Age[doc_match == TRUE & is.na(test$Age)] <- median(na.omit(test$Age[doc_match == TRUE]))
# row 980 is without age and she is a Ms. So we will add the median of mrs
test$Age[test$PassengerId == 980] <- 36.5
# transform discrete variables into new columns
# Embarked
test$EmbarkedS <- ifelse(test$Embarked == "S", test$EmbarkedS <- 1, test$EmbarkedS <- 0)
test$EmbarkedC <- ifelse(test$Embarked == "C", test$EmbarkedC <- 1, test$EmbarkedC <- 0)
test$EmbarkedQ <- ifelse(test$Embarked == "Q", test$EmbarkedQ <- 1, test$EmbarkedQ <- 0)
# Pclass
test$Pclass1 <- ifelse(test$Pclass == "1", test$Pclass1 <- 1, test$Pclass1 <- 0)
test$Pclass2 <- ifelse(test$Pclass == "2", test$Pclass2 <- 1, test$Pclass2 <- 0)
test$Pclass3 <- ifelse(test$Pclass == "3", test$Pclass3 <- 1, test$Pclass3 <- 0)
# Sex
test$SexF <- ifelse(test$Sex == "female", test$SexF <- 1, test$SexF <- 0)
test$SexM <- ifelse(test$Sex == "male", test$SexM <- 1, test$SexM <- 0)
# remove columns
test <- test[, -c(2,3,4,10,11,12,8,15,19)]
# Checking the Accuracy
pred_final <- predict(glm2, test, type = "response")
result_final <- ifelse(pred_final > 0.59, result_final <- 1, result_final <- 0)
titanic_test <- data.frame(PassengerId <- test$PassengerId,
Survived <- result_final)
colnames(titanic_test) <- c("PassengerId", "Survived")
# write.csv(titanic_test, "titanic_test.csv", row.names=FALSE)
The final result given by kaggle is 0.78947 accuracy