loading required libraries

library(RColorBrewer) library(h2o) library(ggsci) library(forcats) library(broom) library(WVPlots) library(caret) library(pROC) library(recipes) library(readr) library(rlang) library(httr) library(jsonlite) library(data.table) library(tidyr) library(ROCR) library(vip) library(stringr) library(scales) library(ggtext) library(ggplot2) library(dplyr) library(showtext) library(extrafont) library(viridis) library(rpart) library(rpart.plot) library(ranger) library(randomForest) library(rattle)

loading data

titanic.test <- fread(“data/test.csv”) titanic.train <-fread(“data/train.csv”) gen.subm <- fread(“data/gender_submission.csv”)

get rid of NA

titanic.train1 <- titanic.train %>% mutate(Pclass = factor(Pclass), Sex = factor(Sex), Survived = factor(Survived)) %>% filter(Age != is.na(Age)) %>% select(-c(“Ticket”, “Cabin”, “Name”))

make themes to be used later

loadfonts(device = “win”) font_add(“Palatino”, “pala.ttf”) showtext_auto() theme1 <- theme_minimal(base_size = 30) + theme(plot.subtitle = element_text(face = “italic”), text = element_text(family = “Palatino”)) color1 <- pal_jama(palette = “default”)(3)[1] color2 <- pal_jama(palette = “default”)(3)[2] color3 <- pal_jama(palette = “default”)(3)[3]

my_theme <- function(type = c(“discrete”, “continuous”)) { if(type == “discrete”) { list(theme1, scale_colour_manual(values = c(color1, color3, color2)), scale_fill_manual(values = c(color1, color3, color2))) } else { list(theme1, scale_fill_continuous(high = color1, low = color3)) } }

Exploratory Data Analysis

age relevance

titanic.train2 <- titanic.train1 %>% mutate(Survived = fct_recode(Survived, “Survivor” = “1”, “Deceased” = “0”), Sex = fct_recode(Sex, “Male” = “male”, “Female” = “female”), Family_size = factor(SibSp + Parch + 1))

vline_df <- titanic.train2 %>% group_by(Survived, Sex, .drop = FALSE) %>% summarise(Mean = mean(Age), Median = median(Age)) %>% as.data.frame() %>% mutate(Comment_median = paste(“Median Age:”, Median), Comment_mean = paste(“Mean Age:”, round(Mean, 2)))

titanic.train2 %>% ggplot(aes(x = Age)) + geom_density(data = titanic.train2[Survived == “Survivor”], aes(x = Age, y = ..density.., color = Survived, fill = Survived), alpha = 0.3, size = 0.8) + geom_density(data = titanic.train2[Survived == “Deceased”], aes(x = Age, y = -..density.., color = Survived, fill = Survived), alpha = 0.3, size = 0.8) + geom_segment(data = vline_df %>% filter(Survived == “Survivor”), aes(x = Mean, y = 0 , xend = Mean, yend = Inf, color = Survived), size = 1) + geom_segment(data = vline_df %>% filter(Survived == “Deceased”), aes(x = Mean, y = 0 , xend = Mean, yend = -Inf, color = Survived), size = 1) + geom_segment(data = vline_df %>% filter(Survived == “Survivor”), aes(x = Median, y = 0 , xend = Median, yend = Inf, color = Survived), linetype = ‘dashed’, size = 1) + geom_segment(data = vline_df %>% filter(Survived == “Deceased”), aes(x = Median, y = 0 , xend = Median, yend = -Inf, color = Survived), linetype = ‘dashed’, size = 1) + geom_text(data = vline_df %>% filter(Survived == “Survivor”), y = 0.02, aes(x = 70, label = paste(Comment_mean,“”,Comment_median), color = Survived), family = “Palatino”, size = 9) + geom_text(data = vline_df %>% filter(Survived == “Deceased”), y = -0.02, aes(x = 70, label = paste(Comment_mean,“”,Comment_median), color = Survived), family = “Palatino”, size = 9) + coord_flip() + labs( title = “Density of Age Per Sex” ) + my_theme(type = “discrete”) + theme(axis.text.x = element_blank(), axis.title.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank()) + facet_wrap(~Sex)

fare price relevance

titanic.train2 %>% filter(Survived == “Deceased”) %>% filter(Fare != is.na(Fare)) %>% group_by(grp_age = cut(Age, breaks = seq(0,80, by = 10)), grp_fare = cut(Fare, breaks = c(seq(0,100, by = 20), 520)), Survived) %>% summarise(n = n()) %>% ggplot(aes(x = grp_age, y = grp_fare)) + geom_tile(aes(fill = n), width = 0.9, height = 0.9) + geom_text(aes(x = grp_age, y = grp_fare, label = n), family = “Palatino”, color = “white”, size = 20) + labs( title = “Number Of People per Age and Fare Group - Deceased Passengers”, y = “Fare Group”, x = “Age Group”, fill = “Count” ) + my_theme(type = “continuous”) + theme(legend.position = ‘none’, panel.grid.major.x = element_blank(), panel.grid.major.y = element_blank()) + scale_fill_gradient2(low = “lightblue”, high = color1, trans = “log10”)

family size

titanic.train2 %>% mutate(Family_size = factor(Family_size)) %>% ggplot(aes(x = Family_size)) + geom_bar(data = titanic.train2 %>% filter(Survived == “Survivor”), aes(y = ..count.., fill = Survived)) + geom_bar(data = titanic.train2 %>% filter(Survived == “Deceased”), aes(y = -..count.., fill = Survived)) + geom_text(data = titanic.train2 %>% filter(Survived == “Survivor”), stat = ‘count’,aes(label = ..count.., y = ..count..), family = “Palatino”, hjust = -0.5, size = 15) + geom_text(data = titanic.train2 %>% filter(Survived == “Deceased”), stat = ‘count’,aes(label = ..count.., y = -..count..), family = “Palatino”, hjust = 1.25, size = 15) + scale_x_discrete(limits = rev) + scale_y_continuous(limits = c(-285, 285)) + coord_flip() + labs( title = “Number of Individuals Per Family Size”, x = “Size of Family” ) + my_theme(type = “discrete”) + theme(axis.text.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), axis.title.x = element_blank())

combining most important variables which are age, sex and class and look at the survival rate within each group

titanic.train3 <- titanic.train2 %>% group_by(grp_age = cut(Age, breaks = seq(0,80, by = 20)),Survived, Sex, Pclass) %>% count() %>% group_by(grp_age, Sex, Pclass) %>% mutate(prop = n/sum(n))

titanic.train3

titanic.train3 %>% ggplot(aes(x = grp_age, y = prop, fill = Survived)) + geom_col(position = ‘fill’) + geom_text(data = titanic.train3 %>% filter(Survived == “Survivor”), aes(x = grp_age, y = 0.2, label = paste(percent(prop))), color = “white”, family = “Palatino”, size = 13) + geom_text(data = titanic.train3, aes(x = grp_age, y = 0.1, label = paste(“(”,n/prop,“)”)), color = “white”, family = “Palatino”, size = 13) + facet_grid(Pclass~Sex) + my_theme(type = “discrete”) + theme(plot.subtitle = element_text(color = color3, family=“Palatino”, face = “italic”), panel.grid.major.y = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.y = element_blank()) + labs( title = “Survival Rate Analysis”, subtitle = “Percentage inside each bar indicates survival rate”, y = “Survival Rate”, x = “Age Group” )

setting logistic regression model

set.seed(123)

ctrl <- trainControl(method = “cv”, number = 10)

(cv_model_pls <- train( Survived ~ ., data = titanic.train1, method = “pls”, family = “binomial”, preProcess = c(“zv”,“center”, “scale”), trControl = ctrl, tuneLenght = 8 ))

ggplot(cv_model_pls) + geom_line(color = color1) + geom_point(color = color3, size = 3) + geom_text(aes(x = ncomp, y = Accuracy, label = percent(Accuracy)), family = “Palatino” , vjust = -1, size = 13) + my_theme(type = “discrete”) + labs( title = “Principal component analysis of Accuracy”, x = “Number of Components” )

vip1 <- vip(cv_model_pls, num_features = 9, method = “model”)

vip1$data %>% ggplot(aes(x = Importance, y = reorder(Variable, Importance), fill = Importance)) + geom_col() + my_theme(type = “continuous”) + labs( title = “Component Importance”, y = “Components” )

titanic.train1$survival_pred <- predict(cv_model_pls, titanic.train1) model_prob <- predict(cv_model_pls, newdata = titanic.train1, type = “prob”)

titanic.train1 <- titanic.train1 %>% mutate(survival_prob = model_prob[,2])

cfMatrix <- confusionMatrix( data = relevel(titanic.train1\(survival_pred, ref = "1"), reference = relevel(titanic.train1\)Survived, ref = “1”) )

cfMatrix

ROC <- roc(titanic.train1\(Survived, as.numeric(titanic.train1\)survival_pred))

titanic.train2 <- titanic.train1 %>% mutate(Survived = as.numeric(as.character(Survived)), survival_prob = as.numeric(as.character(survival_prob)))

ROCPlot(titanic.train2, “survival_prob”, “Survived”, truthTarget = TRUE, title = “ROC Curve”, curve_color = color3) + my_theme(type = “discrete”) + labs( title = “ROC Curve” )

cfMatrix_plot <- as.data.frame(cfMatrix$table) cfMatrix_plot <- cfMatrix_plot %>% mutate(Prediction = fct_relevel(Prediction, “1”, after = 1), Reference = fct_relevel(Reference, “1”, after = 1)) %>% group_by(Prediction) %>% mutate(Accuracy_prediction = Freq/sum(Freq)) %>% ungroup() %>% group_by(Reference) %>% mutate(Accuracy_reference = Freq/sum(Freq))

cfMatrix_plot

cfMatrix_plot %>% ggplot(aes(x = Prediction, y = Reference, fill = Freq)) + geom_tile() + geom_text(aes(x = Prediction, y = Reference, label = Freq), family = “Palatino”, color = “white”, size = 20) + my_theme(type = “continuous”) + theme(legend.position = “none”) + labs(title = “Confusion Matrix”)

titanic.train1 %>% mutate(Sex = fct_recode(Sex , “Male” = “male”, “Female” = “female”)) %>% ggplot(aes(x = Age, y = survival_prob, color = Pclass)) + geom_point() + geom_smooth(method = “glm”, method.args = list(family = “binomial”), se = FALSE) + facet_wrap(~Sex) + my_theme(type = “discrete”) + labs( title = “Survival Probability with Logistic Regression”, y = “Survival Probability”, color = “Passenger Class” ) titanic.train1 %>% ggplot(aes(x = survival_prob, y = as.numeric(as.character(Survived)))) + geom_point(alpha = 0.5, color = color3) + stat_smooth(method = “glm”, method.args = list(family = “binomial”), se = FALSE, color = color1) + my_theme(type = “discrete”) + labs( title = “Prediction Probabilities vs Actual Results”, y = “Actual Results”, x = “Prediction Probabilities” )

Applying the model

titanic.train.recipe <- titanic.train %>% select(-c(“Name”, “Ticket”, “Cabin”))

Test.recipe <- recipe(Survived ~ ., data = titanic.train.recipe) %>% step_impute_knn(Age, impute_with = imp_vars(Pclass, Fare, SibSp, Parch), neighbors = 6) %>% step_impute_knn(Fare, impute_with = imp_vars(Pclass, Age, SibSp, Parch), neighbors = 6)

Test.recipe

prepare <- prep(Test.recipe, training = titanic.train.recipe, strings_as_factors = TRUE)

titanic.test1 <- titanic.test %>% mutate(Survived = as.integer(0)) %>% relocate(Survived, .after = PassengerId) %>% select(-c(“Ticket”, “Cabin”, “Name”))

baked.test1 <- bake(prepare, new_data = titanic.test1)

baked.test1 <- baked.test1 %>% mutate(Pclass = factor(Pclass), Sex = factor(Sex))

baked.test1$prediction <- predict(cv_model_pls, baked.test1)

save the output as csv

write.csv(baked.test1$prediction, “output.csv”, row.names = TRUE)