library(RColorBrewer) library(h2o) library(ggsci) library(forcats) library(broom) library(WVPlots) library(caret) library(pROC) library(recipes) library(readr) library(rlang) library(httr) library(jsonlite) library(data.table) library(tidyr) library(ROCR) library(vip) library(stringr) library(scales) library(ggtext) library(ggplot2) library(dplyr) library(showtext) library(extrafont) library(viridis) library(rpart) library(rpart.plot) library(ranger) library(randomForest) library(rattle)
titanic.test <- fread(“data/test.csv”) titanic.train <-fread(“data/train.csv”) gen.subm <- fread(“data/gender_submission.csv”)
titanic.train1 <- titanic.train %>% mutate(Pclass = factor(Pclass), Sex = factor(Sex), Survived = factor(Survived)) %>% filter(Age != is.na(Age)) %>% select(-c(“Ticket”, “Cabin”, “Name”))
loadfonts(device = “win”) font_add(“Palatino”, “pala.ttf”) showtext_auto() theme1 <- theme_minimal(base_size = 30) + theme(plot.subtitle = element_text(face = “italic”), text = element_text(family = “Palatino”)) color1 <- pal_jama(palette = “default”)(3)[1] color2 <- pal_jama(palette = “default”)(3)[2] color3 <- pal_jama(palette = “default”)(3)[3]
my_theme <- function(type = c(“discrete”, “continuous”)) { if(type == “discrete”) { list(theme1, scale_colour_manual(values = c(color1, color3, color2)), scale_fill_manual(values = c(color1, color3, color2))) } else { list(theme1, scale_fill_continuous(high = color1, low = color3)) } }
titanic.train2 <- titanic.train1 %>% mutate(Survived = fct_recode(Survived, “Survivor” = “1”, “Deceased” = “0”), Sex = fct_recode(Sex, “Male” = “male”, “Female” = “female”), Family_size = factor(SibSp + Parch + 1))
vline_df <- titanic.train2 %>% group_by(Survived, Sex, .drop = FALSE) %>% summarise(Mean = mean(Age), Median = median(Age)) %>% as.data.frame() %>% mutate(Comment_median = paste(“Median Age:”, Median), Comment_mean = paste(“Mean Age:”, round(Mean, 2)))
titanic.train2 %>% ggplot(aes(x = Age)) + geom_density(data = titanic.train2[Survived == “Survivor”], aes(x = Age, y = ..density.., color = Survived, fill = Survived), alpha = 0.3, size = 0.8) + geom_density(data = titanic.train2[Survived == “Deceased”], aes(x = Age, y = -..density.., color = Survived, fill = Survived), alpha = 0.3, size = 0.8) + geom_segment(data = vline_df %>% filter(Survived == “Survivor”), aes(x = Mean, y = 0 , xend = Mean, yend = Inf, color = Survived), size = 1) + geom_segment(data = vline_df %>% filter(Survived == “Deceased”), aes(x = Mean, y = 0 , xend = Mean, yend = -Inf, color = Survived), size = 1) + geom_segment(data = vline_df %>% filter(Survived == “Survivor”), aes(x = Median, y = 0 , xend = Median, yend = Inf, color = Survived), linetype = ‘dashed’, size = 1) + geom_segment(data = vline_df %>% filter(Survived == “Deceased”), aes(x = Median, y = 0 , xend = Median, yend = -Inf, color = Survived), linetype = ‘dashed’, size = 1) + geom_text(data = vline_df %>% filter(Survived == “Survivor”), y = 0.02, aes(x = 70, label = paste(Comment_mean,“”,Comment_median), color = Survived), family = “Palatino”, size = 9) + geom_text(data = vline_df %>% filter(Survived == “Deceased”), y = -0.02, aes(x = 70, label = paste(Comment_mean,“”,Comment_median), color = Survived), family = “Palatino”, size = 9) + coord_flip() + labs( title = “Density of Age Per Sex” ) + my_theme(type = “discrete”) + theme(axis.text.x = element_blank(), axis.title.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank()) + facet_wrap(~Sex)
titanic.train2 %>% filter(Survived == “Deceased”) %>% filter(Fare != is.na(Fare)) %>% group_by(grp_age = cut(Age, breaks = seq(0,80, by = 10)), grp_fare = cut(Fare, breaks = c(seq(0,100, by = 20), 520)), Survived) %>% summarise(n = n()) %>% ggplot(aes(x = grp_age, y = grp_fare)) + geom_tile(aes(fill = n), width = 0.9, height = 0.9) + geom_text(aes(x = grp_age, y = grp_fare, label = n), family = “Palatino”, color = “white”, size = 20) + labs( title = “Number Of People per Age and Fare Group - Deceased Passengers”, y = “Fare Group”, x = “Age Group”, fill = “Count” ) + my_theme(type = “continuous”) + theme(legend.position = ‘none’, panel.grid.major.x = element_blank(), panel.grid.major.y = element_blank()) + scale_fill_gradient2(low = “lightblue”, high = color1, trans = “log10”)
titanic.train2 %>% mutate(Family_size = factor(Family_size)) %>% ggplot(aes(x = Family_size)) + geom_bar(data = titanic.train2 %>% filter(Survived == “Survivor”), aes(y = ..count.., fill = Survived)) + geom_bar(data = titanic.train2 %>% filter(Survived == “Deceased”), aes(y = -..count.., fill = Survived)) + geom_text(data = titanic.train2 %>% filter(Survived == “Survivor”), stat = ‘count’,aes(label = ..count.., y = ..count..), family = “Palatino”, hjust = -0.5, size = 15) + geom_text(data = titanic.train2 %>% filter(Survived == “Deceased”), stat = ‘count’,aes(label = ..count.., y = -..count..), family = “Palatino”, hjust = 1.25, size = 15) + scale_x_discrete(limits = rev) + scale_y_continuous(limits = c(-285, 285)) + coord_flip() + labs( title = “Number of Individuals Per Family Size”, x = “Size of Family” ) + my_theme(type = “discrete”) + theme(axis.text.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), axis.title.x = element_blank())
titanic.train3 <- titanic.train2 %>% group_by(grp_age = cut(Age, breaks = seq(0,80, by = 20)),Survived, Sex, Pclass) %>% count() %>% group_by(grp_age, Sex, Pclass) %>% mutate(prop = n/sum(n))
titanic.train3
titanic.train3 %>% ggplot(aes(x = grp_age, y = prop, fill = Survived)) + geom_col(position = ‘fill’) + geom_text(data = titanic.train3 %>% filter(Survived == “Survivor”), aes(x = grp_age, y = 0.2, label = paste(percent(prop))), color = “white”, family = “Palatino”, size = 13) + geom_text(data = titanic.train3, aes(x = grp_age, y = 0.1, label = paste(“(”,n/prop,“)”)), color = “white”, family = “Palatino”, size = 13) + facet_grid(Pclass~Sex) + my_theme(type = “discrete”) + theme(plot.subtitle = element_text(color = color3, family=“Palatino”, face = “italic”), panel.grid.major.y = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.y = element_blank()) + labs( title = “Survival Rate Analysis”, subtitle = “Percentage inside each bar indicates survival rate”, y = “Survival Rate”, x = “Age Group” )
set.seed(123)
ctrl <- trainControl(method = “cv”, number = 10)
(cv_model_pls <- train( Survived ~ ., data = titanic.train1, method = “pls”, family = “binomial”, preProcess = c(“zv”,“center”, “scale”), trControl = ctrl, tuneLenght = 8 ))
ggplot(cv_model_pls) + geom_line(color = color1) + geom_point(color = color3, size = 3) + geom_text(aes(x = ncomp, y = Accuracy, label = percent(Accuracy)), family = “Palatino” , vjust = -1, size = 13) + my_theme(type = “discrete”) + labs( title = “Principal component analysis of Accuracy”, x = “Number of Components” )
vip1 <- vip(cv_model_pls, num_features = 9, method = “model”)
vip1$data %>% ggplot(aes(x = Importance, y = reorder(Variable, Importance), fill = Importance)) + geom_col() + my_theme(type = “continuous”) + labs( title = “Component Importance”, y = “Components” )
titanic.train1$survival_pred <- predict(cv_model_pls, titanic.train1) model_prob <- predict(cv_model_pls, newdata = titanic.train1, type = “prob”)
titanic.train1 <- titanic.train1 %>% mutate(survival_prob = model_prob[,2])
cfMatrix <- confusionMatrix( data = relevel(titanic.train1\(survival_pred, ref = "1"), reference = relevel(titanic.train1\)Survived, ref = “1”) )
cfMatrix
ROC <- roc(titanic.train1\(Survived, as.numeric(titanic.train1\)survival_pred))
titanic.train2 <- titanic.train1 %>% mutate(Survived = as.numeric(as.character(Survived)), survival_prob = as.numeric(as.character(survival_prob)))
ROCPlot(titanic.train2, “survival_prob”, “Survived”, truthTarget = TRUE, title = “ROC Curve”, curve_color = color3) + my_theme(type = “discrete”) + labs( title = “ROC Curve” )
cfMatrix_plot <- as.data.frame(cfMatrix$table) cfMatrix_plot <- cfMatrix_plot %>% mutate(Prediction = fct_relevel(Prediction, “1”, after = 1), Reference = fct_relevel(Reference, “1”, after = 1)) %>% group_by(Prediction) %>% mutate(Accuracy_prediction = Freq/sum(Freq)) %>% ungroup() %>% group_by(Reference) %>% mutate(Accuracy_reference = Freq/sum(Freq))
cfMatrix_plot
cfMatrix_plot %>% ggplot(aes(x = Prediction, y = Reference, fill = Freq)) + geom_tile() + geom_text(aes(x = Prediction, y = Reference, label = Freq), family = “Palatino”, color = “white”, size = 20) + my_theme(type = “continuous”) + theme(legend.position = “none”) + labs(title = “Confusion Matrix”)
titanic.train1 %>% mutate(Sex = fct_recode(Sex , “Male” = “male”, “Female” = “female”)) %>% ggplot(aes(x = Age, y = survival_prob, color = Pclass)) + geom_point() + geom_smooth(method = “glm”, method.args = list(family = “binomial”), se = FALSE) + facet_wrap(~Sex) + my_theme(type = “discrete”) + labs( title = “Survival Probability with Logistic Regression”, y = “Survival Probability”, color = “Passenger Class” ) titanic.train1 %>% ggplot(aes(x = survival_prob, y = as.numeric(as.character(Survived)))) + geom_point(alpha = 0.5, color = color3) + stat_smooth(method = “glm”, method.args = list(family = “binomial”), se = FALSE, color = color1) + my_theme(type = “discrete”) + labs( title = “Prediction Probabilities vs Actual Results”, y = “Actual Results”, x = “Prediction Probabilities” )
titanic.train.recipe <- titanic.train %>% select(-c(“Name”, “Ticket”, “Cabin”))
Test.recipe <- recipe(Survived ~ ., data = titanic.train.recipe) %>% step_impute_knn(Age, impute_with = imp_vars(Pclass, Fare, SibSp, Parch), neighbors = 6) %>% step_impute_knn(Fare, impute_with = imp_vars(Pclass, Age, SibSp, Parch), neighbors = 6)
Test.recipe
prepare <- prep(Test.recipe, training = titanic.train.recipe, strings_as_factors = TRUE)
titanic.test1 <- titanic.test %>% mutate(Survived = as.integer(0)) %>% relocate(Survived, .after = PassengerId) %>% select(-c(“Ticket”, “Cabin”, “Name”))
baked.test1 <- bake(prepare, new_data = titanic.test1)
baked.test1 <- baked.test1 %>% mutate(Pclass = factor(Pclass), Sex = factor(Sex))
baked.test1$prediction <- predict(cv_model_pls, baked.test1)
write.csv(baked.test1$prediction, “output.csv”, row.names = TRUE)