Titanic Data Exploration and Machine Learning

Loading Libraries and Data Cleaning

library(caret)
library(rpart)
library(rpart.plot)
library(C50)
library(neuralnet)
library(class)
library(readr)
library(GGally)
# Load the Titanic training data from the user's home directory.
train <- read_csv(file = "~/Titanic/train.csv")
# Before any cleaning, report how many NA values each column contains.
print(vapply(train, function(column) sum(is.na(column)), numeric(1)))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0         687           2

# Count empty strings per column. Comparing an NA cell with "" yields NA, and a
# single NA turns the whole column sum into NA, so NA cells are dropped before
# summing. The NA cells themselves are counted by the is.na() check above.
print(colSums(train == "", na.rm = TRUE))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0           0 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0           0           0

# Age: fill the gaps with the median of the observed ages.
train$Age <- replace(
  train$Age,
  is.na(train$Age),
  median(train$Age, na.rm = TRUE)
)
# Embarked: fill the gaps with the most frequent port (the mode).
most_common_embarked <- names(which.max(table(train$Embarked)))
train$Embarked <- replace(
  train$Embarked,
  is.na(train$Embarked),
  most_common_embarked
)
# Cabin: label the gaps explicitly instead of leaving them as NA.
train$Cabin <- replace(train$Cabin, is.na(train$Cabin), 'Unknown')
# Confirm that no NA values remain in any column.
print(colSums(is.na(train)))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0           0 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0           0           0

# Cap outliers at the 1.5 * IQR fences.
#
# Despite its name, this function does not merely flag outliers: every value
# below Q1 - 1.5 * IQR is raised to that lower fence and every value above
# Q3 + 1.5 * IQR is lowered to the upper fence. Values inside the fences are
# returned unchanged, and NA entries stay NA.
#
# x: numeric vector (may contain NA).
# Returns a numeric vector of the same length as x.
identify_outliers <- function(x) {
  # TRUE is spelled out because `T` is an ordinary variable that can be
  # reassigned. unname() keeps the "25%"/"75%" labels out of the result.
  quartiles <- unname(quantile(x, probs = c(0.25, 0.75), na.rm = TRUE))
  margin <- 1.5 * IQR(x, na.rm = TRUE)
  lower_fence <- quartiles[1] - margin
  upper_fence <- quartiles[2] + margin
  # Element-wise clamp: pmax lifts low values, pmin lowers high values.
  pmin(pmax(x, lower_fence), upper_fence)
}
# Cap extreme Fare and Age values at their IQR fences.
train[["Fare"]] <- identify_outliers(train[["Fare"]])
train[["Age"]] <- identify_outliers(train[["Age"]])
# Drop the free-text columns that are not used as predictors.
train <- train[, setdiff(names(train), c("Name", "Cabin", "Ticket"))]
# One 0/1 indicator column per embarkation port and per sex.
train[paste0("Embarked_", c("S", "C", "Q"))] <- lapply(
  c("S", "C", "Q"),
  function(port) as.numeric(train$Embarked == port)
)
train[paste0("Sex_", c("male", "female"))] <- lapply(
  c("male", "female"),
  function(sex) as.numeric(train$Sex == sex)
)
# The original categorical columns are now redundant.
train <- train[, setdiff(names(train), c("Sex", "Embarked"))]
# Min-max scaling helper: linearly rescale a numeric vector to [0, 1].
#
# x:     numeric vector.
# na.rm: if TRUE, NA values are ignored when finding the minimum and maximum
#        and remain NA in the result. With the default FALSE, any NA in x
#        makes the whole result NA.
# Returns a numeric vector of the same length as x. A constant vector maps to
# zeros rather than NaN, because dividing by a zero range is avoided.
normalize <- function(x, na.rm = FALSE) {
  bounds <- range(x, na.rm = na.rm)
  span <- bounds[2] - bounds[1]
  # isTRUE() also covers the NA span produced when x contains NA and
  # na.rm = FALSE; that case falls through and yields all NA.
  if (isTRUE(span == 0)) {
    return(x - bounds[1])
  }
  (x - bounds[1]) / span
}
# Rescale the continuous and count predictors to [0, 1].
train[c("Age", "Fare", "Parch", "SibSp")] <-
  lapply(train[c("Age", "Fare", "Parch", "SibSp")], normalize)
# Splitting data into training and test sets (70% / 30%).
# The seed is fixed so the random split, and every metric computed from it,
# is reproducible between runs. Outputs recorded in this document were
# produced before the seed was fixed, so re-running gives different numbers.
set.seed(42)
# The outcome is passed as a factor so that createDataPartition samples within
# each class; a numeric outcome is binned by percentiles instead.
index <- createDataPartition(factor(train$Survived), p = 0.7, list = FALSE)
train_set <- train[index, ]
test_set <- train[-index, ]
# Converting Survived to a factor for the classifiers
train_set$Survived <- as.factor(train_set$Survived)
test_set$Survived <- as.factor(test_set$Survived)

Pairplot

# Keep only the numeric columns and draw a pairwise plot matrix of them.
num.cols <- vapply(train, is.numeric, logical(1))
train_numeric <- train[num.cols]
ggpairs(train_numeric)

K-Nearest Neighbors (KNN)

# k-nearest-neighbours classifier on three predictors; each test row is
# labelled from its 3 nearest training rows.
# NOTE(review): Age and SibSp were rescaled to [0, 1] earlier, but Pclass is
# still on its raw scale and so probably dominates the distance. Confirm
# whether that is intended.
k <- 3
knn_vars <- c("Age", "Pclass", "SibSp")
pred_knn <- factor(
  knn(
    train_set[, knn_vars],
    test_set[, knn_vars],
    cl = train_set$Survived,
    k = k
  ),
  levels = levels(test_set$Survived)
)
# Compare the predicted classes against the held-out labels.
cm_knn <- confusionMatrix(data = pred_knn, reference = test_set$Survived)
print(cm_knn)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 141  56
##          1  23  47
##                                           
##                Accuracy : 0.7041          
##                  95% CI : (0.6454, 0.7582)
##     No Information Rate : 0.6142          
##     P-Value [Acc > NIR] : 0.0013459       
##                                           
##                   Kappa : 0.3361          
##                                           
##  Mcnemar's Test P-Value : 0.0003179       
##                                           
##             Sensitivity : 0.8598          
##             Specificity : 0.4563          
##          Pos Pred Value : 0.7157          
##          Neg Pred Value : 0.6714          
##              Prevalence : 0.6142          
##          Detection Rate : 0.5281          
##    Detection Prevalence : 0.7378          
##       Balanced Accuracy : 0.6580          
##                                           
##        'Positive' Class : 0               
##

C5.0 Decision Tree

# Train a C5.0 classification tree on the sex, class, age, fare and port
# columns of the training split.
c50_model <- C5.0(
  Survived ~ Sex_male + Sex_female + Pclass + Age + Fare +
    Embarked_S + Embarked_C + Embarked_Q,
  data = train_set
)
# Score the held-out rows and give the predictions the same factor levels as
# the reference labels.
c50_pred <- factor(
  predict(c50_model, newdata = test_set),
  levels = levels(test_set$Survived)
)
# Evaluate the C5.0 predictions against the held-out labels.
cm_c50 <- confusionMatrix(data = c50_pred, reference = test_set$Survived)
print(cm_c50)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 143  32
##          1  21  71
##                                           
##                Accuracy : 0.8015          
##                  95% CI : (0.7485, 0.8476)
##     No Information Rate : 0.6142          
##     P-Value [Acc > NIR] : 3.634e-11       
##                                           
##                   Kappa : 0.5726          
##                                           
##  Mcnemar's Test P-Value : 0.1696          
##                                           
##             Sensitivity : 0.8720          
##             Specificity : 0.6893          
##          Pos Pred Value : 0.8171          
##          Neg Pred Value : 0.7717          
##              Prevalence : 0.6142          
##          Detection Rate : 0.5356          
##    Detection Prevalence : 0.6554          
##       Balanced Accuracy : 0.7806          
##                                           
##        'Positive' Class : 0               
##

# Draw the fitted C5.0 tree with a descriptive title.
plot(
  c50_model,
  main = "C5.0 Decision Tree"
)

Neural Network

# The network is fitted on a numeric response, so map the factor labels back
# to the numbers 0 and 1 in both splits.
train_set$Survived <- as.numeric(levels(train_set$Survived))[train_set$Survived]
test_set$Survived <- as.numeric(levels(test_set$Survived))[test_set$Survived]
# Fit a network with a single hidden layer of 3 units.
net <- neuralnet(
  Survived ~ Age + Fare + Pclass + SibSp + Embarked_S + Embarked_C +
    Embarked_Q + Sex_female + Parch,
  data = train_set, hidden = 3, rep = 1, lifesign.step = 220000
)
# Round the numeric outputs to the nearest integer to get class predictions.
# NOTE(review): rounding does not guarantee a value of 0 or 1; any other value
# simply counts as a miss below. Confirm this is acceptable.
net.pred <- round(predict(net, test_set))
# Accuracy is the share of held-out rows whose rounded prediction matches.
accuracy <- sum(net.pred == test_set$Survived) / nrow(test_set)
print(accuracy)

## [1] 0.7977528

CART Decision Tree

# Survived was turned into numbers for the neural network; make it a two-level
# factor (0, 1) again for classification.
train_set$Survived <- factor(train_set$Survived, levels = 0:1)
test_set$Survived <- factor(test_set$Survived, levels = 0:1)
# Fit a CART classification tree on the same predictors as the C5.0 model.
cart_model <- rpart(
  Survived ~ Sex_male + Sex_female + Pclass + Age + Fare +
    Embarked_S + Embarked_C + Embarked_Q,
  data = train_set,
  method = "class"
)
# Predict classes for the held-out rows, pinning the factor levels to 0 and 1.
cart_pred <- factor(
  predict(cart_model, newdata = test_set, type = "class"),
  levels = 0:1
)
# Evaluate the CART predictions against the held-out labels.
cm_cart <- confusionMatrix(data = cart_pred, reference = test_set$Survived)
print(cm_cart)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 148  37
##          1  16  66
##                                           
##                Accuracy : 0.8015          
##                  95% CI : (0.7485, 0.8476)
##     No Information Rate : 0.6142          
##     P-Value [Acc > NIR] : 3.634e-11       
##                                           
##                   Kappa : 0.5646          
##                                           
##  Mcnemar's Test P-Value : 0.00601         
##                                           
##             Sensitivity : 0.9024          
##             Specificity : 0.6408          
##          Pos Pred Value : 0.8000          
##          Neg Pred Value : 0.8049          
##              Prevalence : 0.6142          
##          Detection Rate : 0.5543          
##    Detection Prevalence : 0.6929          
##       Balanced Accuracy : 0.7716          
##                                           
##        'Positive' Class : 0               
##

# Visualise the fitted CART model twice: first with rpart.plot()'s defaults,
# then with prp() using explicit type/extra display options.
rpart.plot(cart_model)

prp(cart_model, extra = 2, type = 4)

Titanic Data Exploration and Machine Learning

Reda Abouzaid

2023-06-05