The Titanic dataset is a classic benchmark for binary classification. The goal is to predict whether a passenger survived based on demographic and travel details.
In this report, we will:
- Explore and clean the training data (train.csv)
- Build classification models to predict survival
- Evaluate model performance
- Use the trained model to predict survival for the test dataset (test.csv)
# Load libraries
library(tidyverse)  # includes ggplot2 and dplyr
library(caret)      # data partitioning, preprocessing, and model training
library(ROCR)       # ROC curves and AUC
# Load datasets
train <- read.csv("train.csv")
test <- read.csv("test.csv")
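As a quick sanity check on the load (the Kaggle files ship with 891 labeled training rows and 418 unlabeled test rows):
# Confirm dimensions of the two datasets
dim(train)
dim(test)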
2.1 Overview of the Data
# View structure and summary
str(train)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
summary(train)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
2.2 Missing Values
# Check missing values
colSums(is.na(train))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 0 0
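One caveat: is.na() misses values stored as empty strings. In the raw CSV, Cabin and Embarked encode missingness as "" (visible in the str() output above), so a complementary check helps:
# Count empty-string entries, which is.na() does not flag
colSums(train == "", na.rm = TRUE)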
2.3 Key Variables
# Survival distribution
train %>%
  ggplot(aes(x = factor(Survived), fill = factor(Survived))) +
  geom_bar() +
  labs(title = "Survival Distribution", x = "Survived", fill = "Survived")
# Survival by gender
train %>%
  ggplot(aes(x = Sex, fill = factor(Survived))) +
  geom_bar(position = "fill") +
  labs(title = "Survival by Gender", y = "Proportion", fill = "Survived")
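Passenger class is also used as a predictor below, so it is worth a similar look. A sketch mirroring the gender plot:
# Survival by passenger class
train %>%
  ggplot(aes(x = factor(Pclass), fill = factor(Survived))) +
  geom_bar(position = "fill") +
  labs(title = "Survival by Passenger Class", x = "Pclass",
       y = "Proportion", fill = "Survived")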
3.1 Handling Missing Values
# Fill missing Age with the training-set median; save the raw medians
# now so the same values can be reused on the test set before scaling
age_median <- median(train$Age, na.rm = TRUE)
fare_median <- median(train$Fare, na.rm = TRUE)
train$Age[is.na(train$Age)] <- age_median
# Embarked is missing as empty strings (""), not NA, so match on ""
train$Embarked[train$Embarked == ""] <- "S"
3.2 Feature Engineering
# Convert categorical variables to factors
train$Pclass <- factor(train$Pclass)
train$Survived <- factor(train$Survived)
# Create a new feature: FamilySize
train$FamilySize <- train$SibSp + train$Parch + 1
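The distribution of the new feature can be checked with a simple tabulation (output omitted here):
# Distribution of the engineered FamilySize feature
table(train$FamilySize)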
3.3 Scaling Numerical Features
# Scale Age and Fare; preProcess stores the training means and SDs,
# so the same transformation can be applied to the test set later
preProc <- preProcess(train[, c("Age", "Fare")], method = c("center", "scale"))
train[, c("Age", "Fare")] <- predict(preProc, train[, c("Age", "Fare")])
4.1 Splitting Data
# Hold out 20% of the labeled data for model evaluation
set.seed(123)
trainIndex <- createDataPartition(train$Survived, p = 0.8, list = FALSE)
trainData <- train[trainIndex, ]
testData <- train[-trainIndex, ]
4.2 Logistic Regression
# Train logistic regression model
logistic_model <- glm(Survived ~ Pclass + Sex + Age + FamilySize + Fare,
                      data = trainData,
                      family = binomial)
# Predict on test data
logistic_preds <- predict(logistic_model, newdata = testData, type = "response")
logistic_class <- ifelse(logistic_preds > 0.5, 1, 0)
# Confusion matrix
confusionMatrix(factor(logistic_class), testData$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 94 25
## 1 15 43
##
## Accuracy : 0.774
## 95% CI : (0.7052, 0.8334)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 5.385e-06
##
## Kappa : 0.5088
##
## Mcnemar's Test P-Value : 0.1547
##
## Sensitivity : 0.8624
## Specificity : 0.6324
## Pos Pred Value : 0.7899
## Neg Pred Value : 0.7414
## Prevalence : 0.6158
## Detection Rate : 0.5311
## Detection Prevalence : 0.6723
## Balanced Accuracy : 0.7474
##
## 'Positive' Class : 0
##
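Since ROCR was loaded earlier but not yet used, the logistic model's threshold-independent performance can also be examined. A minimal sketch (pred_obj and perf are names introduced here):
# ROC curve and AUC for the logistic model on the held-out split
pred_obj <- prediction(logistic_preds, testData$Survived)
perf <- performance(pred_obj, "tpr", "fpr")
plot(perf, main = "ROC Curve: Logistic Regression")
performance(pred_obj, "auc")@y.values[[1]]  # AUC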
4.3 K-Nearest Neighbors (KNN)
# Train KNN model
set.seed(123)
knn_model <- train(Survived ~ Pclass + Sex + Age + FamilySize + Fare,
                   data = trainData,
                   method = "knn",
                   tuneGrid = data.frame(k = 1:15),
                   trControl = trainControl(method = "cv", number = 5))
# Best k value
knn_model$bestTune
## k
## 1 1
Cross-validation selected k = 1, an unusually flexible setting that can overfit; the held-out split below gives a fairer measure of performance.
# Predict on test data
knn_preds <- predict(knn_model, newdata = testData)
confusionMatrix(knn_preds, testData$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 81 20
## 1 28 48
##
## Accuracy : 0.7288
## 95% CI : (0.657, 0.7928)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 0.001044
##
## Kappa : 0.4393
##
## Mcnemar's Test P-Value : 0.312321
##
## Sensitivity : 0.7431
## Specificity : 0.7059
## Pos Pred Value : 0.8020
## Neg Pred Value : 0.6316
## Prevalence : 0.6158
## Detection Rate : 0.4576
## Detection Prevalence : 0.5706
## Balanced Accuracy : 0.7245
##
## 'Positive' Class : 0
##
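The same ROC analysis applies to KNN, using caret's class probabilities (a sketch; predict(..., type = "prob") returns one column per class level):
# AUC for the KNN model, based on the probability of class "1"
knn_probs <- predict(knn_model, newdata = testData, type = "prob")[, "1"]
knn_pred_obj <- prediction(knn_probs, testData$Survived)
performance(knn_pred_obj, "auc")@y.values[[1]]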
4.4 Comparison of Models
# Logistic Regression performance
logistic_performance <- confusionMatrix(factor(logistic_class), testData$Survived)
# KNN performance
knn_performance <- confusionMatrix(knn_preds, testData$Survived)
# Compare Accuracy
logistic_performance$overall["Accuracy"]
## Accuracy
## 0.7740113
knn_performance$overall["Accuracy"]
## Accuracy
## 0.7288136
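For a side-by-side view, the two accuracies can be collected into a small table using the objects above:
# Tabulate held-out accuracy for both models
data.frame(Model = c("Logistic Regression", "KNN"),
           Accuracy = c(logistic_performance$overall["Accuracy"],
                        knn_performance$overall["Accuracy"]))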
5 Predicting on the Test Set
# Apply the same preprocessing to the test set
# Impute with the medians computed earlier on the raw (unscaled)
# training data; the already-scaled train$Age median would be wrong here
test$Age[is.na(test$Age)] <- age_median
test$Fare[is.na(test$Fare)] <- fare_median
test[, c("Age", "Fare")] <- predict(preProc, test[, c("Age", "Fare")])
test$FamilySize <- test$SibSp + test$Parch + 1
test$Pclass <- factor(test$Pclass)
# Predict using the Logistic Regression model
final_preds <- predict(logistic_model, newdata = test, type = "response")
final_class <- ifelse(final_preds > 0.5, 1, 0)
# Create submission file
submission <- data.frame(PassengerId = test$PassengerId, Survived = final_class)
write.csv(submission, "submission.csv", row.names = FALSE)
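Before uploading, a quick format check is worthwhile (the Kaggle submission expects 418 rows with columns PassengerId and Survived):
# Verify the submission shape and first few rows
head(submission)
nrow(submission)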
6 Conclusion
Two classification models, Logistic Regression and K-Nearest Neighbors (KNN), were applied to predict Titanic passengers' survival from their demographic and travel details. On the held-out split, Logistic Regression achieved higher accuracy (77.4%) than KNN (72.9%) and was therefore selected for the final predictions.
The Logistic Regression model was used to generate survival predictions for the test.csv dataset. These predictions are saved in submission.csv, ready for submission to the Kaggle competition or for further analysis.