Load dataset
library(titanic)
# Load the `titanic_train` data set into the workspace and assign it to the
# shorter working name `df`.
data("titanic_train")
df <- titanic_train
Basic information about dataset
# Per-column summary of the raw data; the printed output reports 177 missing
# values for Age, which is why incomplete rows are dropped before modelling.
summary(df)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length :891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 N.unique :891
## Median :446.0 Median :0.0000 Median :3.000 N.blank : 0
## Mean :446.0 Mean :0.3838 Mean :2.309 Min.nchar: 12
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000 Max.nchar: 82
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length :891 Min. : 0.42 Min. :0.000 Min. :0.0000
## N.unique : 2 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## N.blank : 0 Median :28.00 Median :0.000 Median :0.0000
## Min.nchar: 4 Mean :29.70 Mean :0.523 Mean :0.3816
## Max.nchar: 6 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NAs :177
## Ticket Fare Cabin Embarked
## Length :891 Min. : 0.00 Length :891 Length :891
## N.unique :681 1st Qu.: 7.91 N.unique :148 N.unique : 4
## N.blank : 0 Median : 14.45 N.blank :687 N.blank : 2
## Min.nchar: 3 Mean : 32.20 Min.nchar: 0 Min.nchar: 0
## Max.nchar: 18 3rd Qu.: 31.00 Max.nchar: 15 Max.nchar: 1
## Max. :512.33
##
# Preview the first rows of the raw data.
head(df)
Filtering the data:
# Restrict the data to the outcome plus six predictors, discard every row
# that has a missing value, and encode the two categorical columns as factors.
df <- na.omit(titanic_train[, c(
  "Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"
)])
df[["Survived"]] <- as.factor(df[["Survived"]])
df[["Sex"]] <- as.factor(df[["Sex"]])
Train/test split:
# Reproducible 70/30 train/test split.
set.seed(123)
# seq_len() stays correct for an empty data frame (1:0 would yield c(1, 0)),
# and floor() makes the truncation of the fractional sample size explicit
# instead of leaving it to sample()'s implicit coercion.
train_index <- sample(seq_len(nrow(df)), size = floor(0.7 * nrow(df)))
train <- df[train_index, ]
test <- df[-train_index, ]
Logistic Regression Model:
# Fit a logistic regression of Survived on all remaining columns using the
# training rows, then score the held-out rows.
log_model <- glm(Survived ~ ., data = train, family = binomial)
# type = "response" returns predicted probabilities rather than log-odds.
log_prob <- predict(log_model, newdata = test, type = "response")
# Classify at the 0.5 threshold. The predictions are stored as a factor with
# the same levels as the outcome, so comparisons and confusion matrices do not
# depend on implicit numeric-to-factor coercion and both classes are always
# represented, even if only one of them is predicted.
log_pred <- factor(
  ifelse(log_prob > 0.5, "1", "0"),
  levels = levels(test$Survived)
)
# Share of test rows whose predicted class matches the observed class.
log_accuracy <- mean(log_pred == test$Survived)
log_accuracy
## [1] 0.8139535
Support Vector Machine Model:
library(e1071)
# Train a linear-kernel support vector classifier on the training rows and
# evaluate it on the held-out rows.
svm_model <- svm(
  Survived ~ .,
  data = train,
  kernel = "linear"
)
svm_pred <- predict(svm_model, newdata = test)
# Fraction of test rows where the predicted class equals the observed class.
svm_accuracy <- mean(test$Survived == svm_pred)
svm_accuracy
## [1] 0.8
Comparison
# Side-by-side test-set accuracy of the two classifiers.
# Reuses the accuracies stored when each model was evaluated instead of
# recomputing them, so this table cannot drift from the values reported there.
data.frame(
  Model = c("Logistic Regression", "SVM"),
  Accuracy = c(log_accuracy, svm_accuracy)
)
Confusion Matrix:
# Logistic regression: cross-tabulate predicted classes (rows) against the
# observed classes (columns) on the test set.
table(Predicted = log_pred, Actual = test$Survived)
## Actual
## Predicted 0 1
## 0 110 29
## 1 11 65
# SVM: cross-tabulate predicted classes (rows) against the observed classes
# (columns) on the test set.
table(Predicted = svm_pred, Actual = test$Survived)
## Actual
## Predicted 0 1
## 0 108 30
## 1 13 64
Detailed confusion-matrix statistics:
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:e1071':
##
## element
## Loading required package: lattice
# Detailed test-set metrics for the logistic regression model.
# The predictions are given the same levels as the reference explicitly;
# as.factor() alone would drop a class that is never predicted and leave the
# two factors with different level sets.
# Note: the printed output reports "0" as the positive class, so sensitivity
# and specificity are stated with respect to class "0".
confusionMatrix(
  data = factor(log_pred, levels = levels(test$Survived)),
  reference = test$Survived
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 110 29
## 1 11 65
##
## Accuracy : 0.814
## 95% CI : (0.7554, 0.8636)
## No Information Rate : 0.5628
## P-Value [Acc > NIR] : 6.17e-15
##
## Kappa : 0.6137
##
## Mcnemar's Test P-Value : 0.00719
##
## Sensitivity : 0.9091
## Specificity : 0.6915
## Pos Pred Value : 0.7914
## Neg Pred Value : 0.8553
## Prevalence : 0.5628
## Detection Rate : 0.5116
## Detection Prevalence : 0.6465
## Balanced Accuracy : 0.8003
##
## 'Positive' Class : 0
##
# Detailed test-set metrics for the SVM; its predictions are passed as-is
# together with the observed classes as the reference.
confusionMatrix(data = svm_pred, reference = test$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 108 30
## 1 13 64
##
## Accuracy : 0.8
## 95% CI : (0.7402, 0.8513)
## No Information Rate : 0.5628
## P-Value [Acc > NIR] : 2.138e-13
##
## Kappa : 0.5852
##
## Mcnemar's Test P-Value : 0.01469
##
## Sensitivity : 0.8926
## Specificity : 0.6809
## Pos Pred Value : 0.7826
## Neg Pred Value : 0.8312
## Prevalence : 0.5628
## Detection Rate : 0.5023
## Detection Prevalence : 0.6419
## Balanced Accuracy : 0.7867
##
## 'Positive' Class : 0
##
Separately from the models: survival broken down by individual variables.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Survived was already converted to a factor during the filtering step, so
# this leaves it unchanged when the script is run top to bottom.
df$Survived <- as.factor(df$Survived)
Survival rates by sex
# Share of survivors within each sex.
# Comparing against the "1" level avoids relying on the factor's internal
# integer codes (as.numeric() on a factor returns the codes, not the labels).
sex_survival <- df %>%
  group_by(Sex) %>%
  summarise(Survival_Rate = mean(Survived == "1"))
sex_survival
Survival rates by passenger class:
# Share of survivors within each passenger class.
# Comparing against the "1" level avoids relying on the factor's internal
# integer codes (as.numeric() on a factor returns the codes, not the labels).
class_survival <- df %>%
  group_by(Pclass) %>%
  summarise(Survival_Rate = mean(Survived == "1"))
class_survival
Survival rates by number of siblings/spouses:
# Share of survivors for each value of SibSp.
# Comparing against the "1" level avoids relying on the factor's internal
# integer codes (as.numeric() on a factor returns the codes, not the labels).
sibsp_survival <- df %>%
  group_by(SibSp) %>%
  summarise(Survival_Rate = mean(Survived == "1"))
sibsp_survival
Age vs. survival visualization:
# Overlaid age histograms in 5-unit bins, one per survival outcome.
# position = "identity" draws the groups on top of each other rather than
# stacking them; the partial transparency keeps both visible.
ggplot(data = df, mapping = aes(x = Age, fill = Survived)) +
  geom_histogram(position = "identity", binwidth = 5, alpha = 0.6) +
  labs(
    title = "Survival Distribution by Age",
    x = "Age",
    y = "Count"
  )