Load dataset

library(titanic)
# Load `titanic_train` from the attached titanic package and bind it to `df`.
data(titanic_train)
df <- titanic_train

Basic information about dataset

# Print a per-column summary of every variable in df.
summary(df)
##   PassengerId       Survived          Pclass             Name    
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length   :891  
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   N.unique :891  
##  Median :446.0   Median :0.0000   Median :3.000   N.blank  :  0  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309   Min.nchar: 12  
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000   Max.nchar: 82  
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                  
##                                                                  
##         Sex           Age            SibSp           Parch       
##  Length   :891   Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  N.unique :  2   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  N.blank  :  0   Median :28.00   Median :0.000   Median :0.0000  
##  Min.nchar:  4   Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##  Max.nchar:  6   3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                  Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                  NAs    :177                                     
##        Ticket         Fare              Cabin          Embarked  
##  Length   :891   Min.   :  0.00   Length   :891   Length   :891  
##  N.unique :681   1st Qu.:  7.91   N.unique :148   N.unique :  4  
##  N.blank  :  0   Median : 14.45   N.blank  :687   N.blank  :  2  
##  Min.nchar:  3   Mean   : 32.20   Min.nchar:  0   Min.nchar:  0  
##  Max.nchar: 18   3rd Qu.: 31.00   Max.nchar: 15   Max.nchar:  1  
##                  Max.   :512.33                                  
## 
# Show the first rows of df (head() returns six rows by default).
head(df)

Filtering the data:

# Keep only the outcome and the predictors used for modelling, and drop every
# row that has a missing value in any of those columns.
df <- na.omit(
  titanic_train[
    ,
    c("Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare")
  ]
)

# Treat the outcome and the passenger's sex as categorical variables.
df$Sex <- as.factor(df$Sex)
df$Survived <- as.factor(df$Survived)

Train/test split:

# Fix the RNG seed so the same rows are drawn on every run.
set.seed(123)

# Draw 70% of the row positions for training.
# seq_len() stays correct for a zero-row df (1:0 would yield c(1, 0)), and
# floor() makes the truncation of the fractional sample size explicit.
train_index <- sample(seq_len(nrow(df)), floor(0.7 * nrow(df)))
train <- df[train_index, ]

# The remaining rows form the test set. setdiff() is used instead of
# df[-train_index, ] because negative indexing with an empty index vector
# selects no rows at all rather than every row.
test <- df[setdiff(seq_len(nrow(df)), train_index), ]

Logistic Regression Model:

# Fit a logistic regression of Survived on every other column of `train`.
log_model <- glm(Survived ~ ., data = train, family = binomial)

# Predicted probabilities for the test rows, cut at 0.5 into 0/1 labels.
log_prob <- predict(log_model, newdata = test, type = "response")
log_pred <- ifelse(log_prob > 0.5, yes = 1, no = 0)

# Share of test rows whose predicted label matches the observed outcome.
log_accuracy <- mean(test$Survived == log_pred)
log_accuracy
## [1] 0.8139535

Support Vector Machine Model:

library(e1071)

# Fit a linear-kernel SVM of Survived on every other column of `train`,
# then predict the outcome for the test rows.
svm_model <- svm(Survived ~ ., data = train, kernel = "linear")
svm_pred <- predict(svm_model, newdata = test)

# Share of test rows whose predicted label matches the observed outcome.
svm_accuracy <- mean(test$Survived == svm_pred)
svm_accuracy
## [1] 0.8

Comparison

# Side-by-side test accuracy of the two models, reusing the accuracies
# already computed for each model.
data.frame(
  Model = c("Logistic Regression", "SVM"),
  Accuracy = c(log_accuracy, svm_accuracy)
)

Confusion Matrix:

# Logistic regression: counts of predicted vs. actual outcome on the test set
with(test, table(Predicted = log_pred, Actual = Survived))
##          Actual
## Predicted   0   1
##         0 110  29
##         1  11  65
# SVM: counts of predicted vs. actual outcome on the test set
with(test, table(Predicted = svm_pred, Actual = Survived))
##          Actual
## Predicted   0   1
##         0 108  30
##         1  13  64

More detailed evaluation statistics (caret confusion matrices):

library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:e1071':
## 
##     element
## Loading required package: lattice
# Detailed evaluation statistics for the logistic regression predictions.
# The numeric 0/1 predictions are converted to a factor that carries the
# reference's levels, so both classes are always present even if the model
# happens to predict only one of them (as.factor() would drop the other).
# No `positive` argument is passed, so caret picks the positive class itself.
confusionMatrix(
  data = factor(log_pred, levels = levels(test$Survived)),
  reference = test$Survived
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 110  29
##          1  11  65
##                                           
##                Accuracy : 0.814           
##                  95% CI : (0.7554, 0.8636)
##     No Information Rate : 0.5628          
##     P-Value [Acc > NIR] : 6.17e-15        
##                                           
##                   Kappa : 0.6137          
##                                           
##  Mcnemar's Test P-Value : 0.00719         
##                                           
##             Sensitivity : 0.9091          
##             Specificity : 0.6915          
##          Pos Pred Value : 0.7914          
##          Neg Pred Value : 0.8553          
##              Prevalence : 0.5628          
##          Detection Rate : 0.5116          
##    Detection Prevalence : 0.6465          
##       Balanced Accuracy : 0.8003          
##                                           
##        'Positive' Class : 0               
## 
# Detailed evaluation statistics for the SVM predictions against the
# observed test outcomes.
confusionMatrix(data = svm_pred, reference = test$Survived)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 108  30
##          1  13  64
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.7402, 0.8513)
##     No Information Rate : 0.5628          
##     P-Value [Acc > NIR] : 2.138e-13       
##                                           
##                   Kappa : 0.5852          
##                                           
##  Mcnemar's Test P-Value : 0.01469         
##                                           
##             Sensitivity : 0.8926          
##             Specificity : 0.6809          
##          Pos Pred Value : 0.7826          
##          Neg Pred Value : 0.8312          
##              Prevalence : 0.5628          
##          Detection Rate : 0.5023          
##    Detection Prevalence : 0.6419          
##       Balanced Accuracy : 0.7867          
##                                           
##        'Positive' Class : 0               
## 

Separately from the models: how survival relates to each variable on its own.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Ensure Survived is a factor before summarising and plotting. It was already
# converted in the filtering step, and as.factor() returns a factor unchanged,
# so this only has an effect if df$Survived is not a factor at this point.
df$Survived <- as.factor(df$Survived)

Survival rates by sex

# Share of survivors within each sex. Comparing against the level label "1"
# avoids relying on the factor's internal integer codes, which
# as.numeric(factor) - 1 only gets right when the levels are exactly "0", "1".
sex_survival <- df %>%
  group_by(Sex) %>%
  summarise(Survival_Rate = mean(Survived == "1"))

sex_survival

Survival rates by passenger class:

# Share of survivors within each passenger class. Comparing against the level
# label "1" avoids relying on the factor's internal integer codes, which
# as.numeric(factor) - 1 only gets right when the levels are exactly "0", "1".
class_survival <- df %>%
  group_by(Pclass) %>%
  summarise(Survival_Rate = mean(Survived == "1"))

class_survival

Survival rates by number of siblings/spouses aboard:

# Share of survivors for each value of SibSp. Comparing against the level
# label "1" avoids relying on the factor's internal integer codes, which
# as.numeric(factor) - 1 only gets right when the levels are exactly "0", "1".
sibsp_survival <- df %>%
  group_by(SibSp) %>%
  summarise(Survival_Rate = mean(Survived == "1"))

sibsp_survival

Age/survival visualization:

# Age histograms for each Survived level, drawn on top of each other
# (position = "identity") with transparency, using bins of width 5.
ggplot(data = df, mapping = aes(x = Age, fill = Survived)) +
  geom_histogram(position = "identity", binwidth = 5, alpha = 0.6) +
  labs(
    title = "Survival Distribution by Age",
    x = "Age",
    y = "Count"
  )