Exploring the Titanic Dataset: Unveiling Influential Variables for Survival

Memuat library yang diperlukan (jika belum terpasang, instal terlebih dahulu dengan install.packages("tidyverse"))

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Membaca dataset Titanic

# Read the Titanic CSV into the `titanic` data frame.
# The location defaults to the original author's path but can be overridden
# with the TITANIC_CSV environment variable, so the analysis can be re-run on
# another machine without editing this file.
titanic_csv <- Sys.getenv(
  "TITANIC_CSV",
  unset = "D:/SOBAT KARIER/PORTOFOLIO/titanic.csv"
)
if (!file.exists(titanic_csv)) {
  stop(
    "Titanic CSV not found at '", titanic_csv,
    "'; set the TITANIC_CSV environment variable to its location.",
    call. = FALSE
  )
}
# read.csv() already uses "," as its field separator, so `sep` is not needed.
titanic <- read.csv(titanic_csv)

Menampilkan informasi dasar tentang dataset

# Print a compact overview of the data frame: dimensions, column types and
# the first few values of every column.
str(object = titanic)
## 'data.frame':    418 obs. of  12 variables:
##  $ PassengerId: int  892 893 894 895 896 897 898 899 900 901 ...
##  $ Survived   : int  0 1 0 0 1 0 1 0 1 0 ...
##  $ Pclass     : int  3 3 2 3 3 3 3 2 3 3 ...
##  $ Name       : chr  "Kelly, Mr. James" "Wilkes, Mrs. James (Ellen Needs)" "Myles, Mr. Thomas Francis" "Wirz, Mr. Albert" ...
##  $ Sex        : chr  "male" "female" "male" "male" ...
##  $ Age        : num  34.5 47 62 27 22 14 30 26 18 21 ...
##  $ SibSp      : int  0 1 0 0 1 0 0 1 0 2 ...
##  $ Parch      : int  0 0 0 0 1 0 0 1 0 0 ...
##  $ Ticket     : chr  "330911" "363272" "240276" "315154" ...
##  $ Fare       : num  7.83 7 9.69 8.66 12.29 ...
##  $ Cabin      : chr  "" "" "" "" ...
##  $ Embarked   : chr  "Q" "S" "Q" "S" ...

Melihat 10 baris pertama dari dataset

# Preview the first ten passengers.
head(x = titanic, n = 10)
##    PassengerId Survived Pclass                                         Name
## 1          892        0      3                             Kelly, Mr. James
## 2          893        1      3             Wilkes, Mrs. James (Ellen Needs)
## 3          894        0      2                    Myles, Mr. Thomas Francis
## 4          895        0      3                             Wirz, Mr. Albert
## 5          896        1      3 Hirvonen, Mrs. Alexander (Helga E Lindqvist)
## 6          897        0      3                   Svensson, Mr. Johan Cervin
## 7          898        1      3                         Connolly, Miss. Kate
## 8          899        0      2                 Caldwell, Mr. Albert Francis
## 9          900        1      3    Abrahim, Mrs. Joseph (Sophie Halaut Easu)
## 10         901        0      3                      Davies, Mr. John Samuel
##       Sex  Age SibSp Parch    Ticket    Fare Cabin Embarked
## 1    male 34.5     0     0    330911  7.8292              Q
## 2  female 47.0     1     0    363272  7.0000              S
## 3    male 62.0     0     0    240276  9.6875              Q
## 4    male 27.0     0     0    315154  8.6625              S
## 5  female 22.0     1     1   3101298 12.2875              S
## 6    male 14.0     0     0      7538  9.2250              S
## 7  female 30.0     0     0    330972  7.6292              Q
## 8    male 26.0     1     1    248738 29.0000              S
## 9  female 18.0     0     0      2657  7.2292              C
## 10   male 21.0     2     0 A/4 48871 24.1500              S
# Per-column summary: quartiles and NA counts for numeric columns,
# length/class/mode for character columns.
summary(object = titanic)
##   PassengerId        Survived          Pclass          Name          
##  Min.   : 892.0   Min.   :0.0000   Min.   :1.000   Length:418        
##  1st Qu.: 996.2   1st Qu.:0.0000   1st Qu.:1.000   Class :character  
##  Median :1100.5   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :1100.5   Mean   :0.3636   Mean   :2.266                     
##  3rd Qu.:1204.8   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :1309.0   Max.   :1.0000   Max.   :3.000                     
##                                                                      
##      Sex                 Age            SibSp            Parch       
##  Length:418         Min.   : 0.17   Min.   :0.0000   Min.   :0.0000  
##  Class :character   1st Qu.:21.00   1st Qu.:0.0000   1st Qu.:0.0000  
##  Mode  :character   Median :27.00   Median :0.0000   Median :0.0000  
##                     Mean   :30.27   Mean   :0.4474   Mean   :0.3923  
##                     3rd Qu.:39.00   3rd Qu.:1.0000   3rd Qu.:0.0000  
##                     Max.   :76.00   Max.   :8.0000   Max.   :9.0000  
##                     NA's   :86                                       
##     Ticket               Fare            Cabin             Embarked        
##  Length:418         Min.   :  0.000   Length:418         Length:418        
##  Class :character   1st Qu.:  7.896   Class :character   Class :character  
##  Mode  :character   Median : 14.454   Mode  :character   Mode  :character  
##                     Mean   : 35.627                                        
##                     3rd Qu.: 31.500                                        
##                     Max.   :512.329                                        
##                     NA's   :1

Menghitung jumlah penumpang berdasarkan jenis kelamin

# Passenger counts per gender.
table(titanic[["Sex"]])
## 
## female   male 
##    152    266

Menghitung jumlah penumpang yang selamat atau tidak selamat

# Passenger counts per survival code (0 / 1).
table(titanic[["Survived"]])
## 
##   0   1 
## 266 152

Membuat histogram usia penumpang

# Histogram of passenger ages using 5-year bins. Rows with a missing Age are
# dropped by ggplot2, which reports them in a warning.
ggplot(data = titanic, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "skyblue") +
  ggtitle("Histogram of Passenger Age") +
  xlab("Age") +
  ylab("Count")
## Warning: Removed 86 rows containing non-finite values (`stat_bin()`).

Membuat diagram batang untuk menampilkan jumlah penumpang selamat atau tidak selamat berdasarkan jenis kelamin

# Stacked bar chart of passenger counts per gender, split by survival code.
# Colours and legend labels are keyed by the Survived level ("0" / "1") so they
# cannot be silently swapped if the level order changes, and the legend gets a
# readable title instead of the raw expression `factor(Survived)`.
ggplot(titanic, aes(x = Sex, fill = factor(Survived))) +
  geom_bar() +
  labs(
    title = "Number of Survived/Not Survived Passengers by Gender",
    x = "Gender",
    y = "Count",
    fill = "Status"
  ) +
  scale_fill_manual(
    values = c("0" = "red", "1" = "green"),
    labels = c("0" = "Not Survived", "1" = "Survived")
  )

Membuat boxplot untuk membandingkan usia penumpang yang selamat atau tidak selamat

# Box plots of Age for each survival code.
# The status axis is relabelled from the raw 0/1 codes (as the Fare box plot
# in this report already does), the fill colours/labels are keyed by level,
# and the legend title no longer shows the expression `factor(Survived)`.
# Rows with a missing Age are dropped by ggplot2 with a warning.
ggplot(titanic, aes(x = factor(Survived), y = Age, fill = factor(Survived))) +
  geom_boxplot() +
  labs(
    title = "Comparison of Age for Survived/Not Survived Passengers",
    x = "Status",
    y = "Age",
    fill = "Status"
  ) +
  scale_x_discrete(labels = c("0" = "Not Survived", "1" = "Survived")) +
  scale_fill_manual(
    values = c("0" = "red", "1" = "green"),
    labels = c("0" = "Not Survived", "1" = "Survived")
  )
## Warning: Removed 86 rows containing non-finite values (`stat_boxplot()`).

Membuat diagram batang untuk menampilkan jumlah penumpang selamat atau tidak selamat berdasarkan kelas penumpang (Pclass)

# Stacked bar chart of passenger counts per survival code, split by Pclass.
# The status axis is relabelled from the raw 0/1 codes, the legend title
# replaces the expression `factor(Pclass)`, and colours/labels are keyed by
# class level rather than by position.
ggplot(titanic, aes(x = factor(Survived), fill = factor(Pclass))) +
  geom_bar() +
  labs(
    title = "Number of Survived/Not Survived Passengers by Cabin Class",
    x = "Status",
    y = "Count",
    fill = "Class"
  ) +
  scale_x_discrete(labels = c("0" = "Not Survived", "1" = "Survived")) +
  scale_fill_manual(
    values = c("1" = "lightblue", "2" = "lightgreen", "3" = "lightyellow"),
    labels = c("1" = "1st Class", "2" = "2nd Class", "3" = "3rd Class")
  )

Membuat boxplot untuk membandingkan sebaran biaya tiket (Fare) antara penumpang selamat dan tidak selamat

# Horizontal box plots of Fare for each survival code; the 0/1 codes on the
# status axis are shown as "Not Survived" / "Survived". The single row with a
# missing Fare is dropped by ggplot2 with a warning.
ggplot(data = titanic, mapping = aes(x = Fare, y = factor(Survived))) +
  geom_boxplot() +
  scale_y_discrete(labels = c("Not Survived", "Survived")) +
  labs(
    title = "Comparison of Fare for Survived/Not Survived Passengers",
    x = "Fare",
    y = "Status"
  )
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).

Membuat diagram batang untuk menampilkan jumlah penumpang selamat atau tidak selamat berdasarkan titik embarkasi

# Stacked bar chart of passenger counts per survival code, split by
# embarkation point. The status axis is relabelled from the raw 0/1 codes and
# the colours/labels are keyed by the Embarked value (C, Q, S) instead of
# relying on positional order.
ggplot(titanic, aes(x = factor(Survived), fill = Embarked)) +
  geom_bar() +
  labs(
    title = "Number of Survived/Not Survived Passengers by Embarkation Point",
    x = "Status",
    y = "Count"
  ) +
  scale_x_discrete(labels = c("0" = "Not Survived", "1" = "Survived")) +
  scale_fill_manual(
    values = c("C" = "lightblue", "Q" = "lightgreen", "S" = "lightyellow"),
    labels = c("C" = "C", "Q" = "Q", "S" = "S")
  )

Membuat diagram batang untuk menampilkan jumlah penumpang selamat atau tidak selamat berdasarkan jumlah saudara kandung/pasangan (SibSp)

# Stacked bar chart of passenger counts per survival code, split by SibSp.
# The status axis is relabelled from the raw 0/1 codes and the legend gets a
# readable title instead of the expression `factor(SibSp)`.
ggplot(titanic, aes(x = factor(Survived), fill = factor(SibSp))) +
  geom_bar() +
  labs(
    title = "Number of Survived/Not Survived Passengers by Siblings/Spouses Count",
    x = "Status",
    y = "Count",
    fill = "Siblings/Spouses"
  ) +
  scale_x_discrete(labels = c("0" = "Not Survived", "1" = "Survived"))

Membuat diagram batang untuk menampilkan jumlah penumpang selamat atau tidak selamat berdasarkan jumlah orangtua/anak

# Stacked bar chart of passenger counts per survival code, split by Parch.
# The status axis is relabelled from the raw 0/1 codes and the legend gets a
# readable title instead of the expression `factor(Parch)`.
ggplot(titanic, aes(x = factor(Survived), fill = factor(Parch))) +
  geom_bar() +
  labs(
    title = "Number of Survived/Not Survived Passengers by Parents/Children Count",
    x = "Status",
    y = "Count",
    fill = "Parents/Children"
  ) +
  scale_x_discrete(labels = c("0" = "Not Survived", "1" = "Survived"))

Analisis peubah yang berpengaruh terhadap kelangsungan hidup (survival)

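Menggunakan tabulasi silang untuk melihat perbedaan antara penumpang selamat dan tidak selamat berdasarkan peubah kategorikal. Catatan: pada data ini Survived terpisah sempurna menurut Sex (semua perempuan selamat dan semua laki-laki tidak selamat), sehingga hubungan peubah lain dengan Survived perlu ditafsirkan dengan hati-hati.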

# Cross-tabulation: survival code (rows) by gender (columns).
table(titanic[["Survived"]], titanic[["Sex"]])
##    
##     female male
##   0      0  266
##   1    152    0
# Cross-tabulation: survival code (rows) by Pclass (columns).
table(titanic[["Survived"]], titanic[["Pclass"]])
##    
##       1   2   3
##   0  57  63 146
##   1  50  30  72
# Cross-tabulation: survival code (rows) by embarkation point (columns).
table(titanic[["Survived"]], titanic[["Embarked"]])
##    
##       C   Q   S
##   0  62  22 182
##   1  40  24  88

Menggunakan uji statistik untuk menentukan hubungan yang signifikan dengan kelangsungan hidup (survival): uji t untuk peubah numerik, serta uji chi-square atau uji eksak Fisher untuk peubah kategorikal

# Convenience copies of the three columns as plain vectors.
# NOTE(review): none of these is referenced again in this section of the
# report; confirm whether later code needs them.
survived <- titanic[["Survived"]]
age <- titanic[["Age"]]
fare <- titanic[["Fare"]]

Uji t-test untuk perbandingan usia antara penumpang selamat dan tidak selamat

# Welch two-sample t-test of mean Age between the two Survived groups.
# The extra arguments spell out t.test()'s defaults (two-sided alternative,
# unequal variances, 95% confidence interval).
t.test(
  titanic$Age ~ titanic$Survived,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  titanic$Age by titanic$Survived
## t = 0.00022286, df = 238.89, p-value = 0.9998
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -3.265749  3.266488
## sample estimates:
## mean in group 0 mean in group 1 
##        30.27273        30.27236

Uji t-test untuk perbandingan biaya tiket antara penumpang selamat dan tidak selamat

# Welch two-sample t-test of mean Fare between the two Survived groups.
# The extra arguments spell out t.test()'s defaults (two-sided alternative,
# unequal variances, 95% confidence interval).
t.test(
  titanic$Fare ~ titanic$Survived,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  titanic$Fare by titanic$Survived
## t = -3.4479, df = 206.76, p-value = 0.0006845
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -34.925196  -9.514447
## sample estimates:
## mean in group 0 mean in group 1 
##        27.52788        49.74770

Uji chi-square untuk perbandingan distribusi jenis kelamin antara penumpang selamat dan tidak selamat

# Pearson chi-squared test of independence between Survived and Sex.
# For this 2x2 table chisq.test() applies Yates' continuity correction.
chisq.test(x = titanic$Survived, y = titanic$Sex)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  titanic$Survived and titanic$Sex
## X-squared = 413.69, df = 1, p-value < 2.2e-16

Uji chi-square untuk perbandingan distribusi kelas kabin antara penumpang selamat dan tidak selamat

# Pearson chi-squared test of independence between Survived and Pclass.
chisq.test(x = titanic$Survived, y = titanic$Pclass)
## 
##  Pearson's Chi-squared test
## 
## data:  titanic$Survived and titanic$Pclass
## X-squared = 6.6939, df = 2, p-value = 0.03519

Uji chi-square untuk perbandingan distribusi titik embarkasi antara penumpang selamat dan tidak selamat

# Pearson chi-squared test of independence between Survived and Embarked.
chisq.test(x = titanic$Survived, y = titanic$Embarked)
## 
##  Pearson's Chi-squared test
## 
## data:  titanic$Survived and titanic$Embarked
## X-squared = 6.9867, df = 2, p-value = 0.0304

Uji eksak Fisher untuk perbandingan distribusi jumlah saudara kandung/pasangan (SibSp) antara penumpang selamat dan tidak selamat

# Fisher's exact test on the Survived x SibSp contingency table; the result
# is stored (not printed) so its p-value can be reported later.
sibsp_fisher <- fisher.test(x = titanic$Survived, y = titanic$SibSp)

Uji eksak Fisher untuk perbandingan distribusi jumlah orangtua/anak (Parch) antara penumpang selamat dan tidak selamat

# Fisher's exact test on the Survived x Parch contingency table; the result
# is stored (not printed) so its p-value can be reported later.
parch_fisher <- fisher.test(x = titanic$Survived, y = titanic$Parch)

Tabel Signifikansi Peubah

Melakukan uji t, uji chi-square, dan uji eksak Fisher

# Keep every test result in a named object so the p-values can be collected
# into the summary table.

# Numeric variables: Welch two-sample t-tests (unequal variances is the
# t.test() default, stated explicitly here).
age_ttest <- t.test(titanic$Age ~ titanic$Survived, var.equal = FALSE)
fare_ttest <- t.test(titanic$Fare ~ titanic$Survived, var.equal = FALSE)

# Categorical variables: Pearson chi-squared tests of independence.
sex_chi <- chisq.test(x = titanic$Survived, y = titanic$Sex)
pclass_chi <- chisq.test(x = titanic$Survived, y = titanic$Pclass)
embarked_chi <- chisq.test(x = titanic$Survived, y = titanic$Embarked)

# Count variables: Fisher's exact tests.
sibsp_fisher <- fisher.test(x = titanic$Survived, y = titanic$SibSp)
parch_fisher <- fisher.test(x = titanic$Survived, y = titanic$Parch)

Membuat data frame

# Summary table of the significance tests.
# `results` holds the t-test and chi-squared rows; `updated_results` appends
# the two Fisher's exact test rows. A p-value below `significance_level` is
# labelled "Significant", otherwise "Not Significant".
significance_level <- 0.05

# Label a vector of p-values in one vectorised call instead of repeating a
# scalar ifelse() and the threshold literal for every test.
label_significance <- function(p_values, alpha = significance_level) {
  ifelse(p_values < alpha, "Significant", "Not Significant")
}

main_p_values <- c(
  age_ttest$p.value,
  fare_ttest$p.value,
  sex_chi$p.value,
  pclass_chi$p.value,
  embarked_chi$p.value
)
results <- data.frame(
  Variable = c("Age", "Fare", "Sex", "Cabin Class", "Embarkation Point"),
  Test = c(
    "Independent t-test", "Independent t-test",
    "Chi-square test", "Chi-square test", "Chi-square test"
  ),
  P_Value = main_p_values,
  Significance = label_significance(main_p_values)
)

fisher_p_values <- c(sibsp_fisher$p.value, parch_fisher$p.value)
updated_results <- rbind(results, data.frame(
  Variable = c("Siblings/Spouses", "Parents/Children"),
  Test = rep("Fisher's exact test", 2),
  P_Value = fisher_p_values,
  Significance = label_significance(fisher_p_values)
))