Exploring the Titanic Dataset: Unveiling Influential Variables
for Survival
Memuat library yang diperlukan (paket tidyverse harus sudah terpasang)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
Membaca dataset Titanic
# Read the passenger records from a comma-separated file into a data frame.
# NOTE(review): the path is absolute and tied to one machine's drive layout.
titanic <- read.csv(
  file = "D:/SOBAT KARIER/PORTOFOLIO/titanic.csv",
  sep = ","
)
Melihat 10 baris pertama dari dataset
# Preview the first ten rows to check that the columns were parsed.
head(titanic, n = 10)
## PassengerId Survived Pclass Name
## 1 892 0 3 Kelly, Mr. James
## 2 893 1 3 Wilkes, Mrs. James (Ellen Needs)
## 3 894 0 2 Myles, Mr. Thomas Francis
## 4 895 0 3 Wirz, Mr. Albert
## 5 896 1 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist)
## 6 897 0 3 Svensson, Mr. Johan Cervin
## 7 898 1 3 Connolly, Miss. Kate
## 8 899 0 2 Caldwell, Mr. Albert Francis
## 9 900 1 3 Abrahim, Mrs. Joseph (Sophie Halaut Easu)
## 10 901 0 3 Davies, Mr. John Samuel
## Sex Age SibSp Parch Ticket Fare Cabin Embarked
## 1 male 34.5 0 0 330911 7.8292 Q
## 2 female 47.0 1 0 363272 7.0000 S
## 3 male 62.0 0 0 240276 9.6875 Q
## 4 male 27.0 0 0 315154 8.6625 S
## 5 female 22.0 1 1 3101298 12.2875 S
## 6 male 14.0 0 0 7538 9.2250 S
## 7 female 30.0 0 0 330972 7.6292 Q
## 8 male 26.0 1 1 248738 29.0000 S
## 9 female 18.0 0 0 2657 7.2292 C
## 10 male 21.0 2 0 A/4 48871 24.1500 S
# Per-column summary of the data frame.
summary(object = titanic)
## PassengerId Survived Pclass Name
## Min. : 892.0 Min. :0.0000 Min. :1.000 Length:418
## 1st Qu.: 996.2 1st Qu.:0.0000 1st Qu.:1.000 Class :character
## Median :1100.5 Median :0.0000 Median :3.000 Mode :character
## Mean :1100.5 Mean :0.3636 Mean :2.266
## 3rd Qu.:1204.8 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :1309.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:418 Min. : 0.17 Min. :0.0000 Min. :0.0000
## Class :character 1st Qu.:21.00 1st Qu.:0.0000 1st Qu.:0.0000
## Mode :character Median :27.00 Median :0.0000 Median :0.0000
## Mean :30.27 Mean :0.4474 Mean :0.3923
## 3rd Qu.:39.00 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :76.00 Max. :8.0000 Max. :9.0000
## NA's :86
## Ticket Fare Cabin Embarked
## Length:418 Min. : 0.000 Length:418 Length:418
## Class :character 1st Qu.: 7.896 Class :character Class :character
## Mode :character Median : 14.454 Mode :character Mode :character
## Mean : 35.627
## 3rd Qu.: 31.500
## Max. :512.329
## NA's :1
Menghitung jumlah penumpang berdasarkan jenis kelamin
# Passenger counts per value of the Sex column.
table(titanic[["Sex"]])
##
## female male
## 152 266
Menghitung jumlah penumpang yang selamat atau tidak selamat
# Passenger counts per value of the Survived column (0 / 1).
table(titanic[["Survived"]])
##
## 0 1
## 266 152
Membuat histogram usia penumpang
# Histogram of Age using 5-unit-wide bins.
ggplot(data = titanic, mapping = aes(x = Age)) +
  geom_histogram(colour = "black", fill = "skyblue", binwidth = 5) +
  ggtitle("Histogram of Passenger Age") +
  xlab("Age") +
  ylab("Count")
## Warning: Removed 86 rows containing non-finite values (`stat_bin()`).

Membuat diagram batang untuk menampilkan jumlah penumpang selamat
atau tidak selamat berdasarkan jenis kelamin
# Stacked bar chart: one bar per Sex, filled by Survived.
# Fill colours and legend labels are matched to the Survived levels by
# position (first level -> "red"/"Not Survived", second -> "green"/"Survived").
ggplot(data = titanic, mapping = aes(x = Sex, fill = factor(Survived))) +
  geom_bar() +
  scale_fill_manual(
    labels = c("Not Survived", "Survived"),
    values = c("red", "green")
  ) +
  ggtitle("Number of Survived/Not Survived Passengers by Gender") +
  xlab("Gender") +
  ylab("Count")

Membuat boxplot untuk membandingkan usia penumpang yang selamat atau
tidak selamat
# Boxplot of Age for each Survived group, filled by the same grouping.
# Fill colours and legend labels follow the order of the Survived levels.
ggplot(
  data = titanic,
  mapping = aes(x = factor(Survived), y = Age, fill = factor(Survived))
) +
  geom_boxplot() +
  scale_fill_manual(
    labels = c("Not Survived", "Survived"),
    values = c("red", "green")
  ) +
  ggtitle("Comparison of Age for Survived/Not Survived Passengers") +
  xlab("Status") +
  ylab("Age")
## Warning: Removed 86 rows containing non-finite values (`stat_boxplot()`).

Membuat diagram batang untuk menampilkan jumlah penumpang selamat
atau tidak selamat berdasarkan kelas penumpang (Pclass)
# Stacked bar chart: one bar per Survived value, filled by Pclass.
# The three colours and labels are assigned to the Pclass levels by position.
ggplot(
  data = titanic,
  mapping = aes(x = factor(Survived), fill = factor(Pclass))
) +
  geom_bar() +
  scale_fill_manual(
    labels = c("1st Class", "2nd Class", "3rd Class"),
    values = c("lightblue", "lightgreen", "lightyellow")
  ) +
  ggtitle("Number of Survived/Not Survived Passengers by Cabin Class") +
  xlab("Status") +
  ylab("Count")

Membuat boxplot untuk membandingkan biaya tiket (Fare) antara
penumpang yang selamat dan tidak selamat
# Horizontal boxplots of Fare, one per Survived group on the y axis.
# The y-axis tick labels replace the raw Survived levels, in level order.
ggplot(data = titanic, mapping = aes(x = Fare, y = factor(Survived))) +
  geom_boxplot() +
  scale_y_discrete(labels = c("Not Survived", "Survived")) +
  ggtitle("Comparison of Fare for Survived/Not Survived Passengers") +
  xlab("Fare") +
  ylab("Status")
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).

Membuat diagram batang untuk menampilkan jumlah penumpang selamat
atau tidak selamat berdasarkan titik embarkasi
# Stacked bar chart: one bar per Survived value, filled by Embarked.
# Colours and labels are assigned to the Embarked values by position.
ggplot(data = titanic, mapping = aes(x = factor(Survived), fill = Embarked)) +
  geom_bar() +
  scale_fill_manual(
    labels = c("C", "Q", "S"),
    values = c("lightblue", "lightgreen", "lightyellow")
  ) +
  ggtitle("Number of Survived/Not Survived Passengers by Embarkation Point") +
  xlab("Status") +
  ylab("Count")

Membuat diagram batang untuk menampilkan jumlah penumpang selamat
atau tidak selamat berdasarkan jumlah saudara kandung/pasangan (SibSp)
# Stacked bar chart: one bar per Survived value, filled by SibSp
# (default fill palette).
ggplot(
  data = titanic,
  mapping = aes(x = factor(Survived), fill = factor(SibSp))
) +
  geom_bar() +
  ggtitle(
    "Number of Survived/Not Survived Passengers by Siblings/Spouses Count"
  ) +
  xlab("Status") +
  ylab("Count")

Membuat diagram batang untuk menampilkan jumlah penumpang selamat
atau tidak selamat berdasarkan jumlah orangtua/anak
# Stacked bar chart: one bar per Survived value, filled by Parch
# (default fill palette).
ggplot(
  data = titanic,
  mapping = aes(x = factor(Survived), fill = factor(Parch))
) +
  geom_bar() +
  ggtitle(
    "Number of Survived/Not Survived Passengers by Parents/Children Count"
  ) +
  xlab("Status") +
  ylab("Count")

Analisis peubah yang berpengaruh terhadap kelangsungan hidup
(survival)
Menggunakan statistik deskriptif untuk melihat perbedaan antara
penumpang selamat dan tidak selamat berdasarkan peubah kategorikal
# Cross-tabulation with Survived in rows and Sex in columns.
# NOTE(review): the printed result has zero females in row 0 and zero males
# in row 1, i.e. Survived is fully determined by Sex in this file. Confirm
# that Survived holds observed outcomes and not a gender-based placeholder.
table(titanic[["Survived"]], titanic[["Sex"]])
##
## female male
## 0 0 266
## 1 152 0
# Cross-tabulation with Survived in rows and Pclass in columns.
table(titanic[["Survived"]], titanic[["Pclass"]])
##
## 1 2 3
## 0 57 63 146
## 1 50 30 72
# Cross-tabulation with Survived in rows and Embarked in columns.
table(titanic[["Survived"]], titanic[["Embarked"]])
##
## C Q S
## 0 62 22 182
## 1 40 24 88
Menggunakan uji statistik untuk menentukan apakah terdapat hubungan yang
signifikan dengan kelangsungan hidup (survival): t-test untuk peubah
numerikal dan uji chi-square untuk peubah kategorikal
# Pull the outcome and the two numeric columns out as standalone vectors.
# NOTE(review): none of these three names is referenced again in the visible
# part of the analysis; the tests below index `titanic` directly.
survived <- titanic[["Survived"]]
age <- titanic[["Age"]]
fare <- titanic[["Fare"]]
Uji t-test untuk perbandingan usia antara penumpang selamat dan
tidak selamat
# Two-sample t-test comparing mean Age between the two Survived groups
# (reported as a Welch test in the printed output).
# The result is printed only; it is not stored in a variable here.
t.test(titanic$Age ~ titanic$Survived)
##
## Welch Two Sample t-test
##
## data: titanic$Age by titanic$Survived
## t = 0.00022286, df = 238.89, p-value = 0.9998
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -3.265749 3.266488
## sample estimates:
## mean in group 0 mean in group 1
## 30.27273 30.27236
Uji t-test untuk perbandingan biaya tiket antara penumpang selamat
dan tidak selamat
# Two-sample t-test comparing mean Fare between the two Survived groups
# (reported as a Welch test in the printed output).
# The result is printed only; it is not stored in a variable here.
t.test(titanic$Fare ~ titanic$Survived)
##
## Welch Two Sample t-test
##
## data: titanic$Fare by titanic$Survived
## t = -3.4479, df = 206.76, p-value = 0.0006845
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -34.925196 -9.514447
## sample estimates:
## mean in group 0 mean in group 1
## 27.52788 49.74770
Uji chi-square untuk perbandingan distribusi jenis kelamin antara
penumpang selamat dan tidak selamat
# Pearson chi-squared test of independence between Survived and Sex.
# The result is printed only; it is not stored in a variable here.
chisq.test(x = titanic$Survived, y = titanic$Sex)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: titanic$Survived and titanic$Sex
## X-squared = 413.69, df = 1, p-value < 2.2e-16
Uji chi-square untuk perbandingan distribusi kelas kabin antara
penumpang selamat dan tidak selamat
# Pearson chi-squared test of independence between Survived and Pclass.
# The result is printed only; it is not stored in a variable here.
chisq.test(x = titanic$Survived, y = titanic$Pclass)
##
## Pearson's Chi-squared test
##
## data: titanic$Survived and titanic$Pclass
## X-squared = 6.6939, df = 2, p-value = 0.03519
Uji chi-square untuk perbandingan distribusi titik embarkasi antara
penumpang selamat dan tidak selamat
# Pearson chi-squared test of independence between Survived and Embarked.
# The result is printed only; it is not stored in a variable here.
chisq.test(x = titanic$Survived, y = titanic$Embarked)
##
## Pearson's Chi-squared test
##
## data: titanic$Survived and titanic$Embarked
## X-squared = 6.9867, df = 2, p-value = 0.0304
Uji Fisher's exact untuk perbandingan distribusi jumlah saudara
kandung/pasangan (SibSp) antara penumpang selamat dan tidak selamat
# Fisher's exact test of Survived against SibSp; the result is stored so its
# p-value can be added to the significance table.
sibsp_fisher <- fisher.test(x = titanic$Survived, y = titanic$SibSp)
Uji Fisher's exact untuk perbandingan distribusi jumlah orangtua/anak
(Parch) antara penumpang selamat dan tidak selamat
# Fisher's exact test of Survived against Parch; the result is stored so its
# p-value can be added to the significance table.
parch_fisher <- fisher.test(x = titanic$Survived, y = titanic$Parch)
Tabel Signifikansi Peubah
Membuat data frame
# Build the table of p-values for every variable tested against Survived and
# flag each one as significant or not at the 5% level.
#
# `results` holds the t-test and chi-square rows; `updated_results` appends
# the two Fisher's exact test rows (from `sibsp_fisher` and `parch_fisher`).

# The t-tests and chi-square tests above are only printed, never assigned, so
# the objects this table needs are computed here. Without these assignments
# the chunk stops with "object 'age_ttest' not found".
age_ttest <- t.test(titanic$Age ~ titanic$Survived)
fare_ttest <- t.test(titanic$Fare ~ titanic$Survived)
sex_chi <- chisq.test(titanic$Survived, titanic$Sex)
pclass_chi <- chisq.test(titanic$Survived, titanic$Pclass)
embarked_chi <- chisq.test(titanic$Survived, titanic$Embarked)

# Label a vector of p-values against a significance threshold in one
# vectorised call instead of one scalar ifelse() per variable.
significance_label <- function(p_value, alpha = 0.05) {
  ifelse(p_value < alpha, "Significant", "Not Significant")
}

results <- data.frame(
  Variable = c("Age", "Fare", "Sex", "Cabin Class", "Embarkation Point"),
  Test = c(
    "Independent t-test", "Independent t-test",
    "Chi-square test", "Chi-square test", "Chi-square test"
  ),
  P_Value = c(
    age_ttest$p.value, fare_ttest$p.value,
    sex_chi$p.value, pclass_chi$p.value, embarked_chi$p.value
  )
)
# Added after construction so the column order stays
# Variable, Test, P_Value, Significance.
results$Significance <- significance_label(results$P_Value)

fisher_results <- data.frame(
  Variable = c("Siblings/Spouses", "Parents/Children"),
  Test = rep("Fisher's exact test", 2),
  P_Value = c(sibsp_fisher$p.value, parch_fisher$p.value)
)
fisher_results$Significance <- significance_label(fisher_results$P_Value)

updated_results <- rbind(results, fisher_results)
Print Table
# Display the combined significance table (t-test, chi-square, Fisher rows).
print(x = updated_results)
## Variable Test P_Value Significance
## 1 Age Independent t-test 9.998224e-01 Not Significant
## 2 Fare Independent t-test 6.845156e-04 Significant
## 3 Sex Chi-square test 5.767311e-92 Significant
## 4 Cabin Class Chi-square test 3.519206e-02 Significant
## 5 Embarkation Point Chi-square test 3.039815e-02 Significant
## 6 Siblings/Spouses Fisher's exact test 1.208175e-02 Significant
## 7 Parents/Children Fisher's exact test 4.939563e-05 Significant