# Load the Titanic passenger data from a local CSV file
Dt <- read.csv(file = "C:/Users/LENOVO/Downloads/titanic.csv")
Memahami variable yang ada dalam dataset
PassengerId = Nomor Id Penumpang
Survived = Keterangan Selamat(0=Tidak, 1=Ya)
Pclass = Kelas Tiket (1=Kelas 1, 2=Kelas 2, dst)
Name = Nama Penumpang
Sex = Jenis kelamin
Age = Usia dalam tahun
SibSp = Jumlah saudara kandung / pasangan di kapal Titanic
Parch = Jumlah orang tua / anak di kapal Titanic
Ticket = Nomor Tiket
Fare = Harga Tiket
Cabin = Nama Kabin
Embarked = Pelabuhan Asal (C = Cherbourg, Q = Queenstown, S = Southampton)
library(readr)
library(tidyr)
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.6 v stringr 1.4.0
## v purrr 0.3.4 v forcats 0.5.1
## v dplyr 1.0.7
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Melihat summary menggunakan library skimr
library(skimr)
# Print a per-column overview of Dt (missing counts, quantiles, mini histograms)
skim(Dt)
| Name | Dt |
| Number of rows | 891 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Name | 0 | 1 | 12 | 82 | 0 | 891 | 0 |
| Sex | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| Ticket | 0 | 1 | 3 | 18 | 0 | 681 | 0 |
| Cabin | 0 | 1 | 0 | 15 | 687 | 148 | 0 |
| Embarked | 0 | 1 | 0 | 1 | 2 | 4 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| PassengerId | 0 | 1.0 | 446.00 | 257.35 | 1.00 | 223.50 | 446.00 | 668.5 | 891.00 | ▇▇▇▇▇ |
| Survived | 0 | 1.0 | 0.38 | 0.49 | 0.00 | 0.00 | 0.00 | 1.0 | 1.00 | ▇▁▁▁▅ |
| Pclass | 0 | 1.0 | 2.31 | 0.84 | 1.00 | 2.00 | 3.00 | 3.0 | 3.00 | ▃▁▃▁▇ |
| Age | 177 | 0.8 | 29.70 | 14.53 | 0.42 | 20.12 | 28.00 | 38.0 | 80.00 | ▂▇▅▂▁ |
| SibSp | 0 | 1.0 | 0.52 | 1.10 | 0.00 | 0.00 | 0.00 | 1.0 | 8.00 | ▇▁▁▁▁ |
| Parch | 0 | 1.0 | 0.38 | 0.81 | 0.00 | 0.00 | 0.00 | 0.0 | 6.00 | ▇▁▁▁▁ |
| Fare | 0 | 1.0 | 32.20 | 49.69 | 0.00 | 7.91 | 14.45 | 31.0 | 512.33 | ▇▁▁▁▁ |
dim(Dt) # dataset dimensions: number of rows, number of columns
## [1] 891 12
str(Dt) # show the structure: each column's type and its first values
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
library(visdat)
vis_dat(Dt) # visualize the dataset's column types and missing values
# Accessing the dataset
head(Dt) # show the first 6 rows (the default for head())
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male NA 0 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 S
## 6 330877 8.4583 Q
tail(Dt) # show the last 6 rows (the default for tail())
## PassengerId Survived Pclass Name Sex
## 886 886 0 3 Rice, Mrs. William (Margaret Norton) female
## 887 887 0 2 Montvila, Rev. Juozas male
## 888 888 1 1 Graham, Miss. Margaret Edith female
## 889 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female
## 890 890 1 1 Behr, Mr. Karl Howell male
## 891 891 0 3 Dooley, Mr. Patrick male
## Age SibSp Parch Ticket Fare Cabin Embarked
## 886 39 0 5 382652 29.125 Q
## 887 27 0 0 211536 13.000 S
## 888 19 0 0 112053 30.000 B42 S
## 889 NA 1 2 W./C. 6607 23.450 S
## 890 26 0 0 111369 30.000 C148 C
## 891 32 0 0 370376 7.750 Q
Melihat Summary Dataset
# Per-column summary: quartiles and mean for numeric columns, length/class for text
summary(Dt)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
# Count NA (TRUE) versus non-NA (FALSE) entries in every column
summary(is.na(Dt))
## PassengerId Survived Pclass Name
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:891 FALSE:891 FALSE:891 FALSE:891
##
## Sex Age SibSp Parch
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:891 FALSE:714 FALSE:891 FALSE:891
## TRUE :177
## Ticket Fare Cabin Embarked
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:891 FALSE:891 FALSE:891 FALSE:891
##
# Which variables have missing values: number of NA entries per column
as.data.frame(colSums(is.na(Dt)))
## colSums(is.na(Dt))
## PassengerId 0
## Survived 0
## Pclass 0
## Name 0
## Sex 0
## Age 177
## SibSp 0
## Parch 0
## Ticket 0
## Fare 0
## Cabin 0
## Embarked 0
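Dataset Titanic memiliki missing value (NA) sebanyak 177 data pada variabel Age. Selain itu, Cabin (687 baris) dan Embarked (2 baris) berisi string kosong yang tidak terhitung sebagai NA.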
library(devtools)
## Loading required package: usethis
# Plot where the missing (NA) values are located in Dt
vis_miss(Dt)
Boxplot Age
# Box plot of Age
boxplot(Dt$Age, main = "Boxplot of Age", ylab = "Age")
# Values that boxplot() flags as outliers in "Age"
boxplot(Dt$Age, plot = FALSE)$out
## [1] 66.0 71.0 70.5 71.0 80.0 70.0 70.0 74.0
BoxPlot Fare
# Box plot of Fare
boxplot(Dt$Fare, main = "Boxplot of Fare", ylab = "Fare")
# Values that boxplot() flags as outliers in "Fare"
boxplot(Dt$Fare, plot = FALSE)$out
## [1] 71.2833 263.0000 146.5208 82.1708 76.7292 80.0000 83.4750 73.5000
## [9] 263.0000 77.2875 247.5208 73.5000 77.2875 79.2000 66.6000 69.5500
## [17] 69.5500 146.5208 69.5500 113.2750 76.2917 90.0000 83.4750 90.0000
## [25] 79.2000 86.5000 512.3292 79.6500 153.4625 135.6333 77.9583 78.8500
## [33] 91.0792 151.5500 247.5208 151.5500 110.8833 108.9000 83.1583 262.3750
## [41] 164.8667 134.5000 69.5500 135.6333 153.4625 133.6500 66.6000 134.5000
## [49] 263.0000 75.2500 69.3000 135.6333 82.1708 211.5000 227.5250 73.5000
## [57] 120.0000 113.2750 90.0000 120.0000 263.0000 81.8583 89.1042 91.0792
## [65] 90.0000 78.2667 151.5500 86.5000 108.9000 93.5000 221.7792 106.4250
## [73] 71.0000 106.4250 110.8833 227.5250 79.6500 110.8833 79.6500 79.2000
## [81] 78.2667 153.4625 77.9583 69.3000 76.7292 73.5000 113.2750 133.6500
## [89] 73.5000 512.3292 76.7292 211.3375 110.8833 227.5250 151.5500 227.5250
## [97] 211.3375 512.3292 78.8500 262.3750 71.0000 86.5000 120.0000 77.9583
## [105] 211.3375 79.2000 69.5500 120.0000 93.5000 80.0000 83.1583 69.5500
## [113] 89.1042 164.8667 69.5500 83.1583
Boxplot SibSp
# Box plot of SibSp
boxplot(Dt$SibSp, main = "Boxplot of SibSp", ylab = "SibSp")
# Values that boxplot() flags as outliers in "SibSp"
boxplot(Dt$SibSp, plot = FALSE)$out
## [1] 3 4 3 3 4 5 3 4 5 3 3 4 8 4 4 3 8 4 8 3 4 4 4 4 8 3 3 5 3 5 3 4 4 3 3 5 4 3
## [39] 4 8 4 3 4 8 4 8
Boxplot Parch
# Box plot of Parch
boxplot(Dt$Parch, main = "Boxplot of Parch", ylab = "Parch")
# Values that boxplot() flags as outliers in "Parch"
boxplot(Dt$Parch, plot = FALSE)$out
## [1] 1 2 1 5 1 1 5 2 2 1 1 2 2 2 1 2 2 2 3 2 2 1 1 1 1 2 1 1 2 2 1 2 2 2 1 2 1
## [38] 1 2 1 4 1 1 1 1 2 2 1 2 1 1 1 2 1 1 2 2 2 1 1 2 2 1 2 1 1 1 1 1 1 1 2 1 2
## [75] 2 1 1 2 1 1 2 1 1 1 1 2 1 1 1 4 1 1 2 2 2 2 2 1 1 1 2 2 1 1 2 2 3 4 1 2 1
## [112] 1 2 1 2 1 2 1 1 2 2 1 1 1 1 2 2 2 2 2 2 1 1 2 1 4 1 1 2 1 2 1 1 2 5 2 1 1
## [149] 1 2 1 5 2 1 1 1 2 1 6 1 2 1 2 1 1 1 1 1 1 1 3 2 1 1 1 1 2 1 2 3 1 2 1 2 2
## [186] 1 1 2 1 2 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 3 2 1 1 1 1 5 2
Boxplot Survived
# Box plot of Survived (0 = did not survive, 1 = survived).
# The y-axis label now names the plotted column; it previously read "Parch".
boxplot(Dt$Survived,
  ylab = "Survived",
  main = "Boxplot of Survived"
)
# Values that boxplot() flags as outliers in "Survived"
boxplot(Dt$Survived, plot = FALSE)$out
## numeric(0)
Boxplot PassengerId
# Box plot of PassengerId
boxplot(Dt$PassengerId, main = "Boxplot of PassengerId", ylab = "PassengerId")
# Values that boxplot() flags as outliers in "PassengerId"
boxplot(Dt$PassengerId, plot = FALSE)$out
## numeric(0)
Boxplot Pclass
# Box plot of Pclass
boxplot(Dt$Pclass, main = "Boxplot of Pclass", ylab = "Pclass")
# Values that boxplot() flags as outliers in "Pclass"
boxplot(Dt$Pclass, plot = FALSE)$out
## numeric(0)
Kolom numerik yang memiliki outlier adalah Age, SibSp, Parch, dan Fare
Melakukan Filterisasi mengambil data yang bertipe numeric
# Keep only the numeric columns of Dt.
# select(where(...)) is the current replacement for the superseded select_if().
Dt2 <- Dt %>%
  select(where(is.numeric))
library(ggcorrplot)
library(corrplot)
## corrplot 0.92 loaded
# Pearson correlation matrix of the numeric columns; use = "complete.obs"
# drops every row that contains an NA before computing the correlations
M <- cor(x = Dt2, use = "complete.obs", method = "pearson")
print(M)
## PassengerId Survived Pclass Age SibSp
## PassengerId 1.00000000 0.02934016 -0.03534911 0.03684720 -0.08239772
## Survived 0.02934016 1.00000000 -0.35965268 -0.07722109 -0.01735836
## Pclass -0.03534911 -0.35965268 1.00000000 -0.36922602 0.06724737
## Age 0.03684720 -0.07722109 -0.36922602 1.00000000 -0.30824676
## SibSp -0.08239772 -0.01735836 0.06724737 -0.30824676 1.00000000
## Parch -0.01161741 0.09331701 0.02568307 -0.18911926 0.38381986
## Fare 0.00959178 0.26818862 -0.55418247 0.09606669 0.13832879
## Parch Fare
## PassengerId -0.01161741 0.00959178
## Survived 0.09331701 0.26818862
## Pclass 0.02568307 -0.55418247
## Age -0.18911926 0.09606669
## SibSp 0.38381986 0.13832879
## Parch 1.00000000 0.20511888
## Fare 0.20511888 1.00000000
# Draw the correlation matrix as a grid of coefficients
corrplot(
  M,
  method = "number", # show each correlation as a number
  order = "hclust", # order the variables by hierarchical clustering
  tl.col = "black", # colour of the text labels
  tl.srt = 90, # rotate the text labels by 90 degrees
  main = "Correlation"
)
library(EnvStats) #menggunakan library EnVStats
##
## Attaching package: 'EnvStats'
## The following objects are masked from 'package:stats':
##
## predict, predict.lm
## The following object is masked from 'package:base':
##
## print.default
# Empirical density plot of Age; the NA ages are removed with a warning
epdfPlot(Dt$Age, epdf.col = "Blue", main = "Age")
## Warning in is.not.finite.warning(x): There were 177 nonfinite values in x : 177
## NA's
## Warning in epdfPlot(Dt$Age, epdf.col = "Blue", main = "Age"): 177 observations
## with NA/NaN/Inf in 'x' removed.
# Empirical density plots for the remaining numeric columns, one plot per column
epdfPlot(Dt$Survived, epdf.col = "Blue", main = "Survived")
epdfPlot(Dt$Fare, epdf.col = "Blue", main = "Fare")
epdfPlot(Dt$Parch, epdf.col = "Blue", main = "Parch")
epdfPlot(Dt$PassengerId, epdf.col = "Blue", main = "PassengerId")
epdfPlot(Dt$SibSp, epdf.col = "Blue", main = "SibSp")
epdfPlot(Dt$Pclass, epdf.col = "Blue", main = "Pclass")
library(ggplot2)
# Density plot of the numeric variable Age
ggplot(Dt, aes(x = Age)) +
  geom_density(alpha = 0.7, fill = "aquamarine")
## Warning: Removed 177 rows containing non-finite values (stat_density).
# Density plot of the numeric variable Survived
ggplot(Dt, aes(x = Survived)) +
  geom_density(alpha = 0.7, fill = "aquamarine")
# Density plot of the numeric variable Fare
ggplot(Dt, aes(x = Fare)) +
  geom_density(alpha = 0.7, fill = "aquamarine")
# Density plot of the numeric variable Parch
ggplot(Dt, aes(x = Parch)) +
  geom_density(alpha = 0.7, fill = "aquamarine")
# Density plot of the numeric variable PassengerId
ggplot(Dt, aes(x = PassengerId)) +
  geom_density(alpha = 0.7, fill = "aquamarine")
# Density plot of the numeric variable SibSp
ggplot(Dt, aes(x = SibSp)) +
  geom_density(alpha = 0.7, fill = "aquamarine")
# Density plot of the numeric variable Pclass
ggplot(Dt, aes(x = Pclass)) +
  geom_density(alpha = 0.7, fill = "aquamarine")
# Fill the missing values in the Age column with the mean of the observed ages
Dt2$Age <- replace(Dt2$Age, is.na(Dt2$Age), mean(Dt2$Age, na.rm = TRUE))
# Check that no missing values remain
vis_miss(Dt2)
Menghilangkan Data Outliers di variabel Age
# Remove Age outliers with the 1.5 * IQR rule.
# Q1 and Q3 are the 25th and 75th percentiles of Age.
Q1_Age <- quantile(Dt2$Age, 0.25)
Q3_Age <- quantile(Dt2$Age, 0.75)
# Stored under its own name so the stats::IQR() function is not masked by a number
IQR_Age <- IQR(Dt2$Age)
# Keep the rows whose Age lies within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR];
# values exactly on a fence are kept, as in the boxplot outlier rule
Dt2_no_outliers <- subset(
  Dt2,
  Age >= (Q1_Age - 1.5 * IQR_Age) & Age <= (Q3_Age + 1.5 * IQR_Age)
)
Menampilkan boxplot Age
setelah dihilangkan Outliers
# Box plot of Age after the outlier rows have been removed
boxplot(Dt2_no_outliers$Age, main = "Boxplot of Age", ylab = "Age")
Menghilangkan Data outliers di variabel Parch
# Remove Parch outliers with the 1.5 * IQR rule.
# The quartiles and IQR are taken from Parch itself; they were previously
# computed from Age, which filtered Parch against Age's fences.
Q1_Parch <- quantile(Dt2$Parch, 0.25)
Q3_Parch <- quantile(Dt2$Parch, 0.75)
# Stored under its own name so the stats::IQR() function is not masked by a number
IQR_Parch <- IQR(Dt2$Parch)
# Keep the rows whose Parch lies within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
# The bounds are inclusive: Parch has Q1 = Q3 = 0, so both fences are 0 and
# strict comparisons would drop every row.
Dt2_no_outliers1 <- subset(
  Dt2,
  Parch >= (Q1_Parch - 1.5 * IQR_Parch) & Parch <= (Q3_Parch + 1.5 * IQR_Parch)
)
Menampilkan boxplot Parch
setelah outliers dihilangkan
# Box plot of Parch after the outlier rows have been removed
boxplot(Dt2_no_outliers1$Parch, main = "Boxplot of Parch", ylab = "Parch")
Menghilangkan Data outliers SibSp
# Remove SibSp outliers with the 1.5 * IQR rule.
# Q1 and Q3 are the 25th and 75th percentiles of SibSp.
Q1_SibSp <- quantile(Dt2$SibSp, 0.25)
Q3_SibSp <- quantile(Dt2$SibSp, 0.75)
# Stored under its own name so the stats::IQR() function is not masked by a number
IQR_SibSp <- IQR(Dt2$SibSp)
# Keep the rows whose SibSp lies within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR];
# values exactly on a fence are kept, as in the boxplot outlier rule
Dt2_no_outliers2 <- subset(
  Dt2,
  SibSp >= (Q1_SibSp - 1.5 * IQR_SibSp) & SibSp <= (Q3_SibSp + 1.5 * IQR_SibSp)
)
Menampilkan boxplot SibSP
setelah outliers dihilangkan
# Box plot of SibSp after the outlier rows have been removed
boxplot(Dt2_no_outliers2$SibSp, main = "Boxplot of SibSp", ylab = "SibSp")
Menghilangkan Data Outliers Fare
# Remove Fare outliers with the 1.5 * IQR rule.
# Q1 and Q3 are the 25th and 75th percentiles of Fare.
Q1_Fare <- quantile(Dt2$Fare, 0.25)
Q3_Fare <- quantile(Dt2$Fare, 0.75)
# The IQR is taken from Fare itself; it was previously computed from SibSp,
# which made the fences far too narrow for the Fare scale
IQR_Fare <- IQR(Dt2$Fare)
# Keep the rows whose Fare lies within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR];
# values exactly on a fence are kept, as in the boxplot outlier rule
Dt2_no_outliers3 <- subset(
  Dt2,
  Fare >= (Q1_Fare - 1.5 * IQR_Fare) & Fare <= (Q3_Fare + 1.5 * IQR_Fare)
)
Menampilkan boxplot Fare
setelah outliers dihilangkan
# Box plot of Fare after the outlier rows have been removed
boxplot(Dt2_no_outliers3$Fare, main = "Boxplot of Fare", ylab = "Fare")
Sekian yang saya bisa selesaikan pada Project EDA dengan menggunakan R. Mohon maaf bila ada kekurangan dan terima kasih