# Read the Titanic passenger CSV into the data frame `Dt`.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs on a machine where this exact file exists.
Dt <- read.csv("C:/Users/LENOVO/Downloads/titanic.csv")

Memahami dataset

Memahami variable yang ada dalam dataset
PassengerId = Nomor Id Penumpang
Survived = Keterangan Selamat(0=Tidak, 1=Ya)
Pclass = Kelas Tiket (1=Kelas 1, 2=Kelas 2, dst)
Name = Nama Penumpang
Sex = Jenis kelamin
Age = Usia dalam tahun
SibSp = Jumlah saudara kandung / pasangan di kapal Titanic
Parch = Jumlah orang tua / anak di kapal Titanic
Ticket = Nomor Tiket
Fare = Harga Tiket
Cabin = Nama Kabin
Embarked = Pelabuhan Asal (C = Cherbourg, Q = Queenstown, S = Southampton)
library(readr)
library(tidyr)
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.6     v stringr 1.4.0
## v purrr   0.3.4     v forcats 0.5.1
## v dplyr   1.0.7
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Melihat summary menggunakan library skimr

# Print a per-column summary of `Dt` (missing counts, completeness and
# basic statistics, as shown in the output below) using skimr.
library(skimr)
skim(Dt)
Data summary
Name Dt
Number of rows 891
Number of columns 12
_______________________
Column type frequency:
character 5
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Name 0 1 12 82 0 891 0
Sex 0 1 4 6 0 2 0
Ticket 0 1 3 18 0 681 0
Cabin 0 1 0 15 687 148 0
Embarked 0 1 0 1 2 4 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PassengerId 0 1.0 446.00 257.35 1.00 223.50 446.00 668.5 891.00 ▇▇▇▇▇
Survived 0 1.0 0.38 0.49 0.00 0.00 0.00 1.0 1.00 ▇▁▁▁▅
Pclass 0 1.0 2.31 0.84 1.00 2.00 3.00 3.0 3.00 ▃▁▃▁▇
Age 177 0.8 29.70 14.53 0.42 20.12 28.00 38.0 80.00 ▂▇▅▂▁
SibSp 0 1.0 0.52 1.10 0.00 0.00 0.00 1.0 8.00 ▇▁▁▁▁
Parch 0 1.0 0.38 0.81 0.00 0.00 0.00 0.0 6.00 ▇▁▁▁▁
Fare 0 1.0 32.20 49.69 0.00 7.91 14.45 31.0 512.33 ▇▁▁▁▁
dim(Dt) # dimensions of the dataset: number of rows, number of columns
## [1] 891  12
str(Dt) # structure of the data frame: type and first values of each column
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...
library(visdat)
vis_dat(Dt) # draw a visual overview of the data frame

# Mengakses DataSet

head(Dt) # show the first rows of the data (head() prints 6 rows by default)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  NA     0     0
##             Ticket    Fare Cabin Embarked
## 1        A/5 21171  7.2500              S
## 2         PC 17599 71.2833   C85        C
## 3 STON/O2. 3101282  7.9250              S
## 4           113803 53.1000  C123        S
## 5           373450  8.0500              S
## 6           330877  8.4583              Q
tail(Dt) # show the last rows of the data (tail() prints 6 rows by default)
##     PassengerId Survived Pclass                                     Name    Sex
## 886         886        0      3     Rice, Mrs. William (Margaret Norton) female
## 887         887        0      2                    Montvila, Rev. Juozas   male
## 888         888        1      1             Graham, Miss. Margaret Edith female
## 889         889        0      3 Johnston, Miss. Catherine Helen "Carrie" female
## 890         890        1      1                    Behr, Mr. Karl Howell   male
## 891         891        0      3                      Dooley, Mr. Patrick   male
##     Age SibSp Parch     Ticket   Fare Cabin Embarked
## 886  39     0     5     382652 29.125              Q
## 887  27     0     0     211536 13.000              S
## 888  19     0     0     112053 30.000   B42        S
## 889  NA     1     2 W./C. 6607 23.450              S
## 890  26     0     0     111369 30.000  C148        C
## 891  32     0     0     370376  7.750              Q

Melihat Summary Dataset

# Summary statistics for every column of Dt.
summary (Dt)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 
# For each column, count how many entries are NA (TRUE) and how many are not (FALSE).
summary(is.na(Dt))  
##  PassengerId      Survived         Pclass           Name        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:891       FALSE:891       FALSE:891       FALSE:891      
##                                                                 
##     Sex             Age            SibSp           Parch        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:891       FALSE:714       FALSE:891       FALSE:891      
##                  TRUE :177                                      
##    Ticket           Fare           Cabin          Embarked      
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:891       FALSE:891       FALSE:891       FALSE:891      
## 

Mengecek Missing Value

 # Which variables have missing values: number of NA entries per column.
 # NOTE(review): is.na() counts only NA. Empty strings (e.g. the "" values
 # visible in Cabin in the str() output) are not counted as missing here.
as.data.frame(colSums(is.na(Dt))) 
##             colSums(is.na(Dt))
## PassengerId                  0
## Survived                     0
## Pclass                       0
## Name                         0
## Sex                          0
## Age                        177
## SibSp                        0
## Parch                        0
## Ticket                       0
## Fare                         0
## Cabin                        0
## Embarked                     0

Dataset titanic memiliki Missing Value sebanyak 177 data pada variabel Age

Menampilkan Missing Value

library(devtools)
## Loading required package: usethis
vis_miss(Dt) # plot where the missing values are in Dt

Mengecek data Outliers

Boxplot Age

# Draw the boxplot of passenger age.
boxplot(
  Dt$Age,
  main = "Boxplot of Age",
  ylab = "Age"
)

# Print the Age values flagged as outliers by the boxplot rule (no plot drawn).
boxplot(Dt$Age, plot = FALSE)[["out"]]
## [1] 66.0 71.0 70.5 71.0 80.0 70.0 70.0 74.0

BoxPlot Fare

# Draw the boxplot of ticket fare.
boxplot(
  Dt$Fare,
  main = "Boxplot of Fare",
  ylab = "Fare"
)

# Print the Fare values flagged as outliers by the boxplot rule (no plot drawn).
boxplot(Dt$Fare, plot = FALSE)[["out"]]
##   [1]  71.2833 263.0000 146.5208  82.1708  76.7292  80.0000  83.4750  73.5000
##   [9] 263.0000  77.2875 247.5208  73.5000  77.2875  79.2000  66.6000  69.5500
##  [17]  69.5500 146.5208  69.5500 113.2750  76.2917  90.0000  83.4750  90.0000
##  [25]  79.2000  86.5000 512.3292  79.6500 153.4625 135.6333  77.9583  78.8500
##  [33]  91.0792 151.5500 247.5208 151.5500 110.8833 108.9000  83.1583 262.3750
##  [41] 164.8667 134.5000  69.5500 135.6333 153.4625 133.6500  66.6000 134.5000
##  [49] 263.0000  75.2500  69.3000 135.6333  82.1708 211.5000 227.5250  73.5000
##  [57] 120.0000 113.2750  90.0000 120.0000 263.0000  81.8583  89.1042  91.0792
##  [65]  90.0000  78.2667 151.5500  86.5000 108.9000  93.5000 221.7792 106.4250
##  [73]  71.0000 106.4250 110.8833 227.5250  79.6500 110.8833  79.6500  79.2000
##  [81]  78.2667 153.4625  77.9583  69.3000  76.7292  73.5000 113.2750 133.6500
##  [89]  73.5000 512.3292  76.7292 211.3375 110.8833 227.5250 151.5500 227.5250
##  [97] 211.3375 512.3292  78.8500 262.3750  71.0000  86.5000 120.0000  77.9583
## [105] 211.3375  79.2000  69.5500 120.0000  93.5000  80.0000  83.1583  69.5500
## [113]  89.1042 164.8667  69.5500  83.1583

Boxplot Sibsp

# Draw the boxplot of SibSp (siblings / spouses aboard).
boxplot(
  Dt$SibSp,
  main = "Boxplot of SibSp",
  ylab = "SibSp"
)

# Print the SibSp values flagged as outliers by the boxplot rule (no plot drawn).
boxplot(Dt$SibSp, plot = FALSE)[["out"]]
##  [1] 3 4 3 3 4 5 3 4 5 3 3 4 8 4 4 3 8 4 8 3 4 4 4 4 8 3 3 5 3 5 3 4 4 3 3 5 4 3
## [39] 4 8 4 3 4 8 4 8

Boxplot Parch

# Draw the boxplot of Parch (parents / children aboard).
boxplot(
  Dt$Parch,
  main = "Boxplot of Parch",
  ylab = "Parch"
)

# Print the Parch values flagged as outliers by the boxplot rule (no plot drawn).
boxplot(Dt$Parch, plot = FALSE)[["out"]]
##   [1] 1 2 1 5 1 1 5 2 2 1 1 2 2 2 1 2 2 2 3 2 2 1 1 1 1 2 1 1 2 2 1 2 2 2 1 2 1
##  [38] 1 2 1 4 1 1 1 1 2 2 1 2 1 1 1 2 1 1 2 2 2 1 1 2 2 1 2 1 1 1 1 1 1 1 2 1 2
##  [75] 2 1 1 2 1 1 2 1 1 1 1 2 1 1 1 4 1 1 2 2 2 2 2 1 1 1 2 2 1 1 2 2 3 4 1 2 1
## [112] 1 2 1 2 1 2 1 1 2 2 1 1 1 1 2 2 2 2 2 2 1 1 2 1 4 1 1 2 1 2 1 1 2 5 2 1 1
## [149] 1 2 1 5 2 1 1 1 2 1 6 1 2 1 2 1 1 1 1 1 1 1 3 2 1 1 1 1 2 1 2 3 1 2 1 2 2
## [186] 1 1 2 1 2 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 3 2 1 1 1 1 5 2

Boxplot Survived

# Draw the boxplot of the Survived indicator (0 = no, 1 = yes).
boxplot(Dt$Survived,
  ylab = "Survived", # the axis label names the variable that is plotted
  main = "Boxplot of Survived")

# Print the Survived values flagged as outliers by the boxplot rule (no plot drawn).
boxplot(Dt$Survived,plot=FALSE)$out
## numeric(0)

Boxplot Passengerid

# Draw the boxplot of PassengerId.
boxplot(
  Dt$PassengerId,
  main = "Boxplot of PassengerId",
  ylab = "PassengerId"
)

# Print the PassengerId values flagged as outliers by the boxplot rule (no plot drawn).
boxplot(Dt$PassengerId, plot = FALSE)[["out"]]
## numeric(0)

Boxplot Pclass

# Draw the boxplot of Pclass (ticket class).
boxplot(
  Dt$Pclass,
  main = "Boxplot of Pclass",
  ylab = "Pclass"
)

# Print the Pclass values flagged as outliers by the boxplot rule (no plot drawn).
boxplot(Dt$Pclass, plot = FALSE)[["out"]]
## numeric(0)

Kolom numerik yang memiliki outlier adalah Age, SibSp, Parch, dan Fare

Melakukan Filterisasi mengambil data yang bertipe numeric

# Keep only the numeric columns of Dt; the result is stored in Dt2.
# select(where(...)) replaces the superseded select_if() and selects the
# same columns.
Dt2 <- Dt %>% select(where(is.numeric))

Melihat Korelasi Data

library(ggcorrplot)
library(corrplot)
## corrplot 0.92 loaded
# Pearson correlation matrix of the numeric columns, computed only on the
# rows that have no missing value in any column.
M <- cor(Dt2, use = "complete.obs", method = "pearson")

print(M)
##             PassengerId    Survived      Pclass         Age       SibSp
## PassengerId  1.00000000  0.02934016 -0.03534911  0.03684720 -0.08239772
## Survived     0.02934016  1.00000000 -0.35965268 -0.07722109 -0.01735836
## Pclass      -0.03534911 -0.35965268  1.00000000 -0.36922602  0.06724737
## Age          0.03684720 -0.07722109 -0.36922602  1.00000000 -0.30824676
## SibSp       -0.08239772 -0.01735836  0.06724737 -0.30824676  1.00000000
## Parch       -0.01161741  0.09331701  0.02568307 -0.18911926  0.38381986
## Fare         0.00959178  0.26818862 -0.55418247  0.09606669  0.13832879
##                   Parch        Fare
## PassengerId -0.01161741  0.00959178
## Survived     0.09331701  0.26818862
## Pclass       0.02568307 -0.55418247
## Age         -0.18911926  0.09606669
## SibSp        0.38381986  0.13832879
## Parch        1.00000000  0.20511888
## Fare         0.20511888  1.00000000
# Draw the correlation matrix M with corrplot.
corrplot(M,
         order = "hclust",   # order the variables by hierarchical clustering
         tl.col="black",     # colour of the text labels
         tl.srt=90,         # rotation of the text labels: 90 degrees
         main = "Correlation",
         method = "number") # display each correlation as a number

Plot Distribusi Numerik

library(EnvStats) #menggunakan library EnVStats
## 
## Attaching package: 'EnvStats'
## The following objects are masked from 'package:stats':
## 
##     predict, predict.lm
## The following object is masked from 'package:base':
## 
##     print.default
# Empirical density plot of Age with epdfPlot (EnvStats is loaded above).
# The 177 NA ages are removed by epdfPlot, as the warnings below report.
epdfPlot(Dt$Age, epdf.col = "Blue", main = "Age")
## Warning in is.not.finite.warning(x): There were 177 nonfinite values in x : 177
## NA's
## Warning in epdfPlot(Dt$Age, epdf.col = "Blue", main = "Age"): 177 observations
## with NA/NaN/Inf in 'x' removed.

# The same plot for each of the other numeric columns, one call per column.
epdfPlot(Dt$Survived, epdf.col = "Blue", main = "Survived")

epdfPlot(Dt$Fare, epdf.col = "Blue", main = "Fare")

epdfPlot(Dt$Parch, epdf.col = "Blue", main = "Parch")

epdfPlot(Dt$PassengerId, epdf.col = "Blue", main = "PassengerId")

epdfPlot(Dt$SibSp, epdf.col = "Blue", main = "SibSp")

epdfPlot(Dt$Pclass, epdf.col = "Blue", main = "Pclass")

Visualisasi Density

library(ggplot2)
# Density curve of Age
ggplot(Dt, aes(x = Age)) +
  geom_density(alpha = 0.7, fill = "aquamarine")
## Warning: Removed 177 rows containing non-finite values (stat_density).

# Density curve of Survived
ggplot(Dt, aes(x = Survived)) +
  geom_density(alpha = 0.7, fill = "aquamarine")

# Density curve of Fare
ggplot(Dt, aes(x = Fare)) +
  geom_density(alpha = 0.7, fill = "aquamarine")

# Density curve of Parch
ggplot(Dt, aes(x = Parch)) +
  geom_density(alpha = 0.7, fill = "aquamarine")

# Density curve of PassengerId
ggplot(Dt, aes(x = PassengerId)) +
  geom_density(alpha = 0.7, fill = "aquamarine")

# Density curve of SibSp
ggplot(Dt, aes(x = SibSp)) +
  geom_density(alpha = 0.7, fill = "aquamarine")

# Density curve of Pclass
ggplot(Dt, aes(x = Pclass)) +
  geom_density(alpha = 0.7, fill = "aquamarine")

# mengisi Missing Value Mengisi missing value pada kolom Age dengan Nilai Mean

# Impute Age: every NA in Dt2$Age is replaced by the mean of the observed ages.
Dt2$Age <- replace(Dt2$Age, is.na(Dt2$Age), mean(Dt2$Age, na.rm = TRUE))
# Plot the missingness of Dt2 again to check that the gaps have been filled.
vis_miss(Dt2)

Menghilangkan Outliers

Menghilangkan Data Outliers di variabel Age

# Quartiles and interquartile range of Age, used to build the 1.5 * IQR fences.
Q1_Age <- quantile(Dt2$Age, probs = 0.25)
Q3_Age <- quantile(Dt2$Age, probs = 0.75)
IQR <- IQR(Dt2$Age)

# Keep only the rows whose Age lies strictly between the two fences.
Dt2_no_outliers <- subset(
  Dt2,
  Age > (Q1_Age - 1.5 * IQR) & Age < (Q3_Age + 1.5 * IQR)
)

Menampilkan boxplot Age

setelah dihilangkan Outliers

# Boxplot of Age after the outlier rows have been removed.
boxplot(
  Dt2_no_outliers$Age,
  main = "Boxplot of Age",
  ylab = "Age"
)

Menghilangkan Data outliers di variabel Parch

# Quartiles and interquartile range of Parch. The fences must be built from
# Parch itself; fences taken from another column (such as Age) do not
# describe the spread of Parch.
Q1_Parch <- quantile(Dt2$Parch, .25)
Q3_Parch <- quantile(Dt2$Parch, .75)
IQR <- IQR(Dt2$Parch)

# Remove the Parch outliers. The bounds are inclusive: a value lying exactly
# on a fence is not an outlier. This matters when the IQR is 0 (the summary
# above shows p25 = p75 = 0 for Parch), because both fences then equal the
# quartile and strict comparisons would drop every row.
Dt2_no_outliers1 <- subset(Dt2, Dt2$Parch >= (Q1_Parch - 1.5*IQR) & Dt2$Parch <= (Q3_Parch + 1.5*IQR))

Menampilkan boxplot Parch

setelah outliers dihilangkan

# Boxplot of Parch after the outlier rows have been removed.
boxplot(
  Dt2_no_outliers1$Parch,
  main = "Boxplot of Parch",
  ylab = "Parch"
)

Menghilangkan Data outliers SibSp

# Quartiles and interquartile range of SibSp, used to build the 1.5 * IQR fences.
Q1_SibSp <- quantile(Dt2$SibSp, probs = 0.25)
Q3_SibSp <- quantile(Dt2$SibSp, probs = 0.75)
IQR <- IQR(Dt2$SibSp)

# Keep only the rows whose SibSp lies strictly between the two fences.
Dt2_no_outliers2 <- subset(
  Dt2,
  SibSp > (Q1_SibSp - 1.5 * IQR) & SibSp < (Q3_SibSp + 1.5 * IQR)
)

Menampilkan boxplot SibSP

setelah outliers dihilangkan

# Boxplot of SibSp after the outlier rows have been removed.
boxplot(
  Dt2_no_outliers2$SibSp,
  main = "Boxplot of SibSp",
  ylab = "SibSp"
)

Menghilangkan Data Outliers Fare

# Quartiles and interquartile range of Fare. The IQR must come from Fare
# itself so that the fences match the spread of the column being filtered.
Q1_Fare <- quantile(Dt2$Fare, .25)
Q3_Fare <- quantile(Dt2$Fare, .75)
IQR <- IQR(Dt2$Fare)

# Remove the Fare outliers: keep the rows whose Fare lies between the fences.
Dt2_no_outliers3 <- subset(Dt2, Dt2$Fare> (Q1_Fare - 1.5*IQR) & Dt2$Fare< (Q3_Fare + 1.5*IQR))

Menampilkan boxplot Fare

setelah outliers dihilangkan

# Boxplot of Fare after the outlier rows have been removed.
boxplot(
  Dt2_no_outliers3$Fare,
  main = "Boxplot of Fare",
  ylab = "Fare"
)

Penutup

Sekian yang saya bisa selesaikan pada Project EDA dengan menggunakan R. Mohon maaf bila ada kekurangan dan terima kasih