library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.2
library(ggplot2)

Memuat dataset bawaan R (`trees`)

# Work on a copy of the built-in `trees` data set from the datasets package.
dataaa <- datasets::trees

Melihat sekilas dataset

# Auto-print the whole data frame (Girth, Height and Volume columns).
dataaa
##    Girth Height Volume
## 1    8.3     70   10.3
## 2    8.6     65   10.3
## 3    8.8     63   10.2
## 4   10.5     72   16.4
## 5   10.7     81   18.8
## 6   10.8     83   19.7
## 7   11.0     66   15.6
## 8   11.0     75   18.2
## 9   11.1     80   22.6
## 10  11.2     75   19.9
## 11  11.3     79   24.2
## 12  11.4     76   21.0
## 13  11.4     76   21.4
## 14  11.7     69   21.3
## 15  12.0     75   19.1
## 16  12.9     74   22.2
## 17  12.9     85   33.8
## 18  13.3     86   27.4
## 19  13.7     71   25.7
## 20  13.8     64   24.9
## 21  14.0     78   34.5
## 22  14.2     80   31.7
## 23  14.5     74   36.3
## 24  16.0     72   38.3
## 25  16.3     77   42.6
## 26  17.3     81   55.4
## 27  17.5     82   55.7
## 28  17.9     80   58.3
## 29  18.0     80   51.5
## 30  18.0     80   51.0
## 31  20.6     87   77.0
# Per-column summary statistics (min, quartiles, median, mean, max).
summary(dataaa)
##      Girth           Height       Volume     
##  Min.   : 8.30   Min.   :63   Min.   :10.20  
##  1st Qu.:11.05   1st Qu.:72   1st Qu.:19.40  
##  Median :12.90   Median :76   Median :24.20  
##  Mean   :13.25   Mean   :76   Mean   :30.17  
##  3rd Qu.:15.25   3rd Qu.:80   3rd Qu.:37.30  
##  Max.   :20.60   Max.   :87   Max.   :77.00
library(VIM)
## Warning: package 'VIM' was built under R version 4.5.2
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.5.2
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep

Pendeteksian Missing Values

# Cell-by-cell missing-value flags: TRUE wherever a value is NA.
is.na(dataaa)
##       Girth Height Volume
##  [1,] FALSE  FALSE  FALSE
##  [2,] FALSE  FALSE  FALSE
##  [3,] FALSE  FALSE  FALSE
##  [4,] FALSE  FALSE  FALSE
##  [5,] FALSE  FALSE  FALSE
##  [6,] FALSE  FALSE  FALSE
##  [7,] FALSE  FALSE  FALSE
##  [8,] FALSE  FALSE  FALSE
##  [9,] FALSE  FALSE  FALSE
## [10,] FALSE  FALSE  FALSE
## [11,] FALSE  FALSE  FALSE
## [12,] FALSE  FALSE  FALSE
## [13,] FALSE  FALSE  FALSE
## [14,] FALSE  FALSE  FALSE
## [15,] FALSE  FALSE  FALSE
## [16,] FALSE  FALSE  FALSE
## [17,] FALSE  FALSE  FALSE
## [18,] FALSE  FALSE  FALSE
## [19,] FALSE  FALSE  FALSE
## [20,] FALSE  FALSE  FALSE
## [21,] FALSE  FALSE  FALSE
## [22,] FALSE  FALSE  FALSE
## [23,] FALSE  FALSE  FALSE
## [24,] FALSE  FALSE  FALSE
## [25,] FALSE  FALSE  FALSE
## [26,] FALSE  FALSE  FALSE
## [27,] FALSE  FALSE  FALSE
## [28,] FALSE  FALSE  FALSE
## [29,] FALSE  FALSE  FALSE
## [30,] FALSE  FALSE  FALSE
## [31,] FALSE  FALSE  FALSE
# Count the missing values in each column: is.na() flags them and colSums()
# adds up the TRUE flags. Summing the raw data frame only yields column
# totals, which says nothing about missingness.
colSums(is.na(dataaa))
##  Girth Height Volume 
##      0      0      0
# Draw VIM's aggregated missing-value plot for the data frame; the extra
# arguments are passed through unchanged (numbers = TRUE, prop = FALSE).
aggr(
  x = dataaa,
  prop = FALSE,
  numbers = TRUE
)

# Median-impute missing values in every numeric column.
# The previous code addressed `Ozone` and `Solar.R`, which are not columns of
# this data frame (it holds Girth, Height and Volume), so it silently did
# nothing. Iterating over the columns keeps the step tied to the data that is
# actually loaded; columns without NA are left untouched.
dataaa[] <- lapply(dataaa, function(column) {
  if (is.numeric(column)) {
    column[is.na(column)] <- median(column, na.rm = TRUE)
  }
  column
})

# Auto-print the data frame again to inspect it after the imputation step.
dataaa
##    Girth Height Volume
## 1    8.3     70   10.3
## 2    8.6     65   10.3
## 3    8.8     63   10.2
## 4   10.5     72   16.4
## 5   10.7     81   18.8
## 6   10.8     83   19.7
## 7   11.0     66   15.6
## 8   11.0     75   18.2
## 9   11.1     80   22.6
## 10  11.2     75   19.9
## 11  11.3     79   24.2
## 12  11.4     76   21.0
## 13  11.4     76   21.4
## 14  11.7     69   21.3
## 15  12.0     75   19.1
## 16  12.9     74   22.2
## 17  12.9     85   33.8
## 18  13.3     86   27.4
## 19  13.7     71   25.7
## 20  13.8     64   24.9
## 21  14.0     78   34.5
## 22  14.2     80   31.7
## 23  14.5     74   36.3
## 24  16.0     72   38.3
## 25  16.3     77   42.6
## 26  17.3     81   55.4
## 27  17.5     82   55.7
## 28  17.9     80   58.3
## 29  18.0     80   51.5
## 30  18.0     80   51.0
## 31  20.6     87   77.0
# Number of NA values per column (sum of the TRUE flags from is.na()).
colSums(is.na(dataaa))
##  Girth Height Volume 
##      0      0      0
# Summary statistics after the missing-value step.
summary(dataaa)
##      Girth           Height       Volume     
##  Min.   : 8.30   Min.   :63   Min.   :10.20  
##  1st Qu.:11.05   1st Qu.:72   1st Qu.:19.40  
##  Median :12.90   Median :76   Median :24.20  
##  Mean   :13.25   Mean   :76   Mean   :30.17  
##  3rd Qu.:15.25   3rd Qu.:80   3rd Qu.:37.30  
##  Max.   :20.60   Max.   :87   Max.   :77.00

# Detect outliers in the Volume column with the IQR rule.
# The data frame has no `Ozone` column: `dataaa$Ozone` is NULL, so the
# quantiles were NA, the comparisons were empty and both counts were
# trivially 0. The rule is applied to Volume, the column plotted and
# winsorized further down.
Q1 <- quantile(dataaa$Volume, 0.25)
Q3 <- quantile(dataaa$Volume, 0.75)
# Stored as IQR_val so the stats::IQR() function name is not shadowed.
IQR_val <- Q3 - Q1

# Lower and upper fences.
lower_bound <- Q1 - 1.5 * IQR_val
upper_bound <- Q3 + 1.5 * IQR_val

# Flag values below the lower fence and values above the upper fence.
outliersa <- dataaa$Volume < lower_bound
outliers <- dataaa$Volume > upper_bound
sum(outliersa)
## [1] 0
sum(outliers)
## [1] 1
# Summary statistics, shown again after the outlier check.
summary(dataaa)
##      Girth           Height       Volume     
##  Min.   : 8.30   Min.   :63   Min.   :10.20  
##  1st Qu.:11.05   1st Qu.:72   1st Qu.:19.40  
##  Median :12.90   Median :76   Median :24.20  
##  Mean   :13.25   Mean   :76   Mean   :30.17  
##  3rd Qu.:15.25   3rd Qu.:80   3rd Qu.:37.30  
##  Max.   :20.60   Max.   :87   Max.   :77.00
# Smallest value anywhere in the data frame (all columns pooled together).
# NOTE(review): this pools Girth, Height and Volume, which are on different
# scales; a per-column range may be what was intended - confirm.
min(dataaa, na.rm = TRUE)
## [1] 8.3
# Largest value anywhere in the data frame (all columns pooled together).
max(dataaa, na.rm = TRUE)
## [1] 87

Visualisasi boxplot untuk melihat outlier

# Boxplot of the Volume column, used to eyeball potential outliers.
boxplot(
  dataaa$Volume,
  ylab = "Volume",
  col = "pink",
  main = "Boxplot Volume"
)

Menangani outlier dengan winsorizing (mengganti nilai ekstrem dengan batas)

# Winsorize Volume: values outside the 1.5 * IQR fences are replaced by the
# nearest fence and stored in a new column, Volume_winsor.
Q1 <- quantile(dataaa$Volume, probs = 0.25)
Q3 <- quantile(dataaa$Volume, probs = 0.75)
IQR_val <- IQR(dataaa$Volume)

lower_bound <- Q1 - IQR_val * 1.5
upper_bound <- Q3 + IQR_val * 1.5
# pmax() lifts values up to the lower fence; pmin() then caps them at the
# upper fence. Values already inside the fences pass through unchanged.
dataaa$Volume_winsor <- pmin(pmax(dataaa$Volume, lower_bound), upper_bound)

Bandingkan Boxplot Sebelum & Sesudah

# Compare the boxplots before and after winsorizing, side by side.
# par() returns the previous settings, so keep them and restore the layout
# once both panels are drawn; otherwise every later plot in the session
# would still be squeezed into the 1 x 2 grid.
old_par <- par(mfrow = c(1, 2))

boxplot(dataaa$Volume,
        main = "Sebelum Winsorizing",
        col = "pink")

boxplot(dataaa$Volume_winsor,
        main = "Sesudah Winsorizing",
        col = "lightblue")

par(old_par)

Cek jumlah duplikasi dalam dataset

Menghitung jumlah baris yang duplikat

# Number of rows that exactly repeat an earlier row.
sum(duplicated(dataaa))
## [1] 0

Hapus duplikasi jika ada

Menyaring hanya baris unik

# Keep only the first occurrence of each distinct row.
dataaa <- unique(dataaa)

Cek ulang data setelah preprocessing

Menampilkan ringkasan statistik setelah preprocessing

# Final summary statistics after preprocessing (now includes Volume_winsor).
summary(dataaa)
##      Girth           Height       Volume      Volume_winsor  
##  Min.   : 8.30   Min.   :63   Min.   :10.20   Min.   :10.20  
##  1st Qu.:11.05   1st Qu.:72   1st Qu.:19.40   1st Qu.:19.40  
##  Median :12.90   Median :76   Median :24.20   Median :24.20  
##  Mean   :13.25   Mean   :76   Mean   :30.17   Mean   :29.76  
##  3rd Qu.:15.25   3rd Qu.:80   3rd Qu.:37.30   3rd Qu.:37.30  
##  Max.   :20.60   Max.   :87   Max.   :77.00   Max.   :64.15