library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.2
library(ggplot2)
Memuat dataset bawaan R
# Bind the built-in `trees` data frame (columns Girth, Height, Volume) to the
# working name used by every later step.
dataaa <- datasets::trees
Melihat sekilas dataset
dataaa
## Girth Height Volume
## 1 8.3 70 10.3
## 2 8.6 65 10.3
## 3 8.8 63 10.2
## 4 10.5 72 16.4
## 5 10.7 81 18.8
## 6 10.8 83 19.7
## 7 11.0 66 15.6
## 8 11.0 75 18.2
## 9 11.1 80 22.6
## 10 11.2 75 19.9
## 11 11.3 79 24.2
## 12 11.4 76 21.0
## 13 11.4 76 21.4
## 14 11.7 69 21.3
## 15 12.0 75 19.1
## 16 12.9 74 22.2
## 17 12.9 85 33.8
## 18 13.3 86 27.4
## 19 13.7 71 25.7
## 20 13.8 64 24.9
## 21 14.0 78 34.5
## 22 14.2 80 31.7
## 23 14.5 74 36.3
## 24 16.0 72 38.3
## 25 16.3 77 42.6
## 26 17.3 81 55.4
## 27 17.5 82 55.7
## 28 17.9 80 58.3
## 29 18.0 80 51.5
## 30 18.0 80 51.0
## 31 20.6 87 77.0
summary(dataaa)
## Girth Height Volume
## Min. : 8.30 Min. :63 Min. :10.20
## 1st Qu.:11.05 1st Qu.:72 1st Qu.:19.40
## Median :12.90 Median :76 Median :24.20
## Mean :13.25 Mean :76 Mean :30.17
## 3rd Qu.:15.25 3rd Qu.:80 3rd Qu.:37.30
## Max. :20.60 Max. :87 Max. :77.00
library(VIM)
## Warning: package 'VIM' was built under R version 4.5.2
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.5.2
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
Pendeteksian Missing Values
is.na(dataaa)
## Girth Height Volume
## [1,] FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE
## [5,] FALSE FALSE FALSE
## [6,] FALSE FALSE FALSE
## [7,] FALSE FALSE FALSE
## [8,] FALSE FALSE FALSE
## [9,] FALSE FALSE FALSE
## [10,] FALSE FALSE FALSE
## [11,] FALSE FALSE FALSE
## [12,] FALSE FALSE FALSE
## [13,] FALSE FALSE FALSE
## [14,] FALSE FALSE FALSE
## [15,] FALSE FALSE FALSE
## [16,] FALSE FALSE FALSE
## [17,] FALSE FALSE FALSE
## [18,] FALSE FALSE FALSE
## [19,] FALSE FALSE FALSE
## [20,] FALSE FALSE FALSE
## [21,] FALSE FALSE FALSE
## [22,] FALSE FALSE FALSE
## [23,] FALSE FALSE FALSE
## [24,] FALSE FALSE FALSE
## [25,] FALSE FALSE FALSE
## [26,] FALSE FALSE FALSE
## [27,] FALSE FALSE FALSE
## [28,] FALSE FALSE FALSE
## [29,] FALSE FALSE FALSE
## [30,] FALSE FALSE FALSE
## [31,] FALSE FALSE FALSE
# Count missing values per column. Summing the raw data frame would add up the
# measurements themselves rather than the NA flags.
colSums(is.na(dataaa))
## Girth Height Volume
## 0 0 0
# Draw VIM's aggregated missing-value plot for the data frame.
# NOTE(review): `prop = FALSE` presumably shows counts instead of proportions
# and `numbers = TRUE` prints them as labels; confirm in ?VIM::aggr.
aggr(dataaa, numbers = TRUE, prop = FALSE)

# Median-impute every numeric column of the data frame. The previous code
# targeted Ozone and Solar.R, which are `airquality` columns that do not exist
# in `trees`, so no column was ever imputed.
dataaa[] <- lapply(dataaa, function(column) {
  if (is.numeric(column)) {
    # Only NA cells are overwritten; observed values stay untouched.
    column[is.na(column)] <- median(column, na.rm = TRUE)
  }
  column
})
dataaa
## Girth Height Volume
## 1 8.3 70 10.3
## 2 8.6 65 10.3
## 3 8.8 63 10.2
## 4 10.5 72 16.4
## 5 10.7 81 18.8
## 6 10.8 83 19.7
## 7 11.0 66 15.6
## 8 11.0 75 18.2
## 9 11.1 80 22.6
## 10 11.2 75 19.9
## 11 11.3 79 24.2
## 12 11.4 76 21.0
## 13 11.4 76 21.4
## 14 11.7 69 21.3
## 15 12.0 75 19.1
## 16 12.9 74 22.2
## 17 12.9 85 33.8
## 18 13.3 86 27.4
## 19 13.7 71 25.7
## 20 13.8 64 24.9
## 21 14.0 78 34.5
## 22 14.2 80 31.7
## 23 14.5 74 36.3
## 24 16.0 72 38.3
## 25 16.3 77 42.6
## 26 17.3 81 55.4
## 27 17.5 82 55.7
## 28 17.9 80 58.3
## 29 18.0 80 51.5
## 30 18.0 80 51.0
## 31 20.6 87 77.0
colSums(is.na(dataaa))
## Girth Height Volume
## 0 0 0
summary(dataaa)
## Girth Height Volume
## Min. : 8.30 Min. :63 Min. :10.20
## 1st Qu.:11.05 1st Qu.:72 1st Qu.:19.40
## Median :12.90 Median :76 Median :24.20
## Mean :13.25 Mean :76 Mean :30.17
## 3rd Qu.:15.25 3rd Qu.:80 3rd Qu.:37.30
## Max. :20.60 Max. :87 Max. :77.00
Menggunakan metode IQR untuk mendeteksi outlier pada kolom
Volume
# Quartiles of Volume. `trees` has no Ozone column, so the previous checks ran
# on a missing column and always reported zero outliers.
Q1 <- quantile(dataaa$Volume, 0.25)
Q3 <- quantile(dataaa$Volume, 0.75)
# Stored as IQR_val so the stats::IQR() function is not shadowed by a variable.
IQR_val <- IQR(dataaa$Volume)
Batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_val
upper_bound <- Q3 + 1.5 * IQR_val
Menandai outlier dengan kondisi apakah nilainya di luar batas bawah
atau atas
outliersa <- dataaa$Volume < lower_bound
outliers <- dataaa$Volume > upper_bound
sum(outliersa)
## [1] 0
sum(outliers)
## [1] 1
summary(dataaa)
## Girth Height Volume
## Min. : 8.30 Min. :63 Min. :10.20
## 1st Qu.:11.05 1st Qu.:72 1st Qu.:19.40
## Median :12.90 Median :76 Median :24.20
## Mean :13.25 Mean :76 Mean :30.17
## 3rd Qu.:15.25 3rd Qu.:80 3rd Qu.:37.30
## Max. :20.60 Max. :87 Max. :77.00
# Report the extremes per column. Pooling Girth, Height and Volume into a
# single min()/max() mixes unrelated measurements into one number.
vapply(dataaa, min, numeric(1), na.rm = TRUE)
## Girth Height Volume
## 8.3 63.0 10.2
vapply(dataaa, max, numeric(1), na.rm = TRUE)
## Girth Height Volume
## 20.6 87.0 77.0
Visualisasi boxplot untuk melihat outlier
# Boxplot of Volume, used here to eyeball potential outliers.
boxplot(
  x = dataaa$Volume,
  ylab = "Volume",
  col = "pink",
  main = "Boxplot Volume"
)

Menangani outlier dengan winsorizing (mengganti nilai ekstrem dengan
batas)
# Fences at 1.5 * IQR beyond the quartiles of Volume.
Q1 <- quantile(dataaa$Volume, probs = 0.25)
Q3 <- quantile(dataaa$Volume, probs = 0.75)
IQR_val <- IQR(dataaa$Volume)
lower_bound <- Q1 - 1.5 * IQR_val
upper_bound <- Q3 + 1.5 * IQR_val
# Winsorize: raise values below the lower fence to it, then cap values above
# the upper fence; everything in between is kept as-is.
dataaa$Volume_winsor <- pmin(pmax(dataaa$Volume, lower_bound), upper_bound)
Bandingkan Boxplot Sebelum & Sesudah
# Draw the raw and winsorized Volume side by side. par() returns the previous
# settings, which are restored afterwards so later plots are not left in the
# two-panel layout.
old_par <- par(mfrow = c(1, 2))
boxplot(dataaa$Volume,
        main = "Sebelum Winsorizing",
        col = "pink")
boxplot(dataaa$Volume_winsor,
        main = "Sesudah Winsorizing",
        col = "lightblue")
par(old_par)

Cek jumlah duplikasi dalam dataset
Menghitung jumlah baris yang duplikat
sum(duplicated(dataaa))
## [1] 0
Hapus duplikasi jika ada
Menyaring hanya baris unik
# Keep only the first occurrence of each row; unique() on a data frame drops
# exactly the rows that duplicated() flags.
dataaa <- unique(dataaa)
Cek ulang data setelah preprocessing
Menampilkan ringkasan statistik setelah preprocessing
summary(dataaa)
## Girth Height Volume Volume_winsor
## Min. : 8.30 Min. :63 Min. :10.20 Min. :10.20
## 1st Qu.:11.05 1st Qu.:72 1st Qu.:19.40 1st Qu.:19.40
## Median :12.90 Median :76 Median :24.20 Median :24.20
## Mean :13.25 Mean :76 Mean :30.17 Mean :29.76
## 3rd Qu.:15.25 3rd Qu.:80 3rd Qu.:37.30 3rd Qu.:37.30
## Max. :20.60 Max. :87 Max. :77.00 Max. :64.15