# Load the built-in mtcars data set and bind it to `df`, the object the
# rest of the script works on.
data(list = "mtcars")
df <- mtcars

# Preview the first six rows.
head(df, n = 6L)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

# Total number of missing cells: tally NAs per column, then add them up.
sum(colSums(is.na(df)))
## [1] 0
# Membuat missing value pada kolom mpg
# Blank out the mpg value in rows 3, 10 and 20 so that the imputation
# steps below have missing data to fill.
df[c(3, 10, 20), "mpg"] <- NA_real_

# Re-count the missing cells across the whole data frame.
sum(colSums(is.na(df)))
## [1] 3
# Mean Imputation (mengisi NA dengan rata-rata mpg)
# Mean imputation: compute the mean of the observed mpg values (NAs are
# skipped) and write it into every missing slot of a copy of `df`.
mean_mpg <- mean(df[["mpg"]], na.rm = TRUE)
df_mean <- df
df_mean$mpg <- replace(df_mean$mpg, is.na(df_mean$mpg), mean_mpg)
# Median Imputation (mengisi NA dengan median mpg)
# Median imputation: same procedure as for the mean, but the fill value is
# the median of the observed mpg values. The result lives in its own copy.
median_mpg <- median(df[["mpg"]], na.rm = TRUE)
df_median <- df
df_median$mpg <- replace(df_median$mpg, is.na(df_median$mpg), median_mpg)
# Bandingkan rata-rata mpg setelah imputasi mean dan median
# Average mpg after mean imputation ...
with(df_mean, mean(mpg))
## [1] 19.55172
# ... and after median imputation, for comparison.
with(df_median, mean(mpg))
## [1] 19.47188
# Deteksi outlier menggunakan aturan 1.5 x IQR
 # Quartiles and interquartile range of the mean-imputed mpg column.
 Q1 <- quantile(df_mean[["mpg"]], probs = 0.25)
 Q3 <- quantile(df_mean[["mpg"]], probs = 0.75)
 IQR_val <- IQR(df_mean[["mpg"]])

 # Fences of the 1.5 * IQR rule: values beyond them count as outliers.
 lower_bound <- Q1 - IQR_val * 1.5
 upper_bound <- Q3 + IQR_val * 1.5

 # Keep every mpg value that does not lie inside [lower_bound, upper_bound].
 outliers <- with(df_mean, mpg[!(mpg >= lower_bound & mpg <= upper_bound)])
 outliers
## [1] 32.4
# Box plot of the mean-imputed column.
boxplot(df_mean[["mpg"]], main = "Boxplot MPG (Mean Imputation)")

# Winsorize on a copy of the mean-imputed data: clamp mpg into
# [lower_bound, upper_bound], so values below the lower fence are raised to
# it and values above the upper fence are lowered to it.
df_winsor <- df_mean
df_winsor$mpg <- pmin(pmax(df_mean$mpg, lower_bound), upper_bound)

# Box plot of the clamped column.
boxplot(df_winsor[["mpg"]], main = "Boxplot MPG Setelah Winsorizing")

# Bandingkan rata-rata mpg sebelum dan sesudah winsorizing
# Average mpg before winsorizing (mean-imputed data) ...
mean(df_mean[["mpg"]])
## [1] 19.55172
# ... and after the values beyond the fences were clamped.
mean(df_winsor[["mpg"]])
## [1] 19.49001