R Markdown

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.2
library(ggplot2)
# Load the built-in `presidents` quarterly series into the workspace and
# display every observation (one row per year, one column per quarter).
data(presidents)
print(presidents)
##      Qtr1 Qtr2 Qtr3 Qtr4
## 1945   NA   87   82   75
## 1946   63   50   43   32
## 1947   35   60   54   55
## 1948   36   39   NA   NA
## 1949   69   57   57   51
## 1950   45   37   46   39
## 1951   36   24   32   23
## 1952   25   32   NA   32
## 1953   59   74   75   60
## 1954   71   61   71   57
## 1955   71   68   79   73
## 1956   76   71   67   75
## 1957   79   62   63   57
## 1958   60   49   48   52
## 1959   57   62   61   66
## 1960   71   62   61   57
## 1961   72   83   71   78
## 1962   79   71   62   74
## 1963   76   64   62   57
## 1964   80   73   69   69
## 1965   71   64   69   62
## 1966   63   46   56   44
## 1967   44   52   38   46
## 1968   36   49   35   44
## 1969   59   65   65   56
## 1970   66   53   61   52
## 1971   51   48   54   49
## 1972   49   61   NA   NA
## 1973   68   44   40   27
## 1974   28   25   24   24
# Preview the first six observations of the series.
head(presidents, n = 6L)
##      Qtr1 Qtr2 Qtr3 Qtr4
## 1945   NA   87   82   75
## 1946   63   50
# Distribution overview; the NA's entry counts the missing observations.
print(summary(presidents))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   23.00   46.00   59.00   56.31   69.00   87.00       6
# Element-wise missingness flags, in time order (TRUE marks a missing value).
na_flags <- is.na(presidents)
na_flags
##   [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Total number of missing observations.
sum(na_flags)
## [1] 6
library(VIM)
## Warning: package 'VIM' was built under R version 4.5.2
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.5.2
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Visualise the missing-value pattern with VIM::aggr(); `prop = FALSE` and
# `numbers = TRUE` are passed through unchanged from the original call.
aggr(x = presidents, prop = FALSE, numbers = TRUE)

# Impute every missing observation with the median of the observed values.
# One pass is sufficient: afterwards no NA remains, so repeating the
# assignment (as the script previously did) would be a no-op.
presidents[is.na(presidents)] <- median(presidents, na.rm = TRUE)

presidents
##      Qtr1 Qtr2 Qtr3 Qtr4
## 1945   59   87   82   75
## 1946   63   50   43   32
## 1947   35   60   54   55
## 1948   36   39   59   59
## 1949   69   57   57   51
## 1950   45   37   46   39
## 1951   36   24   32   23
## 1952   25   32   59   32
## 1953   59   74   75   60
## 1954   71   61   71   57
## 1955   71   68   79   73
## 1956   76   71   67   75
## 1957   79   62   63   57
## 1958   60   49   48   52
## 1959   57   62   61   66
## 1960   71   62   61   57
## 1961   72   83   71   78
## 1962   79   71   62   74
## 1963   76   64   62   57
## 1964   80   73   69   69
## 1965   71   64   69   62
## 1966   63   46   56   44
## 1967   44   52   38   46
## 1968   36   49   35   44
## 1969   59   65   65   56
## 1970   66   53   61   52
## 1971   51   48   54   49
## 1972   49   61   59   59
## 1973   68   44   40   27
## 1974   28   25   24   24
# Confirm that no missing observation is left in the series.
print(sum(is.na(presidents)))
## [1] 0
# Re-inspect the distribution of the series at this point in the script.
print(summary(presidents))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   23.00   46.00   59.00   56.44   68.25   87.00
# Detect outliers with Tukey's 1.5 * IQR fences and cap them at the fences.
# `names = FALSE` keeps the quartiles (and the bounds derived from them) as
# plain numbers instead of carrying the "25%"/"75%" labels; `na.rm = TRUE`
# lets the quartiles be computed even if missing values are still present.
Q1 <- quantile(presidents, 0.25, na.rm = TRUE, names = FALSE)
Q3 <- quantile(presidents, 0.75, na.rm = TRUE, names = FALSE)
# Stored as `iqr_value` so that the stats::IQR() function is not shadowed.
iqr_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * iqr_value
upper_bound <- Q3 + 1.5 * iqr_value
# Logical masks for each side; `& !is.na()` maps NA comparisons to FALSE so
# the counts and the assignments below never see an NA index.
outliers_low <- presidents < lower_bound & !is.na(presidents)
outliers_high <- presidents > upper_bound & !is.na(presidents)
sum(outliers_low)
## [1] 0
sum(outliers_high)
## [1] 0
boxplot(presidents, main = "Boxplot Presidents", col = "lightblue")

# Cap each side at its own fence. Indexing only the upper mask and then
# testing `< lower_bound` (the previous form) could never cap a low outlier.
presidents[outliers_low] <- lower_bound
presidents[outliers_high] <- upper_bound
# Check for duplicated observations. Each observation is identified by its
# quarter, so duplicates are looked for on the time index. Equal values in
# different quarters are legitimate data: dropping them would break the
# regular quarterly series. The previous code also subset `airquality` here,
# which replaced `presidents` with an unrelated data frame.
duplicated_quarters <- duplicated(time(presidents))
sum(duplicated_quarters)
## [1] 0
# No quarter occurs twice, so the series is kept unchanged.
stopifnot(!any(duplicated_quarters))
summary(presidents)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   23.00   46.00   59.00   56.44   68.25   87.00

KESIMPULAN

Setelah dilakukan preprocessing, dataset presidents telah bersih dari missing values, outlier telah ditangani, dan duplikasi data telah dihapus. Proses ini memastikan data siap digunakan untuk analisis lebih lanjut.

Langkah-langkah ini memberikan pemahaman mendalam tentang preprocessing data, yang merupakan tahap penting sebelum melanjutkan ke analisis atau pemodelan prediktif.