library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.2
library(ggplot2)
# Load the `presidents` data set into the workspace and display the full
# series (one row per year, one column per quarter).
data(list = "presidents")
print(presidents)
## Qtr1 Qtr2 Qtr3 Qtr4
## 1945 NA 87 82 75
## 1946 63 50 43 32
## 1947 35 60 54 55
## 1948 36 39 NA NA
## 1949 69 57 57 51
## 1950 45 37 46 39
## 1951 36 24 32 23
## 1952 25 32 NA 32
## 1953 59 74 75 60
## 1954 71 61 71 57
## 1955 71 68 79 73
## 1956 76 71 67 75
## 1957 79 62 63 57
## 1958 60 49 48 52
## 1959 57 62 61 66
## 1960 71 62 61 57
## 1961 72 83 71 78
## 1962 79 71 62 74
## 1963 76 64 62 57
## 1964 80 73 69 69
## 1965 71 64 69 62
## 1966 63 46 56 44
## 1967 44 52 38 46
## 1968 36 49 35 44
## 1969 59 65 65 56
## 1970 66 53 61 52
## 1971 51 48 54 49
## 1972 49 61 NA NA
## 1973 68 44 40 27
## 1974 28 25 24 24
# Preview the first observations of the series.
print(head(presidents))
## Qtr1 Qtr2 Qtr3 Qtr4
## 1945 NA 87 82 75
## 1946 63 50
# Descriptive statistics; the NA's column reports how many values are missing.
print(summary(presidents))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 23.00 46.00 59.00 56.31 69.00 87.00 6
# Element-wise missing-value mask (TRUE marks a missing observation).
print(is.na(presidents))
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Total number of missing observations in the series.
presidents |>
  is.na() |>
  sum()
## [1] 6
library(VIM)
## Warning: package 'VIM' was built under R version 4.5.2
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.5.2
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Plot the missing-value pattern with VIM::aggr(). `prop = FALSE` shows
# absolute counts instead of proportions; `numbers = TRUE` prints them on
# the plot.
aggr(
  presidents,
  prop = FALSE,
  numbers = TRUE
)
# Impute every missing observation with the median of the observed values.
# A single pass is sufficient: once the NAs are filled, a second identical
# assignment would select nothing and change nothing.
presidents[is.na(presidents)] <- median(presidents, na.rm = TRUE)
# Display the imputed series.
presidents
## Qtr1 Qtr2 Qtr3 Qtr4
## 1945 59 87 82 75
## 1946 63 50 43 32
## 1947 35 60 54 55
## 1948 36 39 59 59
## 1949 69 57 57 51
## 1950 45 37 46 39
## 1951 36 24 32 23
## 1952 25 32 59 32
## 1953 59 74 75 60
## 1954 71 61 71 57
## 1955 71 68 79 73
## 1956 76 71 67 75
## 1957 79 62 63 57
## 1958 60 49 48 52
## 1959 57 62 61 66
## 1960 71 62 61 57
## 1961 72 83 71 78
## 1962 79 71 62 74
## 1963 76 64 62 57
## 1964 80 73 69 69
## 1965 71 64 69 62
## 1966 63 46 56 44
## 1967 44 52 38 46
## 1968 36 49 35 44
## 1969 59 65 65 56
## 1970 66 53 61 52
## 1971 51 48 54 49
## 1972 49 61 59 59
## 1973 68 44 40 27
## 1974 28 25 24 24
# Confirm that no missing values remain in the series.
print(sum(is.na(presidents)))
## [1] 0
# Re-check the descriptive statistics of the series.
print(summary(presidents))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.00 46.00 59.00 56.44 68.25 87.00
# Detect outliers with the 1.5 * IQR rule and cap them at the fences.
# `na.rm = TRUE` keeps the quantiles defined even if missing values remain.
Q1 <- quantile(presidents, 0.25, na.rm = TRUE)
Q3 <- quantile(presidents, 0.75, na.rm = TRUE)
# Stored as `iqr_value` so the variable does not share a name with
# stats::IQR().
iqr_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * iqr_value
upper_bound <- Q3 + 1.5 * iqr_value
# Logical masks: `outliersa` flags values below the lower fence,
# `outliers` flags values above the upper fence.
outliersa <- presidents < lower_bound
outliers <- presidents > upper_bound
sum(outliersa, na.rm = TRUE)
## [1] 0
sum(outliers, na.rm = TRUE)
## [1] 0
boxplot(
  presidents,
  main = "Boxplot Presidents Approval Ratings",
  col = "lightblue"
)
# Cap each tail at its own fence. Indexing with the upper-tail mask alone
# would leave low outliers untouched. which() drops NA mask entries, and an
# empty index makes the assignment a no-op.
presidents[which(outliersa)] <- lower_bound
presidents[which(outliers)] <- upper_bound
# Check for duplicated records and drop them.
# Equal approval values observed in different quarters are legitimate data,
# so duplicates are identified by the observation key (year, quarter) and not
# by the value alone. The series is expanded into a data frame for this check;
# the `presidents` time series itself is left untouched.
presidents_df <- data.frame(
  year = as.integer(floor(as.numeric(time(presidents)))),
  quarter = as.integer(cycle(presidents)),
  approval = as.numeric(presidents)
)
is_duplicate <- duplicated(presidents_df[c("year", "quarter")])
sum(is_duplicate)
## [1] 0
presidents_df <- presidents_df[!is_duplicate, ]
# Summary of the de-duplicated data (output not echoed here; re-run to
# regenerate it).
summary(presidents_df)
KESIMPULAN: Setelah dilakukan preprocessing, dataset presidents telah bersih dari missing values, outlier telah ditangani, dan duplikasi data telah dihapus. Proses ini memastikan data siap digunakan untuk analisis lebih lanjut.
Langkah-langkah ini memberikan pemahaman mendalam tentang preprocessing data, yang merupakan tahap penting sebelum melanjutkan ke analisis lebih lanjut atau pemodelan prediktif.