library(dplyr) # Untuk manipulasi data
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr) # Untuk menangani missing values
## Warning: package 'tidyr' was built under R version 4.4.3
library(ggplot2) # Untuk visualisasi data
library(VIM) # Untuk visualisasi missing values
## Warning: package 'VIM' was built under R version 4.4.3
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Load the built-in `volcano` dataset into the workspace (used further below).
data(list = "volcano")
# Preview the first six rows of `airquality`.
airquality |> head()
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
# Per-column summary statistics (includes NA counts where present).
airquality |> summary()
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# Number of missing values in each column.
airquality |>
  is.na() |>
  colSums()
## Ozone Solar.R Wind Temp Month Day
## 37 7 0 0 0 0
# Missing-value aggregation plot for `airquality` (VIM): `prop = FALSE` shows
# counts rather than proportions; `numbers = TRUE` annotates the combinations.
aggr(airquality, prop = FALSE, numbers = TRUE)

# Convert the `volcano` matrix to a long-format data frame: one row per matrix
# cell, with the cell value stored in column `Freq`.
volcano_df <- volcano |>
  as.table() |>
  as.data.frame()
# Total number of missing values in the data frame.
volcano_df |>
  is.na() |>
  sum()
## [1] 0
# Missing-value plot for `volcano_df` (VIM), with variables sorted by their
# number of missing values and relabelled for display.
aggr(
  volcano_df,
  sortVars = TRUE,
  numbers = TRUE,
  labels = c("X", "Y", "Elevation"),
  col = c("navyblue", "red"),
  ylab = c("Missing Data Pattern", "Percentage of Missing Values"),
  cex.axis = 0.7,
  gap = 3
)

##
## Variables sorted by number of missings:
## Variable Count
## X 0
## Y 0
## Elevation 0
# First and third quartiles of the values in column `Freq`.
Q1 <- quantile(volcano_df[["Freq"]], probs = 0.25)
Q3 <- quantile(volcano_df[["Freq"]], probs = 0.75)
# Interquartile range. NOTE: this numeric variable shares its name with
# stats::IQR(); calls of the form IQR(x) still resolve to the function.
IQR <- Q3 - Q1
# Outlier fences: 1.5 interquartile ranges beyond each quartile.
upper_bound <- Q3 + 1.5 * IQR
lower_bound <- Q1 - 1.5 * IQR
# Logical mask flagging the values that fall outside the fences.
outliers <- with(volcano_df, Freq < lower_bound | Freq > upper_bound)
# Number of flagged values.
sum(outliers)
## [1] 0
# Boxplot of the same values.
boxplot(
  volcano_df[["Freq"]],
  ylab = "Elevation",
  col = "lightblue",
  main = "Boxplot Elevasi Volcano"
)
