library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.2
library(ggplot2)
library(VIM)
## Warning: package 'VIM' was built under R version 4.5.2
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.5.2
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
dl <- read_excel("transaction.xlsx")
dl
## # A tibble: 10,025 × 10
## dt_id store_id kasir customer_id product_name product_category
## <dttm> <dbl> <chr> <dbl> <chr> <chr>
## 1 2025-03-10 00:00:00 42 Kasir… 665 Cleo Air Mineral
## 2 2025-05-10 00:00:00 44 Kasir… 985 Khong Guan Wafer
## 3 2025-02-10 00:00:00 36 Kasir… 457 Aqua Air Mineral
## 4 2025-04-10 00:00:00 14 Kasir… 129 Sarimi Mie Instan
## 5 2025-01-10 00:00:00 42 Kasir… 13 Khong Guan Wafer
## 6 2025-03-10 00:00:00 26 Kasir… 841 Cleo Air Mineral
## 7 2025-01-10 00:00:00 32 Kasir… 496 Oasis Air Mineral
## 8 2025-04-10 00:00:00 29 Kasir… 700 Sarimi Mie Instan
## 9 2025-03-10 00:00:00 19 Kasir… 977 Toblerone Coklat
## 10 2025-02-10 00:00:00 22 Kasir… 585 Minyak Gore… Sembako
## # ℹ 10,015 more rows
## # ℹ 4 more variables: quantity <dbl>, price_per_item <dbl>, total_price <dbl>,
## # paymet_method <chr>
#visualisasi missing value
aggr(dl, numbers = TRUE, prop = FALSE)

#missing value
colSums(is.na(dl))
## dt_id store_id kasir customer_id
## 0 0 0 0
## product_name product_category quantity price_per_item
## 0 0 0 27
## total_price paymet_method
## 18 0
#hapus missing value
dl$total_price[is.na(dl$total_price)] <- median(dl$total_price, na.rm = TRUE)
colSums(is.na(dl))
## dt_id store_id kasir customer_id
## 0 0 0 0
## product_name product_category quantity price_per_item
## 0 0 0 27
## total_price paymet_method
## 0 0
#outlier
Q1 <- quantile(dl$total_price, 0.25)
Q3 <- quantile(dl$total_price, 0.75)
IQR <- Q3 - Q1
#batas atas dan bawah
upper_bound <- Q3 + 1.5 * IQR
lower_bound <- Q1 - 1.5 * IQR
#menandai outlier dengan kondisi apakah nilainya di luar batas atas atau bawah
outliers1 <- dl$total_price > upper_bound
outliers <- dl$total_price < lower_bound
sum(outliers)
## [1] 0
sum(outliers1)
## [1] 20
#visualisasi boxplot untuk melihat outliet pada kolom total_price
boxplot(dl$total_price, main = "Boxplot Total Price", col = "magenta")

#jumlah duplikasi di dataset
sum(duplicated(dl)) #menghitung jumlah baris tang duplikat
## [1] 25
#hapus duplikasi
dl_clean <- dl[!duplicated(dl),]
#cek data
summary(dl_clean)
## dt_id store_id kasir
## Min. :2025-01-10 00:00:00 Min. : 1.00 Length:10000
## 1st Qu.:2025-02-10 00:00:00 1st Qu.:16.00 Class :character
## Median :2025-03-10 00:00:00 Median :31.00 Mode :character
## Mean :2025-03-10 17:55:32 Mean :30.58
## 3rd Qu.:2025-04-10 00:00:00 3rd Qu.:45.00
## Max. :2025-05-10 00:00:00 Max. :60.00
##
## customer_id product_name product_category quantity
## Min. : 1.0 Length:10000 Length:10000 Min. :1.000
## 1st Qu.: 252.0 Class :character Class :character 1st Qu.:2.000
## Median : 499.0 Mode :character Mode :character Median :3.000
## Mean : 499.5 Mean :3.026
## 3rd Qu.: 747.0 3rd Qu.:4.000
## Max. :1000.0 Max. :5.000
##
## price_per_item total_price paymet_method
## Min. : 3001 Min. : 3002 Length:10000
## 1st Qu.: 9646 1st Qu.: 21716 Class :character
## Median :16511 Median : 40556 Mode :character
## Mean :16503 Mean : 49920
## 3rd Qu.:23297 3rd Qu.: 72502
## Max. :29996 Max. :149970
## NA's :24