library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.2
library(ggplot2)
library(VIM)
## Warning: package 'VIM' was built under R version 4.5.2
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.5.2
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
dl <- read_excel("transaction.xlsx")
dl
## # A tibble: 10,025 × 10
##    dt_id               store_id kasir  customer_id product_name product_category
##    <dttm>                 <dbl> <chr>        <dbl> <chr>        <chr>           
##  1 2025-03-10 00:00:00       42 Kasir…         665 Cleo         Air Mineral     
##  2 2025-05-10 00:00:00       44 Kasir…         985 Khong Guan   Wafer           
##  3 2025-02-10 00:00:00       36 Kasir…         457 Aqua         Air Mineral     
##  4 2025-04-10 00:00:00       14 Kasir…         129 Sarimi       Mie Instan      
##  5 2025-01-10 00:00:00       42 Kasir…          13 Khong Guan   Wafer           
##  6 2025-03-10 00:00:00       26 Kasir…         841 Cleo         Air Mineral     
##  7 2025-01-10 00:00:00       32 Kasir…         496 Oasis        Air Mineral     
##  8 2025-04-10 00:00:00       29 Kasir…         700 Sarimi       Mie Instan      
##  9 2025-03-10 00:00:00       19 Kasir…         977 Toblerone    Coklat          
## 10 2025-02-10 00:00:00       22 Kasir…         585 Minyak Gore… Sembako         
## # ℹ 10,015 more rows
## # ℹ 4 more variables: quantity <dbl>, price_per_item <dbl>, total_price <dbl>,
## #   paymet_method <chr>
#visualisasi missing value
aggr(dl, numbers = TRUE, prop = FALSE)

#missing value
colSums(is.na(dl))
##            dt_id         store_id            kasir      customer_id 
##                0                0                0                0 
##     product_name product_category         quantity   price_per_item 
##                0                0                0               27 
##      total_price    paymet_method 
##               18                0
#hapus missing value
dl$total_price[is.na(dl$total_price)] <- median(dl$total_price, na.rm = TRUE)

colSums(is.na(dl))
##            dt_id         store_id            kasir      customer_id 
##                0                0                0                0 
##     product_name product_category         quantity   price_per_item 
##                0                0                0               27 
##      total_price    paymet_method 
##                0                0
#outlier
Q1 <- quantile(dl$total_price, 0.25)
Q3 <- quantile(dl$total_price, 0.75)
IQR <- Q3 - Q1

#batas atas dan bawah
upper_bound <- Q3 + 1.5 * IQR
lower_bound <- Q1 - 1.5 * IQR
#menandai outlier dengan kondisi apakah nilainya di luar batas atas atau bawah
outliers1 <- dl$total_price > upper_bound
outliers <- dl$total_price < lower_bound
sum(outliers)
## [1] 0
sum(outliers1)
## [1] 20
#visualisasi boxplot untuk melihat outliet pada kolom total_price
boxplot(dl$total_price, main = "Boxplot Total Price", col = "magenta")

#jumlah duplikasi di dataset
sum(duplicated(dl)) #menghitung jumlah baris tang duplikat
## [1] 25
#hapus duplikasi
dl_clean <- dl[!duplicated(dl),]
#cek data
summary(dl_clean)
##      dt_id                        store_id        kasir          
##  Min.   :2025-01-10 00:00:00   Min.   : 1.00   Length:10000      
##  1st Qu.:2025-02-10 00:00:00   1st Qu.:16.00   Class :character  
##  Median :2025-03-10 00:00:00   Median :31.00   Mode  :character  
##  Mean   :2025-03-10 17:55:32   Mean   :30.58                     
##  3rd Qu.:2025-04-10 00:00:00   3rd Qu.:45.00                     
##  Max.   :2025-05-10 00:00:00   Max.   :60.00                     
##                                                                  
##   customer_id     product_name       product_category      quantity    
##  Min.   :   1.0   Length:10000       Length:10000       Min.   :1.000  
##  1st Qu.: 252.0   Class :character   Class :character   1st Qu.:2.000  
##  Median : 499.0   Mode  :character   Mode  :character   Median :3.000  
##  Mean   : 499.5                                         Mean   :3.026  
##  3rd Qu.: 747.0                                         3rd Qu.:4.000  
##  Max.   :1000.0                                         Max.   :5.000  
##                                                                        
##  price_per_item   total_price     paymet_method     
##  Min.   : 3001   Min.   :  3002   Length:10000      
##  1st Qu.: 9646   1st Qu.: 21716   Class :character  
##  Median :16511   Median : 40556   Mode  :character  
##  Mean   :16503   Mean   : 49920                     
##  3rd Qu.:23297   3rd Qu.: 72502                     
##  Max.   :29996   Max.   :149970                     
##  NA's   :24