library(readxl)
df <- read_xlsx("transaction.xlsx") 
df
## # A tibble: 10,015 × 10
##    dt_id               store_id kasir  customer_id product_name product_category
##    <dttm>                 <dbl> <chr>        <dbl> <chr>        <chr>           
##  1 2025-03-10 00:00:00       42 Kasir…          NA Cleo         Air Mineral     
##  2 2025-05-10 00:00:00       44 Kasir…         985 Khong Guan   Wafer           
##  3 2025-02-10 00:00:00       36 Kasir…         457 Aqua         Air Mineral     
##  4 2025-04-10 00:00:00       14 Kasir…         129 Sarimi       Mie Instan      
##  5 2025-01-10 00:00:00       42 Kasir…          13 Khong Guan   Wafer           
##  6 2025-03-10 00:00:00       26 Kasir…         841 Cleo         Air Mineral     
##  7 2025-01-10 00:00:00       32 Kasir…         496 Oasis        Air Mineral     
##  8 2025-04-10 00:00:00       29 Kasir…         700 Sarimi       Mie Instan      
##  9 2025-03-10 00:00:00       19 Kasir…          NA Toblerone    Coklat          
## 10 2025-02-10 00:00:00       22 Kasir…         585 Minyak Gore… Sembako         
## # ℹ 10,005 more rows
## # ℹ 4 more variables: quantity <dbl>, price_per_item <dbl>, total_price <dbl>,
## #   paymet_method <chr>
# Mendeteksi Missing Value
colSums(is.na(df))
##            dt_id         store_id            kasir      customer_id 
##                0                1                0                9 
##     product_name product_category         quantity   price_per_item 
##                0                0                4               11 
##      total_price    paymet_method 
##               10                0
# Visualisasi missing values
library(VIM) 
## Warning: package 'VIM' was built under R version 4.5.2
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.5.2
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(VIM) # Library untuk visualisasi missing values
aggr(df, numbers = TRUE, prop = FALSE)

# Mengganti missing values dengan median di setiap kolom
df$store_id[is.na(df$store_id)] <- median(df$store_id, na.rm = TRUE)
df$customer_id[is.na(df$customer_id)] <- median(df$customer_id, na.rm = TRUE)
df$quantity[is.na(df$quantity)] <- median(df$quantity, na.rm = TRUE)
df$price_per_item[is.na(df$price_per_item)] <- median(df$price_per_item, na.rm = TRUE)
df$total_price[is.na(df$total_price)] <- median(df$total_price, na.rm = TRUE)
df
## # A tibble: 10,015 × 10
##    dt_id               store_id kasir  customer_id product_name product_category
##    <dttm>                 <dbl> <chr>        <dbl> <chr>        <chr>           
##  1 2025-03-10 00:00:00       42 Kasir…         499 Cleo         Air Mineral     
##  2 2025-05-10 00:00:00       44 Kasir…         985 Khong Guan   Wafer           
##  3 2025-02-10 00:00:00       36 Kasir…         457 Aqua         Air Mineral     
##  4 2025-04-10 00:00:00       14 Kasir…         129 Sarimi       Mie Instan      
##  5 2025-01-10 00:00:00       42 Kasir…          13 Khong Guan   Wafer           
##  6 2025-03-10 00:00:00       26 Kasir…         841 Cleo         Air Mineral     
##  7 2025-01-10 00:00:00       32 Kasir…         496 Oasis        Air Mineral     
##  8 2025-04-10 00:00:00       29 Kasir…         700 Sarimi       Mie Instan      
##  9 2025-03-10 00:00:00       19 Kasir…         499 Toblerone    Coklat          
## 10 2025-02-10 00:00:00       22 Kasir…         585 Minyak Gore… Sembako         
## # ℹ 10,005 more rows
## # ℹ 4 more variables: quantity <dbl>, price_per_item <dbl>, total_price <dbl>,
## #   paymet_method <chr>
colSums(is.na(df))
##            dt_id         store_id            kasir      customer_id 
##                0                0                0                0 
##     product_name product_category         quantity   price_per_item 
##                0                0                0                0 
##      total_price    paymet_method 
##                0                0
summary (df)
##      dt_id                        store_id        kasir          
##  Min.   :2025-01-10 00:00:00   Min.   : 1.00   Length:10015      
##  1st Qu.:2025-02-10 00:00:00   1st Qu.:16.00   Class :character  
##  Median :2025-03-10 00:00:00   Median :31.00   Mode  :character  
##  Mean   :2025-03-10 18:30:35   Mean   :30.57                     
##  3rd Qu.:2025-04-10 00:00:00   3rd Qu.:45.00                     
##  Max.   :2025-05-10 00:00:00   Max.   :60.00                     
##   customer_id     product_name       product_category      quantity    
##  Min.   :   1.0   Length:10015       Length:10015       Min.   :1.000  
##  1st Qu.: 252.5   Class :character   Class :character   1st Qu.:2.000  
##  Median : 499.0   Mode  :character   Mode  :character   Median :3.000  
##  Mean   : 499.5                                         Mean   :3.026  
##  3rd Qu.: 746.0                                         3rd Qu.:4.000  
##  Max.   :1000.0                                         Max.   :5.000  
##  price_per_item   total_price     paymet_method     
##  Min.   : 3001   Min.   :  3002   Length:10015      
##  1st Qu.: 9652   1st Qu.: 21710   Class :character  
##  Median :16508   Median : 40556   Mode  :character  
##  Mean   :16499   Mean   : 49937                     
##  3rd Qu.:23290   3rd Qu.: 72548                     
##  Max.   :29996   Max.   :149970
# Menggunakan metode IQR untuk mendeteksi outlier pada kolom TransactionNo
Q1 <- quantile(df$quantity, 0.25)
Q3 <- quantile(df$quantity, 0.75)
IQR <- Q3 - Q1

# Batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Menandai outlier dengan kondisi apakah nilainya di luar batas bawah atau atas
outliersa <- df$quantity < lower_bound
outliers <- df$quantity > upper_bound
sum(outliersa)
## [1] 0
sum(outliers)
## [1] 0
# Visualisasi boxplot untuk melihat outlier pada kolom Quantity
boxplot(df$quantity, main = "Boxplot Quantity", col = "lightblue")

# Menangani outlier dengan winsorizing (mengganti nilai ekstrem dengan batas)
df$quantity[outliers] <- ifelse(df$quantity[outliers] < lower_bound, lower_bound, upper_bound)
# Cek jumlah duplikasi dalam dataset
sum(duplicated(df))  # Menghitung jumlah baris yang duplikat
## [1] 15
# Hapus duplikasi jika ada
df <- df[!duplicated(df), ]  # Menyaring hanya baris unik
sum(duplicated(df))
## [1] 0
summary(df)
##      dt_id                        store_id        kasir          
##  Min.   :2025-01-10 00:00:00   Min.   : 1.00   Length:10000      
##  1st Qu.:2025-02-10 00:00:00   1st Qu.:16.00   Class :character  
##  Median :2025-03-10 00:00:00   Median :31.00   Mode  :character  
##  Mean   :2025-03-10 17:55:32   Mean   :30.58                     
##  3rd Qu.:2025-04-10 00:00:00   3rd Qu.:45.00                     
##  Max.   :2025-05-10 00:00:00   Max.   :60.00                     
##   customer_id     product_name       product_category      quantity    
##  Min.   :   1.0   Length:10000       Length:10000       Min.   :1.000  
##  1st Qu.: 252.8   Class :character   Class :character   1st Qu.:2.000  
##  Median : 499.0   Mode  :character   Mode  :character   Median :3.000  
##  Mean   : 499.4                                         Mean   :3.026  
##  3rd Qu.: 746.0                                         3rd Qu.:4.000  
##  Max.   :1000.0                                         Max.   :5.000  
##  price_per_item   total_price     paymet_method     
##  Min.   : 3001   Min.   :  3002   Length:10000      
##  1st Qu.: 9656   1st Qu.: 21711   Class :character  
##  Median :16508   Median : 40556   Mode  :character  
##  Mean   :16502   Mean   : 49943                     
##  3rd Qu.:23292   3rd Qu.: 72560                     
##  Max.   :29996   Max.   :149970