library(readxl)
df <- read_xlsx("transaction.xlsx")
df
## # A tibble: 10,015 × 10
## dt_id store_id kasir customer_id product_name product_category
## <dttm> <dbl> <chr> <dbl> <chr> <chr>
## 1 2025-03-10 00:00:00 42 Kasir… NA Cleo Air Mineral
## 2 2025-05-10 00:00:00 44 Kasir… 985 Khong Guan Wafer
## 3 2025-02-10 00:00:00 36 Kasir… 457 Aqua Air Mineral
## 4 2025-04-10 00:00:00 14 Kasir… 129 Sarimi Mie Instan
## 5 2025-01-10 00:00:00 42 Kasir… 13 Khong Guan Wafer
## 6 2025-03-10 00:00:00 26 Kasir… 841 Cleo Air Mineral
## 7 2025-01-10 00:00:00 32 Kasir… 496 Oasis Air Mineral
## 8 2025-04-10 00:00:00 29 Kasir… 700 Sarimi Mie Instan
## 9 2025-03-10 00:00:00 19 Kasir… NA Toblerone Coklat
## 10 2025-02-10 00:00:00 22 Kasir… 585 Minyak Gore… Sembako
## # ℹ 10,005 more rows
## # ℹ 4 more variables: quantity <dbl>, price_per_item <dbl>, total_price <dbl>,
## # paymet_method <chr>
# Mendeteksi Missing Value
colSums(is.na(df))
## dt_id store_id kasir customer_id
## 0 1 0 9
## product_name product_category quantity price_per_item
## 0 0 4 11
## total_price paymet_method
## 10 0
# Visualisasi missing values
library(VIM)
## Warning: package 'VIM' was built under R version 4.5.2
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.5.2
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(VIM) # Library untuk visualisasi missing values
aggr(df, numbers = TRUE, prop = FALSE)

# Mengganti missing values dengan median di setiap kolom
df$store_id[is.na(df$store_id)] <- median(df$store_id, na.rm = TRUE)
df$customer_id[is.na(df$customer_id)] <- median(df$customer_id, na.rm = TRUE)
df$quantity[is.na(df$quantity)] <- median(df$quantity, na.rm = TRUE)
df$price_per_item[is.na(df$price_per_item)] <- median(df$price_per_item, na.rm = TRUE)
df$total_price[is.na(df$total_price)] <- median(df$total_price, na.rm = TRUE)
df
## # A tibble: 10,015 × 10
## dt_id store_id kasir customer_id product_name product_category
## <dttm> <dbl> <chr> <dbl> <chr> <chr>
## 1 2025-03-10 00:00:00 42 Kasir… 499 Cleo Air Mineral
## 2 2025-05-10 00:00:00 44 Kasir… 985 Khong Guan Wafer
## 3 2025-02-10 00:00:00 36 Kasir… 457 Aqua Air Mineral
## 4 2025-04-10 00:00:00 14 Kasir… 129 Sarimi Mie Instan
## 5 2025-01-10 00:00:00 42 Kasir… 13 Khong Guan Wafer
## 6 2025-03-10 00:00:00 26 Kasir… 841 Cleo Air Mineral
## 7 2025-01-10 00:00:00 32 Kasir… 496 Oasis Air Mineral
## 8 2025-04-10 00:00:00 29 Kasir… 700 Sarimi Mie Instan
## 9 2025-03-10 00:00:00 19 Kasir… 499 Toblerone Coklat
## 10 2025-02-10 00:00:00 22 Kasir… 585 Minyak Gore… Sembako
## # ℹ 10,005 more rows
## # ℹ 4 more variables: quantity <dbl>, price_per_item <dbl>, total_price <dbl>,
## # paymet_method <chr>
colSums(is.na(df))
## dt_id store_id kasir customer_id
## 0 0 0 0
## product_name product_category quantity price_per_item
## 0 0 0 0
## total_price paymet_method
## 0 0
summary (df)
## dt_id store_id kasir
## Min. :2025-01-10 00:00:00 Min. : 1.00 Length:10015
## 1st Qu.:2025-02-10 00:00:00 1st Qu.:16.00 Class :character
## Median :2025-03-10 00:00:00 Median :31.00 Mode :character
## Mean :2025-03-10 18:30:35 Mean :30.57
## 3rd Qu.:2025-04-10 00:00:00 3rd Qu.:45.00
## Max. :2025-05-10 00:00:00 Max. :60.00
## customer_id product_name product_category quantity
## Min. : 1.0 Length:10015 Length:10015 Min. :1.000
## 1st Qu.: 252.5 Class :character Class :character 1st Qu.:2.000
## Median : 499.0 Mode :character Mode :character Median :3.000
## Mean : 499.5 Mean :3.026
## 3rd Qu.: 746.0 3rd Qu.:4.000
## Max. :1000.0 Max. :5.000
## price_per_item total_price paymet_method
## Min. : 3001 Min. : 3002 Length:10015
## 1st Qu.: 9652 1st Qu.: 21710 Class :character
## Median :16508 Median : 40556 Mode :character
## Mean :16499 Mean : 49937
## 3rd Qu.:23290 3rd Qu.: 72548
## Max. :29996 Max. :149970
# Menggunakan metode IQR untuk mendeteksi outlier pada kolom TransactionNo
Q1 <- quantile(df$quantity, 0.25)
Q3 <- quantile(df$quantity, 0.75)
IQR <- Q3 - Q1
# Batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Menandai outlier dengan kondisi apakah nilainya di luar batas bawah atau atas
outliersa <- df$quantity < lower_bound
outliers <- df$quantity > upper_bound
sum(outliersa)
## [1] 0
sum(outliers)
## [1] 0
# Visualisasi boxplot untuk melihat outlier pada kolom Quantity
boxplot(df$quantity, main = "Boxplot Quantity", col = "lightblue")

# Menangani outlier dengan winsorizing (mengganti nilai ekstrem dengan batas)
df$quantity[outliers] <- ifelse(df$quantity[outliers] < lower_bound, lower_bound, upper_bound)
# Cek jumlah duplikasi dalam dataset
sum(duplicated(df)) # Menghitung jumlah baris yang duplikat
## [1] 15
# Hapus duplikasi jika ada
df <- df[!duplicated(df), ] # Menyaring hanya baris unik
sum(duplicated(df))
## [1] 0
summary(df)
## dt_id store_id kasir
## Min. :2025-01-10 00:00:00 Min. : 1.00 Length:10000
## 1st Qu.:2025-02-10 00:00:00 1st Qu.:16.00 Class :character
## Median :2025-03-10 00:00:00 Median :31.00 Mode :character
## Mean :2025-03-10 17:55:32 Mean :30.58
## 3rd Qu.:2025-04-10 00:00:00 3rd Qu.:45.00
## Max. :2025-05-10 00:00:00 Max. :60.00
## customer_id product_name product_category quantity
## Min. : 1.0 Length:10000 Length:10000 Min. :1.000
## 1st Qu.: 252.8 Class :character Class :character 1st Qu.:2.000
## Median : 499.0 Mode :character Mode :character Median :3.000
## Mean : 499.4 Mean :3.026
## 3rd Qu.: 746.0 3rd Qu.:4.000
## Max. :1000.0 Max. :5.000
## price_per_item total_price paymet_method
## Min. : 3001 Min. : 3002 Length:10000
## 1st Qu.: 9656 1st Qu.: 21711 Class :character
## Median :16508 Median : 40556 Mode :character
## Mean :16502 Mean : 49943
## 3rd Qu.:23292 3rd Qu.: 72560
## Max. :29996 Max. :149970