library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Goals: Mengetahui kapan pelanggan paling sering melakukan transaksi di toko online
# 1) Load data
online_retail <- read.csv(
  "Online Retail.csv",
  sep = ";",
  row.names = NULL,
  stringsAsFactors = FALSE
)
head(online_retail)
##   InvoiceNo StockCode                         Description Quantity
## 1    536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER        6
## 2    536365     71053                 WHITE METAL LANTERN        6
## 3    536365    84406B      CREAM CUPID HEARTS COAT HANGER        8
## 4    536365    84029G KNITTED UNION FLAG HOT WATER BOTTLE        6
## 5    536365    84029E      RED WOOLLY HOTTIE WHITE HEART.        6
## 6    536365     22752        SET 7 BABUSHKA NESTING BOXES        2
##        InvoiceDate UnitPrice CustomerID        Country
## 1 01/12/2010 08:26      2,55      17850 United Kingdom
## 2 01/12/2010 08:26      3,39      17850 United Kingdom
## 3 01/12/2010 08:26      2,75      17850 United Kingdom
## 4 01/12/2010 08:26      3,39      17850 United Kingdom
## 5 01/12/2010 08:26      3,39      17850 United Kingdom
## 6 01/12/2010 08:26      7,65      17850 United Kingdom
# 2) Validasi awal
head(online_retail)
##   InvoiceNo StockCode                         Description Quantity
## 1    536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER        6
## 2    536365     71053                 WHITE METAL LANTERN        6
## 3    536365    84406B      CREAM CUPID HEARTS COAT HANGER        8
## 4    536365    84029G KNITTED UNION FLAG HOT WATER BOTTLE        6
## 5    536365    84029E      RED WOOLLY HOTTIE WHITE HEART.        6
## 6    536365     22752        SET 7 BABUSHKA NESTING BOXES        2
##        InvoiceDate UnitPrice CustomerID        Country
## 1 01/12/2010 08:26      2,55      17850 United Kingdom
## 2 01/12/2010 08:26      3,39      17850 United Kingdom
## 3 01/12/2010 08:26      2,75      17850 United Kingdom
## 4 01/12/2010 08:26      3,39      17850 United Kingdom
## 5 01/12/2010 08:26      3,39      17850 United Kingdom
## 6 01/12/2010 08:26      7,65      17850 United Kingdom
str(online_retail)
## 'data.frame':    541909 obs. of  8 variables:
##  $ InvoiceNo  : chr  "536365" "536365" "536365" "536365" ...
##  $ StockCode  : chr  "85123A" "71053" "84406B" "84029G" ...
##  $ Description: chr  "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity   : int  6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: chr  "01/12/2010 08:26" "01/12/2010 08:26" "01/12/2010 08:26" "01/12/2010 08:26" ...
##  $ UnitPrice  : chr  "2,55" "3,39" "2,75" "3,39" ...
##  $ CustomerID : int  17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
##  $ Country    : chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
dim(online_retail)
## [1] 541909      8

DATA CLEANING

# Hapus missing CustomerID
online_retail <- online_retail[!is.na(online_retail$CustomerID), ]
# Hanya transaksi valid (pembelian)
online_retail <- online_retail[online_retail$Quantity > 0, ]
online_retail <- online_retail[online_retail$UnitPrice > 0, ]
# Convert tanggal
online_retail$InvoiceDate <- as.POSIXct(
  online_retail$InvoiceDate,
  format = "%d/%m/%Y %H:%M"
)
# Hapus duplikasi yang relevan
online_retail <- online_retail %>%
  distinct(InvoiceNo, StockCode, .keep_all = TRUE)
# Normalisasi teks
online_retail$Description <- trimws(toupper(online_retail$Description))
online_retail$Country <- trimws(toupper(online_retail$Country))
# Cek hasil setelah cleaning
summary(online_retail)
##   InvoiceNo          StockCode         Description           Quantity       
##  Length:387841      Length:387841      Length:387841      Min.   :    1.00  
##  Class :character   Class :character   Class :character   1st Qu.:    2.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    6.00  
##                                                           Mean   :   13.22  
##                                                           3rd Qu.:   12.00  
##                                                           Max.   :80995.00  
##   InvoiceDate                   UnitPrice           CustomerID   
##  Min.   :2010-12-01 08:26:00   Length:387841      Min.   :12346  
##  1st Qu.:2011-04-07 10:20:00   Class :character   1st Qu.:13941  
##  Median :2011-07-29 13:39:00   Mode  :character   Median :15145  
##  Mean   :2011-07-10 11:30:27                      Mean   :15282  
##  3rd Qu.:2011-10-20 11:03:00                      3rd Qu.:16790  
##  Max.   :2011-12-09 12:50:00                      Max.   :18287  
##    Country         
##  Length:387841     
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
dim(online_retail)
## [1] 387841      8

DATA PREPARATION (Time Features)

online_retail$Hour  <- as.integer(format(online_retail$InvoiceDate, "%H"))
online_retail$Day   <- weekdays(online_retail$InvoiceDate)
online_retail$Month <- format(online_retail$InvoiceDate, "%B")

EDA: Pola waktu transaksi

hourly_trx <- table(online_retail$Hour)

barplot(
  hourly_trx,
  col = "skyblue",
  main = "Jumlah Transaksi Berdasarkan Jam",
  xlab = "Jam",
  ylab = "Jumlah Transaksi",
  las = 2
)

daily_trx   <- table(online_retail$Day)

barplot(
  daily_trx, 
  col = "lightblue",
  main="Jumlah Transaksi Berdasarkan Hari", 
  xlab="Hari",
  ylab="Jumlah Transaksi", 
  las=2
)

monthly_trx <- table(online_retail$Month)

barplot(
  monthly_trx, 
  col = "darkblue",
  main="Jumlah Transaksi Berdasarkan Bulan", 
  xlab="Bulan",
  ylab="Jumlah Transaksi", 
  las=2
)

# Insight tambahan
total_transaksi <- length(unique(online_retail$InvoiceNo))
total_transaksi
## [1] 18532