library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Goals: Mengetahui kapan pelanggan paling sering melakukan transaksi di toko online
# 1) Load data
online_retail <- read.csv(
"Online Retail.csv",
sep = ";",
row.names = NULL,
stringsAsFactors = FALSE
)
head(online_retail)
## InvoiceNo StockCode Description Quantity
## 1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6
## 2 536365 71053 WHITE METAL LANTERN 6
## 3 536365 84406B CREAM CUPID HEARTS COAT HANGER 8
## 4 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6
## 5 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6
## 6 536365 22752 SET 7 BABUSHKA NESTING BOXES 2
## InvoiceDate UnitPrice CustomerID Country
## 1 01/12/2010 08:26 2,55 17850 United Kingdom
## 2 01/12/2010 08:26 3,39 17850 United Kingdom
## 3 01/12/2010 08:26 2,75 17850 United Kingdom
## 4 01/12/2010 08:26 3,39 17850 United Kingdom
## 5 01/12/2010 08:26 3,39 17850 United Kingdom
## 6 01/12/2010 08:26 7,65 17850 United Kingdom
# 2) Validasi awal
head(online_retail)
## InvoiceNo StockCode Description Quantity
## 1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6
## 2 536365 71053 WHITE METAL LANTERN 6
## 3 536365 84406B CREAM CUPID HEARTS COAT HANGER 8
## 4 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6
## 5 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6
## 6 536365 22752 SET 7 BABUSHKA NESTING BOXES 2
## InvoiceDate UnitPrice CustomerID Country
## 1 01/12/2010 08:26 2,55 17850 United Kingdom
## 2 01/12/2010 08:26 3,39 17850 United Kingdom
## 3 01/12/2010 08:26 2,75 17850 United Kingdom
## 4 01/12/2010 08:26 3,39 17850 United Kingdom
## 5 01/12/2010 08:26 3,39 17850 United Kingdom
## 6 01/12/2010 08:26 7,65 17850 United Kingdom
str(online_retail)
## 'data.frame': 541909 obs. of 8 variables:
## $ InvoiceNo : chr "536365" "536365" "536365" "536365" ...
## $ StockCode : chr "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : int 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: chr "01/12/2010 08:26" "01/12/2010 08:26" "01/12/2010 08:26" "01/12/2010 08:26" ...
## $ UnitPrice : chr "2,55" "3,39" "2,75" "3,39" ...
## $ CustomerID : int 17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
dim(online_retail)
## [1] 541909 8
DATA CLEANING
# Hapus missing CustomerID
online_retail <- online_retail[!is.na(online_retail$CustomerID), ]
# Hanya transaksi valid (pembelian)
online_retail <- online_retail[online_retail$Quantity > 0, ]
online_retail <- online_retail[online_retail$UnitPrice > 0, ]
# Convert tanggal
online_retail$InvoiceDate <- as.POSIXct(
online_retail$InvoiceDate,
format = "%d/%m/%Y %H:%M"
)
# Hapus duplikasi yang relevan
online_retail <- online_retail %>%
distinct(InvoiceNo, StockCode, .keep_all = TRUE)
# Normalisasi teks
online_retail$Description <- trimws(toupper(online_retail$Description))
online_retail$Country <- trimws(toupper(online_retail$Country))
# Cek hasil setelah cleaning
summary(online_retail)
## InvoiceNo StockCode Description Quantity
## Length:387841 Length:387841 Length:387841 Min. : 1.00
## Class :character Class :character Class :character 1st Qu.: 2.00
## Mode :character Mode :character Mode :character Median : 6.00
## Mean : 13.22
## 3rd Qu.: 12.00
## Max. :80995.00
## InvoiceDate UnitPrice CustomerID
## Min. :2010-12-01 08:26:00 Length:387841 Min. :12346
## 1st Qu.:2011-04-07 10:20:00 Class :character 1st Qu.:13941
## Median :2011-07-29 13:39:00 Mode :character Median :15145
## Mean :2011-07-10 11:30:27 Mean :15282
## 3rd Qu.:2011-10-20 11:03:00 3rd Qu.:16790
## Max. :2011-12-09 12:50:00 Max. :18287
## Country
## Length:387841
## Class :character
## Mode :character
##
##
##
dim(online_retail)
## [1] 387841 8
DATA PREPARATION (Time Features)
online_retail$Hour <- as.integer(format(online_retail$InvoiceDate, "%H"))
online_retail$Day <- weekdays(online_retail$InvoiceDate)
online_retail$Month <- format(online_retail$InvoiceDate, "%B")
EDA: Pola waktu transaksi
hourly_trx <- table(online_retail$Hour)
barplot(
hourly_trx,
col = "skyblue",
main = "Jumlah Transaksi Berdasarkan Jam",
xlab = "Jam",
ylab = "Jumlah Transaksi",
las = 2
)
daily_trx <- table(online_retail$Day)
barplot(
daily_trx,
col = "lightblue",
main="Jumlah Transaksi Berdasarkan Hari",
xlab="Hari",
ylab="Jumlah Transaksi",
las=2
)
monthly_trx <- table(online_retail$Month)
barplot(
monthly_trx,
col = "darkblue",
main="Jumlah Transaksi Berdasarkan Bulan",
xlab="Bulan",
ylab="Jumlah Transaksi",
las=2
)
# Insight tambahan
total_transaksi <- length(unique(online_retail$InvoiceNo))
total_transaksi
## [1] 18532