Project ini bertujuan membantu perusahaan memahami performa bisnis melalui 2 goals utama:
GOAL 1 — Segmentasi Produk Mengelompokkan produk menjadi: • Best Seller • Fast Moving • Slow Moving • Low Profit berdasarkan kombinasi Total_Quantity & Total_Revenue.
GOAL 2 — Segmentasi Customer Mengelompokkan pelanggan untuk strategi retensi: • High Value Customer • Medium Value Customer • Low Value Customer berdasarkan RFM (Recency, Frequency, Monetary).
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(skimr)
## Warning: package 'skimr' was built under R version 4.4.3
# Import data
data <- read_excel("Online Retail.xlsx")
# Cleaning
data <- data %>% filter(!is.na(CustomerID)) # hapus missing CustomerID
data <- data %>% filter(Quantity > 0, UnitPrice > 0) # hapus retur & harga tidak valid
data <- distinct(data) # hapus duplikat
data <- data %>% mutate(
Revenue = Quantity * UnitPrice,
InvoiceDate = as.Date(InvoiceDate)
)
head(data)
## # A tibble: 6 × 9
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID
## <chr> <chr> <chr> <dbl> <date> <dbl> <dbl>
## 1 536365 85123A WHITE HANGING H… 6 2010-12-01 2.55 17850
## 2 536365 71053 WHITE METAL LAN… 6 2010-12-01 3.39 17850
## 3 536365 84406B CREAM CUPID HEA… 8 2010-12-01 2.75 17850
## 4 536365 84029G KNITTED UNION F… 6 2010-12-01 3.39 17850
## 5 536365 84029E RED WOOLLY HOTT… 6 2010-12-01 3.39 17850
## 6 536365 22752 SET 7 BABUSHKA … 2 2010-12-01 7.65 17850
## # ℹ 2 more variables: Country <chr>, Revenue <dbl>
summary(data)
## InvoiceNo StockCode Description Quantity
## Length:392692 Length:392692 Length:392692 Min. : 1.00
## Class :character Class :character Class :character 1st Qu.: 2.00
## Mode :character Mode :character Mode :character Median : 6.00
## Mean : 13.12
## 3rd Qu.: 12.00
## Max. :80995.00
## InvoiceDate UnitPrice CustomerID Country
## Min. :2010-12-01 Min. : 0.001 Min. :12346 Length:392692
## 1st Qu.:2011-04-07 1st Qu.: 1.250 1st Qu.:13955 Class :character
## Median :2011-07-31 Median : 1.950 Median :15150 Mode :character
## Mean :2011-07-10 Mean : 3.126 Mean :15288
## 3rd Qu.:2011-10-20 3rd Qu.: 3.750 3rd Qu.:16791
## Max. :2011-12-09 Max. :8142.750 Max. :18287
## Revenue
## Min. : 0.00
## 1st Qu.: 4.95
## Median : 12.45
## Mean : 22.63
## 3rd Qu.: 19.80
## Max. :168469.60
# Statistik deskriptif Quantity, UnitPrice, Revenue
skim(data[, c("Quantity", "UnitPrice", "Revenue")])
| Name | data[, c(“Quantity”, “Uni… |
| Number of rows | 392692 |
| Number of columns | 3 |
| _______________________ | |
| Column type frequency: | |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Quantity | 0 | 1 | 13.12 | 180.49 | 1 | 2.00 | 6.00 | 12.00 | 80995.00 | ▇▁▁▁▁ |
| UnitPrice | 0 | 1 | 3.13 | 22.24 | 0 | 1.25 | 1.95 | 3.75 | 8142.75 | ▇▁▁▁▁ |
| Revenue | 0 | 1 | 22.63 | 311.10 | 0 | 4.95 | 12.45 | 19.80 | 168469.60 | ▇▁▁▁▁ |
# Distribusi Quantity
ggplot(data, aes(x = Quantity)) +
geom_histogram(bins = 50) +
labs(title = "Distribusi Quantity", x = "Quantity", y = "Frekuensi")
# Distribusi Unit Price
ggplot(data, aes(x = UnitPrice)) +
geom_histogram(bins = 50) +
labs(title = "Distribusi Harga Produk", x = "Unit Price", y = "Frekuensi")
# Distribusi Revenue
ggplot(data, aes(x = Revenue)) +
geom_histogram(bins = 50) +
labs(title = "Distribusi Revenue", x = "Revenue", y = "Frekuensi")
# Top produk berdasarkan revenue
top_products <- data %>%
group_by(Description) %>%
summarise(Total_Revenue = sum(Revenue)) %>%
arrange(desc(Total_Revenue)) %>%
slice(1:10)
ggplot(top_products, aes(x = reorder(Description, Total_Revenue), y = Total_Revenue)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Top 10 Produk Berdasarkan Revenue", x = "Produk", y = "Total Revenue")
product_seg <- data %>%
group_by(Description) %>%
summarise(
Total_Quantity = sum(Quantity),
Total_Revenue = sum(Revenue)
)
product_scaled <- scale(product_seg[, c("Total_Quantity", "Total_Revenue")])
set.seed(123)
kmeans_prod <- kmeans(product_scaled, centers = 4, nstart = 25)
product_seg$Cluster <- as.factor(kmeans_prod$cluster)
product_seg <- product_seg %>%
mutate(Segment_Label = case_when(
Cluster == 1 ~ "Best Seller",
Cluster == 2 ~ "Fast Moving",
Cluster == 3 ~ "Slow Moving",
Cluster == 4 ~ "Low Profit"
))
ggplot(product_seg, aes(x = Total_Quantity, y = Total_Revenue, color = Segment_Label)) +
geom_point(size = 3) +
labs(title = "Segmentasi Produk Berdasarkan Profit & Frekuensi Penjualan",
x = "Total Quantity", y = "Total Revenue")
ref_date <- max(data$InvoiceDate)
customer_seg <- data %>%
group_by(CustomerID) %>%
summarise(
Recency = as.numeric(ref_date - max(InvoiceDate)),
Frequency = n_distinct(InvoiceNo),
Monetary = sum(Revenue)
)
customer_scaled <- scale(customer_seg[, c("Recency", "Frequency", "Monetary")])
set.seed(123)
kmeans_cust <- kmeans(customer_scaled, centers = 3, nstart = 25)
customer_seg$Cluster <- as.factor(kmeans_cust$cluster)
customer_seg <- customer_seg %>%
mutate(Customer_Label = case_when(
Cluster == 1 ~ "High Value Customer",
Cluster == 2 ~ "Medium Value Customer",
Cluster == 3 ~ "Low Value Customer"
))
ggplot(customer_seg, aes(x = Frequency, y = Monetary, color = Customer_Label)) +
geom_point(size = 3) +
labs(title = "Segmentasi Customer Berdasarkan Nilai Loyalitas",
x = "Frequency", y = "Total Spending")