1. Business Understanding

Project ini bertujuan membantu perusahaan memahami performa bisnis melalui 2 goals utama:

GOAL 1 — Segmentasi Produk Mengelompokkan produk menjadi: • Best Seller • Fast Moving • Slow Moving • Low Profit berdasarkan kombinasi Total_Quantity & Total_Revenue.

GOAL 2 — Segmentasi Customer Mengelompokkan pelanggan untuk strategi retensi: • High Value Customer • Medium Value Customer • Low Value Customer berdasarkan RFM (Recency, Frequency, Monetary).

2. Data Import & Cleaning

library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(skimr)
## Warning: package 'skimr' was built under R version 4.4.3
# Import data
data <- read_excel("Online Retail.xlsx")
# Cleaning
data <- data %>% filter(!is.na(CustomerID))         # hapus missing CustomerID
data <- data %>% filter(Quantity > 0, UnitPrice > 0) # hapus retur & harga tidak valid
data <- distinct(data)                              # hapus duplikat
data <- data %>% mutate(
  Revenue = Quantity * UnitPrice,
  InvoiceDate = as.Date(InvoiceDate)
)

head(data)
## # A tibble: 6 × 9
##   InvoiceNo StockCode Description      Quantity InvoiceDate UnitPrice CustomerID
##   <chr>     <chr>     <chr>               <dbl> <date>          <dbl>      <dbl>
## 1 536365    85123A    WHITE HANGING H…        6 2010-12-01       2.55      17850
## 2 536365    71053     WHITE METAL LAN…        6 2010-12-01       3.39      17850
## 3 536365    84406B    CREAM CUPID HEA…        8 2010-12-01       2.75      17850
## 4 536365    84029G    KNITTED UNION F…        6 2010-12-01       3.39      17850
## 5 536365    84029E    RED WOOLLY HOTT…        6 2010-12-01       3.39      17850
## 6 536365    22752     SET 7 BABUSHKA …        2 2010-12-01       7.65      17850
## # ℹ 2 more variables: Country <chr>, Revenue <dbl>
summary(data)
##   InvoiceNo          StockCode         Description           Quantity       
##  Length:392692      Length:392692      Length:392692      Min.   :    1.00  
##  Class :character   Class :character   Class :character   1st Qu.:    2.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    6.00  
##                                                           Mean   :   13.12  
##                                                           3rd Qu.:   12.00  
##                                                           Max.   :80995.00  
##   InvoiceDate           UnitPrice          CustomerID      Country         
##  Min.   :2010-12-01   Min.   :   0.001   Min.   :12346   Length:392692     
##  1st Qu.:2011-04-07   1st Qu.:   1.250   1st Qu.:13955   Class :character  
##  Median :2011-07-31   Median :   1.950   Median :15150   Mode  :character  
##  Mean   :2011-07-10   Mean   :   3.126   Mean   :15288                     
##  3rd Qu.:2011-10-20   3rd Qu.:   3.750   3rd Qu.:16791                     
##  Max.   :2011-12-09   Max.   :8142.750   Max.   :18287                     
##     Revenue         
##  Min.   :     0.00  
##  1st Qu.:     4.95  
##  Median :    12.45  
##  Mean   :    22.63  
##  3rd Qu.:    19.80  
##  Max.   :168469.60

3. Exploratory Data Analysis (Statistika Deskriptif)

# Statistik deskriptif Quantity, UnitPrice, Revenue
skim(data[, c("Quantity", "UnitPrice", "Revenue")])
Data summary
Name data[, c(“Quantity”, “Uni…
Number of rows 392692
Number of columns 3
_______________________
Column type frequency:
numeric 3
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Quantity 0 1 13.12 180.49 1 2.00 6.00 12.00 80995.00 ▇▁▁▁▁
UnitPrice 0 1 3.13 22.24 0 1.25 1.95 3.75 8142.75 ▇▁▁▁▁
Revenue 0 1 22.63 311.10 0 4.95 12.45 19.80 168469.60 ▇▁▁▁▁
# Distribusi Quantity
ggplot(data, aes(x = Quantity)) +
  geom_histogram(bins = 50) +
  labs(title = "Distribusi Quantity", x = "Quantity", y = "Frekuensi")

# Distribusi Unit Price
ggplot(data, aes(x = UnitPrice)) +
  geom_histogram(bins = 50) +
  labs(title = "Distribusi Harga Produk", x = "Unit Price", y = "Frekuensi")

# Distribusi Revenue
ggplot(data, aes(x = Revenue)) +
  geom_histogram(bins = 50) +
  labs(title = "Distribusi Revenue", x = "Revenue", y = "Frekuensi")

# Top produk berdasarkan revenue
top_products <- data %>%
  group_by(Description) %>%
  summarise(Total_Revenue = sum(Revenue)) %>%
  arrange(desc(Total_Revenue)) %>%
  slice(1:10)

ggplot(top_products, aes(x = reorder(Description, Total_Revenue), y = Total_Revenue)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top 10 Produk Berdasarkan Revenue", x = "Produk", y = "Total Revenue")

4. Goal 1 — Segmentasi Produk

product_seg <- data %>%
  group_by(Description) %>%
  summarise(
    Total_Quantity = sum(Quantity),
    Total_Revenue  = sum(Revenue)
  )

product_scaled <- scale(product_seg[, c("Total_Quantity", "Total_Revenue")])

set.seed(123)
kmeans_prod <- kmeans(product_scaled, centers = 4, nstart = 25)
product_seg$Cluster <- as.factor(kmeans_prod$cluster)

product_seg <- product_seg %>%
  mutate(Segment_Label = case_when(
    Cluster == 1 ~ "Best Seller",
    Cluster == 2 ~ "Fast Moving",
    Cluster == 3 ~ "Slow Moving",
    Cluster == 4 ~ "Low Profit"
  ))

ggplot(product_seg, aes(x = Total_Quantity, y = Total_Revenue, color = Segment_Label)) +
  geom_point(size = 3) +
  labs(title = "Segmentasi Produk Berdasarkan Profit & Frekuensi Penjualan",
       x = "Total Quantity", y = "Total Revenue")

5. Goal 2 — Segmentasi Customer (RFM Clustering)

ref_date <- max(data$InvoiceDate)

customer_seg <- data %>%
  group_by(CustomerID) %>%
  summarise(
    Recency = as.numeric(ref_date - max(InvoiceDate)),
    Frequency = n_distinct(InvoiceNo),
    Monetary = sum(Revenue)
  )

customer_scaled <- scale(customer_seg[, c("Recency", "Frequency", "Monetary")])

set.seed(123)
kmeans_cust <- kmeans(customer_scaled, centers = 3, nstart = 25)
customer_seg$Cluster <- as.factor(kmeans_cust$cluster)

customer_seg <- customer_seg %>%
  mutate(Customer_Label = case_when(
    Cluster == 1 ~ "High Value Customer",
    Cluster == 2 ~ "Medium Value Customer",
    Cluster == 3 ~ "Low Value Customer"
  ))

ggplot(customer_seg, aes(x = Frequency, y = Monetary, color = Customer_Label)) +
  geom_point(size = 3) +
  labs(title = "Segmentasi Customer Berdasarkan Nilai Loyalitas",
       x = "Frequency", y = "Total Spending")