library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(readxl)

## Warning: package 'readxl' was built under R version 4.5.2

library(tidyr)

## Warning: package 'tidyr' was built under R version 4.5.2

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.2

## Warning: package 'readr' was built under R version 4.5.2

## Warning: package 'purrr' was built under R version 4.5.2

## Warning: package 'forcats' was built under R version 4.5.2

## Warning: package 'lubridate' was built under R version 4.5.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ readr     2.1.5
## ✔ lubridate 1.9.4     ✔ stringr   1.5.2
## ✔ purrr     1.2.0     ✔ tibble    3.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)

1. Import Data

retail <- read_excel("Online Retail.xlsx")
retail

## # A tibble: 541,909 × 8
##    InvoiceNo StockCode Description        Quantity InvoiceDate         UnitPrice
##    <chr>     <chr>     <chr>                 <dbl> <dttm>                  <dbl>
##  1 536365    85123A    WHITE HANGING HEA…        6 2010-12-01 08:26:00      2.55
##  2 536365    71053     WHITE METAL LANTE…        6 2010-12-01 08:26:00      3.39
##  3 536365    84406B    CREAM CUPID HEART…        8 2010-12-01 08:26:00      2.75
##  4 536365    84029G    KNITTED UNION FLA…        6 2010-12-01 08:26:00      3.39
##  5 536365    84029E    RED WOOLLY HOTTIE…        6 2010-12-01 08:26:00      3.39
##  6 536365    22752     SET 7 BABUSHKA NE…        2 2010-12-01 08:26:00      7.65
##  7 536365    21730     GLASS STAR FROSTE…        6 2010-12-01 08:26:00      4.25
##  8 536366    22633     HAND WARMER UNION…        6 2010-12-01 08:28:00      1.85
##  9 536366    22632     HAND WARMER RED P…        6 2010-12-01 08:28:00      1.85
## 10 536367    84879     ASSORTED COLOUR B…       32 2010-12-01 08:34:00      1.69
## # ℹ 541,899 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>

Lihat 6 baris pertama

head(retail)

## # A tibble: 6 × 8
##   InvoiceNo StockCode Description         Quantity InvoiceDate         UnitPrice
##   <chr>     <chr>     <chr>                  <dbl> <dttm>                  <dbl>
## 1 536365    85123A    WHITE HANGING HEAR…        6 2010-12-01 08:26:00      2.55
## 2 536365    71053     WHITE METAL LANTERN        6 2010-12-01 08:26:00      3.39
## 3 536365    84406B    CREAM CUPID HEARTS…        8 2010-12-01 08:26:00      2.75
## 4 536365    84029G    KNITTED UNION FLAG…        6 2010-12-01 08:26:00      3.39
## 5 536365    84029E    RED WOOLLY HOTTIE …        6 2010-12-01 08:26:00      3.39
## 6 536365    22752     SET 7 BABUSHKA NES…        2 2010-12-01 08:26:00      7.65
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>

Struktur data

str(retail)

## tibble [541,909 × 8] (S3: tbl_df/tbl/data.frame)
##  $ InvoiceNo  : chr [1:541909] "536365" "536365" "536365" "536365" ...
##  $ StockCode  : chr [1:541909] "85123A" "71053" "84406B" "84029G" ...
##  $ Description: chr [1:541909] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity   : num [1:541909] 6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: POSIXct[1:541909], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
##  $ UnitPrice  : num [1:541909] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : num [1:541909] 17850 17850 17850 17850 17850 ...
##  $ Country    : chr [1:541909] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...

Ringkasan statistik numerik

summary(retail)

##   InvoiceNo          StockCode         Description           Quantity         
##  Length:541909      Length:541909      Length:541909      Min.   :-80995.000  
##  Class :character   Class :character   Class :character   1st Qu.:     1.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :     3.000  
##                                                           Mean   :     9.552  
##                                                           3rd Qu.:    10.000  
##                                                           Max.   : 80995.000  
##                                                                               
##   InvoiceDate                    UnitPrice            CustomerID    
##  Min.   :2010-12-01 08:26:00   Min.   :-11062.060   Min.   :12346   
##  1st Qu.:2011-03-28 11:34:00   1st Qu.:     1.250   1st Qu.:13953   
##  Median :2011-07-19 17:17:00   Median :     2.080   Median :15152   
##  Mean   :2011-07-04 13:34:57   Mean   :     4.611   Mean   :15288   
##  3rd Qu.:2011-10-19 11:27:00   3rd Qu.:     4.130   3rd Qu.:16791   
##  Max.   :2011-12-09 12:50:00   Max.   : 38970.000   Max.   :18287   
##                                                     NA's   :135080  
##    Country         
##  Length:541909     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

2. CEK KUALITAS DATA & MISSING VALUE

Cek jumlah missing value per kolom

colSums(is.na(retail))

##   InvoiceNo   StockCode Description    Quantity InvoiceDate   UnitPrice 
##           0           0        1454           0           0           0 
##  CustomerID     Country 
##      135080           0

Proporsi missing value per kolom (%)

colMeans(is.na(retail)) * 100

##   InvoiceNo   StockCode Description    Quantity InvoiceDate   UnitPrice 
##   0.0000000   0.0000000   0.2683107   0.0000000   0.0000000   0.0000000 
##  CustomerID     Country 
##  24.9266943   0.0000000

Cek duplikasi baris

sum(duplicated(retail))

## [1] 5268

3. DATA CLEANING DASAR

3.1. Buat salinan data agar aman

retail_clean <- retail

3.2. Buang transaksi dengan Quantity <= 0 atau UnitPrice <= 0

retail_clean <- retail_clean %>%
  filter(Quantity > 0,
         UnitPrice > 0)

3.3. Hapus baris dengan CustomerID NA (kalau fokus ke analisis pelanggan)

retail_clean <- retail_clean %>%
  filter(!is.na(CustomerID))

3.4. Buang invoice yang merupakan cancel/retur

(di dataset Online Retail, invoice cancel biasanya diawali huruf “C”)

retail_clean <- retail_clean %>%
  mutate(InvoiceNo = as.character(InvoiceNo)) %>%
  filter(!str_starts(InvoiceNo, "C"))

Cek lagi jumlah baris setelah dibersihkan

nrow(retail); nrow(retail_clean)

## [1] 541909

## [1] 397884

4. FEATURE ENGINEERING (MENAMBAH KOLOM BARU)

Total nilai transaksi tiap baris

retail_clean <- retail_clean %>%
  mutate(
    # Total nilai transaksi tiap baris
    TotalPrice = Quantity * UnitPrice,
    
    # Ekstrak tanggal (tanpa jam)
    InvoiceDate = as.POSIXct(InvoiceDate),
    InvoiceDateDate = as.Date(InvoiceDate),
    
    Year  = year(InvoiceDateDate),
    Month = month(InvoiceDateDate, label = TRUE, abbr = TRUE),
    Weekday = wday(InvoiceDateDate, label = TRUE, abbr = TRUE)
  )
head(retail_clean)

## # A tibble: 6 × 13
##   InvoiceNo StockCode Description         Quantity InvoiceDate         UnitPrice
##   <chr>     <chr>     <chr>                  <dbl> <dttm>                  <dbl>
## 1 536365    85123A    WHITE HANGING HEAR…        6 2010-12-01 08:26:00      2.55
## 2 536365    71053     WHITE METAL LANTERN        6 2010-12-01 08:26:00      3.39
## 3 536365    84406B    CREAM CUPID HEARTS…        8 2010-12-01 08:26:00      2.75
## 4 536365    84029G    KNITTED UNION FLAG…        6 2010-12-01 08:26:00      3.39
## 5 536365    84029E    RED WOOLLY HOTTIE …        6 2010-12-01 08:26:00      3.39
## 6 536365    22752     SET 7 BABUSHKA NES…        2 2010-12-01 08:26:00      7.65
## # ℹ 7 more variables: CustomerID <dbl>, Country <chr>, TotalPrice <dbl>,
## #   InvoiceDateDate <date>, Year <dbl>, Month <ord>, Weekday <ord>

5. ANALISIS DESKRIPTIF: PENJUALAN SECARA UMUM

5.1. Total penjualan (Revenue) keseluruhan

total_revenue <- sum(retail_clean$TotalPrice)
total_revenue

## [1] 8911408

5.2. Ringkasan revenue per tahun

revenue_by_year <- retail_clean %>%
  group_by(Year) %>%
  summarise(
    total_revenue = sum(TotalPrice),
    n_tx = n()
  ) %>%
  arrange(Year)

revenue_by_year

## # A tibble: 2 × 3
##    Year total_revenue   n_tx
##   <dbl>         <dbl>  <int>
## 1  2010       572714.  26157
## 2  2011      8338694. 371727

5.3. Revenue per negara (Top 10)

revenue_by_country <- retail_clean %>%
  group_by(Country) %>%
  summarise(
    total_revenue = sum(TotalPrice),
    n_customer = n_distinct(CustomerID),
    n_tx = n()
  ) %>%
  arrange(desc(total_revenue))

head(revenue_by_country, 10)

## # A tibble: 10 × 4
##    Country        total_revenue n_customer   n_tx
##    <chr>                  <dbl>      <int>  <int>
##  1 United Kingdom      7308392.       3920 354321
##  2 Netherlands          285446.          9   2359
##  3 EIRE                 265546.          3   7236
##  4 Germany              228867.         94   9040
##  5 France               209024.         87   8341
##  6 Australia            138521.          9   1182
##  7 Spain                 61577.         30   2484
##  8 Switzerland           56444.         21   1841
##  9 Belgium               41196.         25   2031
## 10 Sweden                38378.          8    451

Plot Top 10 negara berdasarkan revenue

revenue_by_country %>%
  slice_max(order_by = total_revenue, n = 10) %>%
  ggplot(aes(x = reorder(Country, total_revenue),
             y = total_revenue)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 Negara berdasarkan Revenue",
    x = "Negara",
    y = "Total Revenue"
  )

# 5.4. Tren revenue harian

daily_revenue <- retail_clean %>%
  group_by(InvoiceDateDate) %>%
  summarise(
    daily_revenue = sum(TotalPrice),
    n_tx = n()
  )

ggplot(daily_revenue, aes(x = InvoiceDateDate, y = daily_revenue)) +
  geom_line() +
  labs(
    title = "Tren Revenue Harian",
    x = "Tanggal",
    y = "Revenue per Hari"
  )

# 6. ANALISIS PRODUK: TOP SELLING ITEMS # 6.1. Produk dengan penjualan tertinggi berdasarkan revenue

top_products <- retail_clean %>%
  group_by(StockCode, Description) %>%
  summarise(
    total_revenue = sum(TotalPrice),
    total_qty = sum(Quantity),
    n_tx = n()
  ) %>%
  arrange(desc(total_revenue))

## `summarise()` has grouped output by 'StockCode'. You can override using the
## `.groups` argument.

head(top_products, 10)

## # A tibble: 10 × 5
## # Groups:   StockCode [10]
##    StockCode Description                        total_revenue total_qty  n_tx
##    <chr>     <chr>                                      <dbl>     <dbl> <int>
##  1 23843     PAPER CRAFT , LITTLE BIRDIE              168470.     80995     1
##  2 22423     REGENCY CAKESTAND 3 TIER                 142593.     12402  1723
##  3 85123A    WHITE HANGING HEART T-LIGHT HOLDER       100448.     36725  2028
##  4 85099B    JUMBO BAG RED RETROSPOT                   85221.     46181  1618
##  5 23166     MEDIUM CERAMIC TOP STORAGE JAR            81417.     77916   198
##  6 POST      POSTAGE                                   77804.      3120  1099
##  7 47566     PARTY BUNTING                             68844.     15291  1396
##  8 84879     ASSORTED COLOUR BIRD ORNAMENT             56580.     35362  1408
##  9 M         Manual                                    53780.      7173   284
## 10 23084     RABBIT NIGHT LIGHT                        51346.     27202   842

Plot Top 10 produk berdasarkan revenue

top_products %>%
  slice_max(order_by = total_revenue, n = 10) %>%
  ggplot(aes(x = reorder(Description, total_revenue),
             y = total_revenue)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 Produk berdasarkan Revenue",
    x = "Produk",
    y = "Total Revenue"
  )

# 7. ANALISIS CUSTOMER: RFM (RECENCY, FREQUENCY, MONETARY) # 7.1. Tentukan “tanggal acuan” (tanggal terakhir di data)

max_date <- max(retail_clean$InvoiceDateDate, na.rm = TRUE)
max_date

## [1] "2011-12-09"

7.2. Hitung RFM per customer

rfm <- retail_clean %>%
  group_by(CustomerID) %>%
  summarise(
    recency   = as.numeric(max_date - max(InvoiceDateDate)),   # hari sejak transaksi terakhir
    frequency = n_distinct(InvoiceNo),                        # berapa kali transaksi
    monetary  = sum(TotalPrice)                               # total uang dibelanjakan
  )

head(rfm)

## # A tibble: 6 × 4
##   CustomerID recency frequency monetary
##        <dbl>   <dbl>     <int>    <dbl>
## 1      12346     325         1   77184.
## 2      12347       2         7    4310 
## 3      12348      75         4    1797.
## 4      12349      18         1    1758.
## 5      12350     310         1     334.
## 6      12352      36         8    2506.

summary(rfm)

##    CustomerID       recency         frequency          monetary        
##  Min.   :12346   Min.   :  0.00   Min.   :  1.000   Min.   :     3.75  
##  1st Qu.:13813   1st Qu.: 17.00   1st Qu.:  1.000   1st Qu.:   307.42  
##  Median :15300   Median : 50.00   Median :  2.000   Median :   674.49  
##  Mean   :15300   Mean   : 92.06   Mean   :  4.272   Mean   :  2054.27  
##  3rd Qu.:16779   3rd Qu.:141.75   3rd Qu.:  5.000   3rd Qu.:  1661.74  
##  Max.   :18287   Max.   :373.00   Max.   :209.000   Max.   :280206.02

7.3. Segmentasi sederhana berdasarkan kuantil

rfm_segment <- rfm %>%
  mutate(
    R_score = ntile(-recency, 3),   # recency semakin kecil => semakin bagus
    F_score = ntile(frequency, 3),
    M_score = ntile(monetary, 3),
    RFM_score = R_score + F_score + M_score
  )

head(rfm_segment)

## # A tibble: 6 × 8
##   CustomerID recency frequency monetary R_score F_score M_score RFM_score
##        <dbl>   <dbl>     <int>    <dbl>   <int>   <int>   <int>     <int>
## 1      12346     325         1   77184.       1       1       3         5
## 2      12347       2         7    4310        3       3       3         9
## 3      12348      75         4    1797.       2       2       3         7
## 4      12349      18         1    1758.       3       1       3         7
## 5      12350     310         1     334.       1       1       1         3
## 6      12352      36         8    2506.       2       3       3         8

7.4. Lihat distribusi skor RFM

rfm_segment %>%
  ggplot(aes(x = RFM_score)) +
  geom_histogram(binwidth = 1, boundary = 0) +
  labs(
    title = "Distribusi Skor RFM Customer",
    x = "Skor RFM (R+F+M)",
    y = "Jumlah Customer"
  )

# 8. CONTOH SEGMENTASI CUSTOMER BERDASARKAN RFM

rfm_segment <- rfm_segment %>%
  mutate(
    segment = case_when(
      RFM_score >= 8 ~ "Best Customers",
      RFM_score >= 5 ~ "Loyal / Potensial",
      TRUE           ~ "Perlu Perhatian"
    )
  )

Ringkasan segmentasi

rfm_segment %>%
  group_by(segment) %>%
  summarise(
    n_customer = n(),
    avg_monetary = mean(monetary),
    avg_frequency = mean(frequency),
    avg_recency = mean(recency)
  )

## # A tibble: 3 × 5
##   segment           n_customer avg_monetary avg_frequency avg_recency
##   <chr>                  <int>        <dbl>         <dbl>       <dbl>
## 1 Best Customers          1281        5336.          9.82        19.6
## 2 Loyal / Potensial       1812         966.          2.51        74.4
## 3 Perlu Perhatian         1245         261.          1.13       192.

9. CONTOH: MENGGABUNGKAN SEGMENTASI CUSTOMER DENGAN NEGARA

customer_country <- retail_clean %>%
  group_by(CustomerID, Country) %>%
  summarise(
    total_revenue = sum(TotalPrice),
    .groups = "drop"
  )

rfm_with_country <- rfm_segment %>%
  left_join(customer_country, by = "CustomerID")

Lihat distribusi segment per negara (misal Top 5 negara)

rfm_with_country %>%
  group_by(Country, segment) %>%
  summarise(
    n_customer = n_distinct(CustomerID),
    .groups = "drop"
  ) %>%
  group_by(Country) %>%
  mutate(
    prop = n_customer / sum(n_customer)
  ) %>%
  ungroup() %>%
  slice_max(order_by = n_customer, n = 30) %>%
  ggplot(aes(x = reorder(Country, n_customer),
             y = n_customer,
             fill = segment)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Distribusi Segment Customer per Negara",
    x = "Negara",
    y = "Jumlah Customer"
  )

Proyek Pengolahan Data Online Retail

Naufal Fawaaz Shofa

2025-12-06