library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.2
## ✔ purrr 1.2.0 ✔ tibble 3.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
retail <- read_excel("Online Retail.xlsx")
retail
## # A tibble: 541,909 × 8
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536365 85123A WHITE HANGING HEA… 6 2010-12-01 08:26:00 2.55
## 2 536365 71053 WHITE METAL LANTE… 6 2010-12-01 08:26:00 3.39
## 3 536365 84406B CREAM CUPID HEART… 8 2010-12-01 08:26:00 2.75
## 4 536365 84029G KNITTED UNION FLA… 6 2010-12-01 08:26:00 3.39
## 5 536365 84029E RED WOOLLY HOTTIE… 6 2010-12-01 08:26:00 3.39
## 6 536365 22752 SET 7 BABUSHKA NE… 2 2010-12-01 08:26:00 7.65
## 7 536365 21730 GLASS STAR FROSTE… 6 2010-12-01 08:26:00 4.25
## 8 536366 22633 HAND WARMER UNION… 6 2010-12-01 08:28:00 1.85
## 9 536366 22632 HAND WARMER RED P… 6 2010-12-01 08:28:00 1.85
## 10 536367 84879 ASSORTED COLOUR B… 32 2010-12-01 08:34:00 1.69
## # ℹ 541,899 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
head(retail)
## # A tibble: 6 × 8
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536365 85123A WHITE HANGING HEAR… 6 2010-12-01 08:26:00 2.55
## 2 536365 71053 WHITE METAL LANTERN 6 2010-12-01 08:26:00 3.39
## 3 536365 84406B CREAM CUPID HEARTS… 8 2010-12-01 08:26:00 2.75
## 4 536365 84029G KNITTED UNION FLAG… 6 2010-12-01 08:26:00 3.39
## 5 536365 84029E RED WOOLLY HOTTIE … 6 2010-12-01 08:26:00 3.39
## 6 536365 22752 SET 7 BABUSHKA NES… 2 2010-12-01 08:26:00 7.65
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
str(retail)
## tibble [541,909 × 8] (S3: tbl_df/tbl/data.frame)
## $ InvoiceNo : chr [1:541909] "536365" "536365" "536365" "536365" ...
## $ StockCode : chr [1:541909] "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr [1:541909] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : num [1:541909] 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: POSIXct[1:541909], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
## $ UnitPrice : num [1:541909] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : num [1:541909] 17850 17850 17850 17850 17850 ...
## $ Country : chr [1:541909] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
summary(retail)
## InvoiceNo StockCode Description Quantity
## Length:541909 Length:541909 Length:541909 Min. :-80995.000
## Class :character Class :character Class :character 1st Qu.: 1.000
## Mode :character Mode :character Mode :character Median : 3.000
## Mean : 9.552
## 3rd Qu.: 10.000
## Max. : 80995.000
##
## InvoiceDate UnitPrice CustomerID
## Min. :2010-12-01 08:26:00 Min. :-11062.060 Min. :12346
## 1st Qu.:2011-03-28 11:34:00 1st Qu.: 1.250 1st Qu.:13953
## Median :2011-07-19 17:17:00 Median : 2.080 Median :15152
## Mean :2011-07-04 13:34:57 Mean : 4.611 Mean :15288
## 3rd Qu.:2011-10-19 11:27:00 3rd Qu.: 4.130 3rd Qu.:16791
## Max. :2011-12-09 12:50:00 Max. : 38970.000 Max. :18287
## NA's :135080
## Country
## Length:541909
## Class :character
## Mode :character
##
##
##
##
colSums(is.na(retail))
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## 0 0 1454 0 0 0
## CustomerID Country
## 135080 0
colMeans(is.na(retail)) * 100
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## 0.0000000 0.0000000 0.2683107 0.0000000 0.0000000 0.0000000
## CustomerID Country
## 24.9266943 0.0000000
sum(duplicated(retail))
## [1] 5268
retail_clean <- retail
retail_clean <- retail_clean %>%
filter(Quantity > 0,
UnitPrice > 0)
retail_clean <- retail_clean %>%
filter(!is.na(CustomerID))
retail_clean <- retail_clean %>%
mutate(InvoiceNo = as.character(InvoiceNo)) %>%
filter(!str_starts(InvoiceNo, "C"))
nrow(retail); nrow(retail_clean)
## [1] 541909
## [1] 397884
retail_clean <- retail_clean %>%
mutate(
# Total nilai transaksi tiap baris
TotalPrice = Quantity * UnitPrice,
# Ekstrak tanggal (tanpa jam)
InvoiceDate = as.POSIXct(InvoiceDate),
InvoiceDateDate = as.Date(InvoiceDate),
Year = year(InvoiceDateDate),
Month = month(InvoiceDateDate, label = TRUE, abbr = TRUE),
Weekday = wday(InvoiceDateDate, label = TRUE, abbr = TRUE)
)
head(retail_clean)
## # A tibble: 6 × 13
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536365 85123A WHITE HANGING HEAR… 6 2010-12-01 08:26:00 2.55
## 2 536365 71053 WHITE METAL LANTERN 6 2010-12-01 08:26:00 3.39
## 3 536365 84406B CREAM CUPID HEARTS… 8 2010-12-01 08:26:00 2.75
## 4 536365 84029G KNITTED UNION FLAG… 6 2010-12-01 08:26:00 3.39
## 5 536365 84029E RED WOOLLY HOTTIE … 6 2010-12-01 08:26:00 3.39
## 6 536365 22752 SET 7 BABUSHKA NES… 2 2010-12-01 08:26:00 7.65
## # ℹ 7 more variables: CustomerID <dbl>, Country <chr>, TotalPrice <dbl>,
## # InvoiceDateDate <date>, Year <dbl>, Month <ord>, Weekday <ord>
total_revenue <- sum(retail_clean$TotalPrice)
total_revenue
## [1] 8911408
revenue_by_year <- retail_clean %>%
group_by(Year) %>%
summarise(
total_revenue = sum(TotalPrice),
n_tx = n()
) %>%
arrange(Year)
revenue_by_year
## # A tibble: 2 × 3
## Year total_revenue n_tx
## <dbl> <dbl> <int>
## 1 2010 572714. 26157
## 2 2011 8338694. 371727
revenue_by_country <- retail_clean %>%
group_by(Country) %>%
summarise(
total_revenue = sum(TotalPrice),
n_customer = n_distinct(CustomerID),
n_tx = n()
) %>%
arrange(desc(total_revenue))
head(revenue_by_country, 10)
## # A tibble: 10 × 4
## Country total_revenue n_customer n_tx
## <chr> <dbl> <int> <int>
## 1 United Kingdom 7308392. 3920 354321
## 2 Netherlands 285446. 9 2359
## 3 EIRE 265546. 3 7236
## 4 Germany 228867. 94 9040
## 5 France 209024. 87 8341
## 6 Australia 138521. 9 1182
## 7 Spain 61577. 30 2484
## 8 Switzerland 56444. 21 1841
## 9 Belgium 41196. 25 2031
## 10 Sweden 38378. 8 451
revenue_by_country %>%
slice_max(order_by = total_revenue, n = 10) %>%
ggplot(aes(x = reorder(Country, total_revenue),
y = total_revenue)) +
geom_col() +
coord_flip() +
labs(
title = "Top 10 Negara berdasarkan Revenue",
x = "Negara",
y = "Total Revenue"
)
# 5.4. Tren revenue harian
daily_revenue <- retail_clean %>%
group_by(InvoiceDateDate) %>%
summarise(
daily_revenue = sum(TotalPrice),
n_tx = n()
)
ggplot(daily_revenue, aes(x = InvoiceDateDate, y = daily_revenue)) +
geom_line() +
labs(
title = "Tren Revenue Harian",
x = "Tanggal",
y = "Revenue per Hari"
)
# 6. ANALISIS PRODUK: TOP SELLING ITEMS # 6.1. Produk dengan penjualan
tertinggi berdasarkan revenue
top_products <- retail_clean %>%
group_by(StockCode, Description) %>%
summarise(
total_revenue = sum(TotalPrice),
total_qty = sum(Quantity),
n_tx = n()
) %>%
arrange(desc(total_revenue))
## `summarise()` has grouped output by 'StockCode'. You can override using the
## `.groups` argument.
head(top_products, 10)
## # A tibble: 10 × 5
## # Groups: StockCode [10]
## StockCode Description total_revenue total_qty n_tx
## <chr> <chr> <dbl> <dbl> <int>
## 1 23843 PAPER CRAFT , LITTLE BIRDIE 168470. 80995 1
## 2 22423 REGENCY CAKESTAND 3 TIER 142593. 12402 1723
## 3 85123A WHITE HANGING HEART T-LIGHT HOLDER 100448. 36725 2028
## 4 85099B JUMBO BAG RED RETROSPOT 85221. 46181 1618
## 5 23166 MEDIUM CERAMIC TOP STORAGE JAR 81417. 77916 198
## 6 POST POSTAGE 77804. 3120 1099
## 7 47566 PARTY BUNTING 68844. 15291 1396
## 8 84879 ASSORTED COLOUR BIRD ORNAMENT 56580. 35362 1408
## 9 M Manual 53780. 7173 284
## 10 23084 RABBIT NIGHT LIGHT 51346. 27202 842
top_products %>%
slice_max(order_by = total_revenue, n = 10) %>%
ggplot(aes(x = reorder(Description, total_revenue),
y = total_revenue)) +
geom_col() +
coord_flip() +
labs(
title = "Top 10 Produk berdasarkan Revenue",
x = "Produk",
y = "Total Revenue"
)
# 7. ANALISIS CUSTOMER: RFM (RECENCY, FREQUENCY, MONETARY) # 7.1.
Tentukan “tanggal acuan” (tanggal terakhir di data)
max_date <- max(retail_clean$InvoiceDateDate, na.rm = TRUE)
max_date
## [1] "2011-12-09"
rfm <- retail_clean %>%
group_by(CustomerID) %>%
summarise(
recency = as.numeric(max_date - max(InvoiceDateDate)), # hari sejak transaksi terakhir
frequency = n_distinct(InvoiceNo), # berapa kali transaksi
monetary = sum(TotalPrice) # total uang dibelanjakan
)
head(rfm)
## # A tibble: 6 × 4
## CustomerID recency frequency monetary
## <dbl> <dbl> <int> <dbl>
## 1 12346 325 1 77184.
## 2 12347 2 7 4310
## 3 12348 75 4 1797.
## 4 12349 18 1 1758.
## 5 12350 310 1 334.
## 6 12352 36 8 2506.
summary(rfm)
## CustomerID recency frequency monetary
## Min. :12346 Min. : 0.00 Min. : 1.000 Min. : 3.75
## 1st Qu.:13813 1st Qu.: 17.00 1st Qu.: 1.000 1st Qu.: 307.42
## Median :15300 Median : 50.00 Median : 2.000 Median : 674.49
## Mean :15300 Mean : 92.06 Mean : 4.272 Mean : 2054.27
## 3rd Qu.:16779 3rd Qu.:141.75 3rd Qu.: 5.000 3rd Qu.: 1661.74
## Max. :18287 Max. :373.00 Max. :209.000 Max. :280206.02
rfm_segment <- rfm %>%
mutate(
R_score = ntile(-recency, 3), # recency semakin kecil => semakin bagus
F_score = ntile(frequency, 3),
M_score = ntile(monetary, 3),
RFM_score = R_score + F_score + M_score
)
head(rfm_segment)
## # A tibble: 6 × 8
## CustomerID recency frequency monetary R_score F_score M_score RFM_score
## <dbl> <dbl> <int> <dbl> <int> <int> <int> <int>
## 1 12346 325 1 77184. 1 1 3 5
## 2 12347 2 7 4310 3 3 3 9
## 3 12348 75 4 1797. 2 2 3 7
## 4 12349 18 1 1758. 3 1 3 7
## 5 12350 310 1 334. 1 1 1 3
## 6 12352 36 8 2506. 2 3 3 8
rfm_segment %>%
ggplot(aes(x = RFM_score)) +
geom_histogram(binwidth = 1, boundary = 0) +
labs(
title = "Distribusi Skor RFM Customer",
x = "Skor RFM (R+F+M)",
y = "Jumlah Customer"
)
# 8. CONTOH SEGMENTASI CUSTOMER BERDASARKAN RFM
rfm_segment <- rfm_segment %>%
mutate(
segment = case_when(
RFM_score >= 8 ~ "Best Customers",
RFM_score >= 5 ~ "Loyal / Potensial",
TRUE ~ "Perlu Perhatian"
)
)
rfm_segment %>%
group_by(segment) %>%
summarise(
n_customer = n(),
avg_monetary = mean(monetary),
avg_frequency = mean(frequency),
avg_recency = mean(recency)
)
## # A tibble: 3 × 5
## segment n_customer avg_monetary avg_frequency avg_recency
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Best Customers 1281 5336. 9.82 19.6
## 2 Loyal / Potensial 1812 966. 2.51 74.4
## 3 Perlu Perhatian 1245 261. 1.13 192.
customer_country <- retail_clean %>%
group_by(CustomerID, Country) %>%
summarise(
total_revenue = sum(TotalPrice),
.groups = "drop"
)
rfm_with_country <- rfm_segment %>%
left_join(customer_country, by = "CustomerID")
rfm_with_country %>%
group_by(Country, segment) %>%
summarise(
n_customer = n_distinct(CustomerID),
.groups = "drop"
) %>%
group_by(Country) %>%
mutate(
prop = n_customer / sum(n_customer)
) %>%
ungroup() %>%
slice_max(order_by = n_customer, n = 30) %>%
ggplot(aes(x = reorder(Country, n_customer),
y = n_customer,
fill = segment)) +
geom_col() +
coord_flip() +
labs(
title = "Distribusi Segment Customer per Negara",
x = "Negara",
y = "Jumlah Customer"
)