R Markdown

library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
data <- read_excel("Online_Retail.xlsx")
data
## # A tibble: 541,909 × 8
##    InvoiceNo StockCode Description        Quantity InvoiceDate         UnitPrice
##    <chr>     <chr>     <chr>                 <dbl> <dttm>                  <dbl>
##  1 536365    85123A    WHITE HANGING HEA…        6 2010-12-01 08:26:00      2.55
##  2 536365    71053     WHITE METAL LANTE…        6 2010-12-01 08:26:00      3.39
##  3 536365    84406B    CREAM CUPID HEART…        8 2010-12-01 08:26:00      2.75
##  4 536365    84029G    KNITTED UNION FLA…        6 2010-12-01 08:26:00      3.39
##  5 536365    84029E    RED WOOLLY HOTTIE…        6 2010-12-01 08:26:00      3.39
##  6 536365    22752     SET 7 BABUSHKA NE…        2 2010-12-01 08:26:00      7.65
##  7 536365    21730     GLASS STAR FROSTE…        6 2010-12-01 08:26:00      4.25
##  8 536366    22633     HAND WARMER UNION…        6 2010-12-01 08:28:00      1.85
##  9 536366    22632     HAND WARMER RED P…        6 2010-12-01 08:28:00      1.85
## 10 536367    84879     ASSORTED COLOUR B…       32 2010-12-01 08:34:00      1.69
## # ℹ 541,899 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
data <- data %>%
mutate(TotalPrice = Quantity * UnitPrice)
data
## # A tibble: 541,909 × 9
##    InvoiceNo StockCode Description        Quantity InvoiceDate         UnitPrice
##    <chr>     <chr>     <chr>                 <dbl> <dttm>                  <dbl>
##  1 536365    85123A    WHITE HANGING HEA…        6 2010-12-01 08:26:00      2.55
##  2 536365    71053     WHITE METAL LANTE…        6 2010-12-01 08:26:00      3.39
##  3 536365    84406B    CREAM CUPID HEART…        8 2010-12-01 08:26:00      2.75
##  4 536365    84029G    KNITTED UNION FLA…        6 2010-12-01 08:26:00      3.39
##  5 536365    84029E    RED WOOLLY HOTTIE…        6 2010-12-01 08:26:00      3.39
##  6 536365    22752     SET 7 BABUSHKA NE…        2 2010-12-01 08:26:00      7.65
##  7 536365    21730     GLASS STAR FROSTE…        6 2010-12-01 08:26:00      4.25
##  8 536366    22633     HAND WARMER UNION…        6 2010-12-01 08:28:00      1.85
##  9 536366    22632     HAND WARMER RED P…        6 2010-12-01 08:28:00      1.85
## 10 536367    84879     ASSORTED COLOUR B…       32 2010-12-01 08:34:00      1.69
## # ℹ 541,899 more rows
## # ℹ 3 more variables: CustomerID <dbl>, Country <chr>, TotalPrice <dbl>
data <- distinct(data)
sum(is.na(data))
## [1] 136491
data <- data %>% filter(!is.na(CustomerID))
summary(select(data, Quantity, UnitPrice, TotalPrice))
##     Quantity           UnitPrice           TotalPrice        
##  Min.   :-80995.00   Min.   :    0.000   Min.   :-168469.60  
##  1st Qu.:     2.00   1st Qu.:    1.250   1st Qu.:      4.25  
##  Median :     5.00   Median :    1.950   Median :     11.70  
##  Mean   :    12.18   Mean   :    3.474   Mean   :     20.61  
##  3rd Qu.:    12.00   3rd Qu.:    3.750   3rd Qu.:     19.80  
##  Max.   : 80995.00   Max.   :38970.000   Max.   : 168469.60
library(scales)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.5.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
data %>%
  count(Country, sort = TRUE) %>%
  head(10) %>%
  ggplot(aes(x = reorder(Country, n), y = n)) +
  geom_col(fill = "tomato") +
  coord_flip() + scale_y_continuous(labels = comma) +
  labs(title = "Top 10 Negara dengan Transaksi Terbanyak",
       x = "Negara", y = "Jumlah Transaksi")

data %>%
  group_by(CustomerID) %>%
  summarise(TotalSpending = sum(TotalPrice)) %>%
  summarise(AvgSpending = mean(TotalSpending))
## # A tibble: 1 × 1
##   AvgSpending
##         <dbl>
## 1       1894.

Penjualan naik tajam menjelang akhir tahun (musim liburan).

monthly_sales <- data %>%
  mutate(Month = floor_date(InvoiceDate, "month")) %>%
  group_by(Month) %>%
  summarise(TotalSales = sum(TotalPrice)) %>%
  filter(Month < as.Date("2011-12-01"))  # buang Desember karena data tidak lengkap

ggplot(monthly_sales, aes(x = Month, y = TotalSales)) +
  geom_line(color = "#0072B2", linewidth = 1) +
  geom_point(color = "#0072B2", size = 2) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Tren Penjualan Bulanan",
       x = "Bulan", y = "Total Penjualan (£)") +
  theme_minimal(base_size = 12)

Top 10 barang yang diretur (potensi masalah).

data <- data %>%
  mutate(IsCancelled = grepl("^C", InvoiceNo))

returned_products <- data %>%
  filter(IsCancelled == TRUE) %>%
  group_by(Description) %>%
  summarise(JumlahRetur = n(),
            TotalReturValue = sum(abs(Quantity * UnitPrice), na.rm = TRUE)) %>%
  arrange(desc(JumlahRetur)) %>%
  head(10) 

ggplot(returned_products, aes(x = reorder(Description, JumlahRetur), y = JumlahRetur)) +
  geom_col(fill = "#FF6F61") +
  geom_text(aes(label = JumlahRetur),
            hjust = -0.1, color = "black", size = 3.5) +
  coord_flip() +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Top 10 Produk dengan Jumlah Retur Terbanyak",
    x = "Nama Produk",
    y = "Jumlah Retur"
  ) +
  theme_minimal(base_size = 12)

10 pelanggan teratas menyumbang >40% revenue total.

top_customers <- data %>%
  group_by(CustomerID) %>%
  summarise(TotalSpending = sum(TotalPrice)) %>%
  arrange(desc(TotalSpending)) %>%
  head(10)

ggplot(top_customers, aes(x = reorder(as.factor(CustomerID), TotalSpending),
                          y = TotalSpending)) +
  geom_col(fill = "goldenrod") +
  coord_flip() + scale_y_continuous(labels = comma) +
  labs(title = "Top 10 Pelanggan Berdasarkan Total Belanja",
       x = "Customer ID", y = "Total Pembelian (£)")

Produk seperti “WHITE HANGING HEART T-LIGHT HOLDER” dan “REGENCY CAKESTAND 3 TIER” paling laku.

top_products <- data %>%
  group_by(Description) %>%
  summarise(TotalSales = sum(TotalPrice)) %>%
  arrange(desc(TotalSales)) %>%
  head(10)

ggplot(top_products, aes(x = reorder(Description, TotalSales), y = TotalSales)) +
  geom_col(fill = "steelblue") +
  coord_flip() + scale_y_continuous(labels = comma) +
  labs(title = "Top 10 Produk Berdasarkan Penjualan",
       x = "Produk", y = "Total Penjualan (£)")

data <- data %>% mutate(Day = wday(InvoiceDate, label = TRUE))

trans_day <- data %>%
  group_by(Day) %>%
  summarise(JumlahTransaksi = n())

ggplot(trans_day, aes(x = Day, y = JumlahTransaksi)) +
  geom_col(fill = "#009E73") +
  geom_text(aes(label = JumlahTransaksi), vjust = -0.3) +
  scale_y_continuous(labels = comma) +
  labs(title = "Jumlah Transaksi per Hari", x = "Hari", y = "Jumlah Transaksi") +
  theme_minimal(base_size = 12)

data <- data %>%
  mutate(Hour = hour(InvoiceDate))

trans_hour <- data %>%
  group_by(Hour) %>%
  summarise(JumlahTransaksi = n())

ggplot(trans_hour, aes(x = Hour, y = JumlahTransaksi)) +
  geom_col(fill = "#0073C2FF") +
  geom_text(aes(label = JumlahTransaksi), vjust = -0.3, size = 3) +
  scale_x_continuous(breaks = 0:23) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Distribusi Transaksi per Jam",
    subtitle = "Mayoritas transaksi terjadi di jam kerja (09.00–17.00)",
    x = "Jam Transaksi", y = "Jumlah Transaksi"
  ) +
  theme_minimal(base_size = 12)

data_filtered <- data %>%
  filter(!is.na(UnitPrice), UnitPrice > 0, UnitPrice <= 50)

ggplot(data_filtered, aes(x = UnitPrice)) +
  geom_histogram(bins = 60, fill = "#E69F00", color = "white") +
  scale_x_continuous(labels = comma) +
  labs(
    title = "Distribusi Harga Produk",
    subtitle = "Mayoritas harga produk di bawah £5",
    x = "Harga per Unit (£)", y = "Jumlah Produk"
  ) +
  theme_minimal(base_size = 12)