library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
data <- read_excel("Online_Retail.xlsx")
data
## # A tibble: 541,909 × 8
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536365 85123A WHITE HANGING HEA… 6 2010-12-01 08:26:00 2.55
## 2 536365 71053 WHITE METAL LANTE… 6 2010-12-01 08:26:00 3.39
## 3 536365 84406B CREAM CUPID HEART… 8 2010-12-01 08:26:00 2.75
## 4 536365 84029G KNITTED UNION FLA… 6 2010-12-01 08:26:00 3.39
## 5 536365 84029E RED WOOLLY HOTTIE… 6 2010-12-01 08:26:00 3.39
## 6 536365 22752 SET 7 BABUSHKA NE… 2 2010-12-01 08:26:00 7.65
## 7 536365 21730 GLASS STAR FROSTE… 6 2010-12-01 08:26:00 4.25
## 8 536366 22633 HAND WARMER UNION… 6 2010-12-01 08:28:00 1.85
## 9 536366 22632 HAND WARMER RED P… 6 2010-12-01 08:28:00 1.85
## 10 536367 84879 ASSORTED COLOUR B… 32 2010-12-01 08:34:00 1.69
## # ℹ 541,899 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
data <- data %>%
mutate(TotalPrice = Quantity * UnitPrice)
data
## # A tibble: 541,909 × 9
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536365 85123A WHITE HANGING HEA… 6 2010-12-01 08:26:00 2.55
## 2 536365 71053 WHITE METAL LANTE… 6 2010-12-01 08:26:00 3.39
## 3 536365 84406B CREAM CUPID HEART… 8 2010-12-01 08:26:00 2.75
## 4 536365 84029G KNITTED UNION FLA… 6 2010-12-01 08:26:00 3.39
## 5 536365 84029E RED WOOLLY HOTTIE… 6 2010-12-01 08:26:00 3.39
## 6 536365 22752 SET 7 BABUSHKA NE… 2 2010-12-01 08:26:00 7.65
## 7 536365 21730 GLASS STAR FROSTE… 6 2010-12-01 08:26:00 4.25
## 8 536366 22633 HAND WARMER UNION… 6 2010-12-01 08:28:00 1.85
## 9 536366 22632 HAND WARMER RED P… 6 2010-12-01 08:28:00 1.85
## 10 536367 84879 ASSORTED COLOUR B… 32 2010-12-01 08:34:00 1.69
## # ℹ 541,899 more rows
## # ℹ 3 more variables: CustomerID <dbl>, Country <chr>, TotalPrice <dbl>
data <- distinct(data)
sum(is.na(data))
## [1] 136491
data <- data %>% filter(!is.na(CustomerID))
summary(select(data, Quantity, UnitPrice, TotalPrice))
## Quantity UnitPrice TotalPrice
## Min. :-80995.00 Min. : 0.000 Min. :-168469.60
## 1st Qu.: 2.00 1st Qu.: 1.250 1st Qu.: 4.25
## Median : 5.00 Median : 1.950 Median : 11.70
## Mean : 12.18 Mean : 3.474 Mean : 20.61
## 3rd Qu.: 12.00 3rd Qu.: 3.750 3rd Qu.: 19.80
## Max. : 80995.00 Max. :38970.000 Max. : 168469.60
library(scales)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.5.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
data %>%
count(Country, sort = TRUE) %>%
head(10) %>%
ggplot(aes(x = reorder(Country, n), y = n)) +
geom_col(fill = "tomato") +
coord_flip() + scale_y_continuous(labels = comma) +
labs(title = "Top 10 Negara dengan Transaksi Terbanyak",
x = "Negara", y = "Jumlah Transaksi")
data %>%
group_by(CustomerID) %>%
summarise(TotalSpending = sum(TotalPrice)) %>%
summarise(AvgSpending = mean(TotalSpending))
## # A tibble: 1 × 1
## AvgSpending
## <dbl>
## 1 1894.
Penjualan naik tajam menjelang akhir tahun (musim liburan).
monthly_sales <- data %>%
mutate(Month = floor_date(InvoiceDate, "month")) %>%
group_by(Month) %>%
summarise(TotalSales = sum(TotalPrice)) %>%
filter(Month < as.Date("2011-12-01")) # buang Desember karena data tidak lengkap
ggplot(monthly_sales, aes(x = Month, y = TotalSales)) +
geom_line(color = "#0072B2", linewidth = 1) +
geom_point(color = "#0072B2", size = 2) +
scale_y_continuous(labels = scales::comma) +
labs(title = "Tren Penjualan Bulanan",
x = "Bulan", y = "Total Penjualan (£)") +
theme_minimal(base_size = 12)
Top 10 barang yang diretur (potensi masalah).
data <- data %>%
mutate(IsCancelled = grepl("^C", InvoiceNo))
returned_products <- data %>%
filter(IsCancelled == TRUE) %>%
group_by(Description) %>%
summarise(JumlahRetur = n(),
TotalReturValue = sum(abs(Quantity * UnitPrice), na.rm = TRUE)) %>%
arrange(desc(JumlahRetur)) %>%
head(10)
ggplot(returned_products, aes(x = reorder(Description, JumlahRetur), y = JumlahRetur)) +
geom_col(fill = "#FF6F61") +
geom_text(aes(label = JumlahRetur),
hjust = -0.1, color = "black", size = 3.5) +
coord_flip() +
scale_y_continuous(labels = comma) +
labs(
title = "Top 10 Produk dengan Jumlah Retur Terbanyak",
x = "Nama Produk",
y = "Jumlah Retur"
) +
theme_minimal(base_size = 12)
10 pelanggan teratas menyumbang >40% revenue total.
top_customers <- data %>%
group_by(CustomerID) %>%
summarise(TotalSpending = sum(TotalPrice)) %>%
arrange(desc(TotalSpending)) %>%
head(10)
ggplot(top_customers, aes(x = reorder(as.factor(CustomerID), TotalSpending),
y = TotalSpending)) +
geom_col(fill = "goldenrod") +
coord_flip() + scale_y_continuous(labels = comma) +
labs(title = "Top 10 Pelanggan Berdasarkan Total Belanja",
x = "Customer ID", y = "Total Pembelian (£)")
Produk seperti “WHITE HANGING HEART T-LIGHT HOLDER” dan “REGENCY
CAKESTAND 3 TIER” paling laku.
top_products <- data %>%
group_by(Description) %>%
summarise(TotalSales = sum(TotalPrice)) %>%
arrange(desc(TotalSales)) %>%
head(10)
ggplot(top_products, aes(x = reorder(Description, TotalSales), y = TotalSales)) +
geom_col(fill = "steelblue") +
coord_flip() + scale_y_continuous(labels = comma) +
labs(title = "Top 10 Produk Berdasarkan Penjualan",
x = "Produk", y = "Total Penjualan (£)")
data <- data %>% mutate(Day = wday(InvoiceDate, label = TRUE))
trans_day <- data %>%
group_by(Day) %>%
summarise(JumlahTransaksi = n())
ggplot(trans_day, aes(x = Day, y = JumlahTransaksi)) +
geom_col(fill = "#009E73") +
geom_text(aes(label = JumlahTransaksi), vjust = -0.3) +
scale_y_continuous(labels = comma) +
labs(title = "Jumlah Transaksi per Hari", x = "Hari", y = "Jumlah Transaksi") +
theme_minimal(base_size = 12)
data <- data %>%
mutate(Hour = hour(InvoiceDate))
trans_hour <- data %>%
group_by(Hour) %>%
summarise(JumlahTransaksi = n())
ggplot(trans_hour, aes(x = Hour, y = JumlahTransaksi)) +
geom_col(fill = "#0073C2FF") +
geom_text(aes(label = JumlahTransaksi), vjust = -0.3, size = 3) +
scale_x_continuous(breaks = 0:23) +
scale_y_continuous(labels = comma) +
labs(
title = "Distribusi Transaksi per Jam",
subtitle = "Mayoritas transaksi terjadi di jam kerja (09.00–17.00)",
x = "Jam Transaksi", y = "Jumlah Transaksi"
) +
theme_minimal(base_size = 12)
data_filtered <- data %>%
filter(!is.na(UnitPrice), UnitPrice > 0, UnitPrice <= 50)
ggplot(data_filtered, aes(x = UnitPrice)) +
geom_histogram(bins = 60, fill = "#E69F00", color = "white") +
scale_x_continuous(labels = comma) +
labs(
title = "Distribusi Harga Produk",
subtitle = "Mayoritas harga produk di bawah £5",
x = "Harga per Unit (£)", y = "Jumlah Produk"
) +
theme_minimal(base_size = 12)