Analisis pada project ini dilakukan secara terstruktur mengikuti tahapan Data Mining/CRISP-DM, dimulai dari Business Understanding, Data Cleaning, Exploratory Data Analysis, Data Preparation, Modeling, hingga Model Evaluation. Setiap tahap memiliki tujuan yang jelas dan saling melengkapi untuk menghasilkan pemahaman mendalam terhadap pola transaksi dan faktor-faktor yang memengaruhi pendapatan (Revenue).
Analisis ini dirancang untuk menjawab dua fokus utama:
Tujuan analisis ini adalah mengetahui bagaimana kontribusi setiap produk terhadap pendapatan perusahaan. Hal ini dilakukan dengan:
• Mengidentifikasi produk yang menghasilkan pendapatan terbesar. • Menilai stabilitas atau fluktuasi jumlah pembelian setiap produk. • Mengetahui apakah terdapat produk tertentu yang menjadi top performer dan mendominasi total revenue.
Variabel yang dianalisis: • X (Fitur): StockCode, Description, Quantity, UnitPrice • Y (Target): Revenue (Quantity × UnitPrice)
Analisis ini bertujuan untuk memahami pola transaksi pelanggan dari sudut pandang waktu dan perilaku pembelian, yaitu:
• Mendeteksi tren transaksi berdasarkan hari, minggu, atau bulan. • Mengidentifikasi periode waktu dengan aktivitas transaksi tertinggi maupun terendah. • Melihat apakah terdapat pola musiman atau pola berulang dalam pembelian produk.
Variabel yang dianalisis: • X (Fitur): Quantity, UnitPrice, StockCode, InvoiceDate • Y (Target): Revenue per transaksi
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(forecast)
## Warning: package 'forecast' was built under R version 4.5.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
df <- read_excel("Online Retail.xlsx")
# Tampilkan 10 baris pertama
head(df, 10)
## # A tibble: 10 × 8
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536365 85123A WHITE HANGING HEA… 6 2010-12-01 08:26:00 2.55
## 2 536365 71053 WHITE METAL LANTE… 6 2010-12-01 08:26:00 3.39
## 3 536365 84406B CREAM CUPID HEART… 8 2010-12-01 08:26:00 2.75
## 4 536365 84029G KNITTED UNION FLA… 6 2010-12-01 08:26:00 3.39
## 5 536365 84029E RED WOOLLY HOTTIE… 6 2010-12-01 08:26:00 3.39
## 6 536365 22752 SET 7 BABUSHKA NE… 2 2010-12-01 08:26:00 7.65
## 7 536365 21730 GLASS STAR FROSTE… 6 2010-12-01 08:26:00 4.25
## 8 536366 22633 HAND WARMER UNION… 6 2010-12-01 08:28:00 1.85
## 9 536366 22632 HAND WARMER RED P… 6 2010-12-01 08:28:00 1.85
## 10 536367 84879 ASSORTED COLOUR B… 32 2010-12-01 08:34:00 1.69
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
# Cek missing values
colSums(is.na(df))
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## 0 0 1454 0 0 0
## CustomerID Country
## 135080 0
# Hapus NA
df <- df %>% drop_na()
# Hapus quantity & harga negatif (umum pada dataset OnlineRetail)
df <- df %>% filter(Quantity >= 0, UnitPrice >= 0)
# Hapus duplikasi
df <- df %>% distinct()
# Konversi tanggal
df$InvoiceDate <- as.POSIXct(df$InvoiceDate, format="%Y-%m-%d %H:%M:%S")
# Revenue = Qty * Price
df <- df %>% mutate(Revenue = Quantity * UnitPrice)
# Normalisasi numerik
num_cols <- c("Quantity", "UnitPrice", "Revenue")
df_norm <- df
df_norm[num_cols] <- scale(df[num_cols])
### 4.1 Statistik Deskriptif
summary(df[, c("Quantity", "UnitPrice", "Revenue")])
## Quantity UnitPrice Revenue
## Min. : 1.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 2.00 1st Qu.: 1.250 1st Qu.: 4.95
## Median : 6.00 Median : 1.950 Median : 12.39
## Mean : 13.15 Mean : 3.126 Mean : 22.63
## 3rd Qu.: 12.00 3rd Qu.: 3.750 3rd Qu.: 19.80
## Max. :80995.00 Max. :8142.750 Max. :168469.60
### 4.2 Distribusi Variabel
ggplot(df, aes(x = Revenue)) + geom_histogram(bins = 50) + ggtitle("Distribusi Revenue")
ggplot(df, aes(x = UnitPrice)) + geom_histogram(bins = 50) + ggtitle("Distribusi Unit Price")
ggplot(df, aes(x = Quantity)) + geom_histogram(bins = 50) + ggtitle("Distribusi Quantity")
### 4.3 Korelasi
cor_matrix <- cor(df[, num_cols])
corrplot(cor_matrix, method = "circle")
### 4.4 Segmentasi Berdasarkan Kategori
# Segmentasi per Negara
df %>%
group_by(Country) %>%
summarise(Total_Revenue = sum(Revenue)) %>%
arrange(desc(Total_Revenue)) %>%
head(10)
## # A tibble: 10 × 2
## Country Total_Revenue
## <chr> <dbl>
## 1 United Kingdom 7285025.
## 2 Netherlands 285446.
## 3 EIRE 265262.
## 4 Germany 228678.
## 5 France 208934.
## 6 Australia 138454.
## 7 Spain 61559.
## 8 Switzerland 56444.
## 9 Belgium 41196.
## 10 Sweden 38368.
# Segmentasi berdasarkan StockCode
df %>%
group_by(StockCode) %>%
summarise(Total_Quantity = sum(Quantity),
Total_Revenue = sum(Revenue)) %>%
arrange(desc(Total_Revenue)) %>%
head(10)
## # A tibble: 10 × 3
## StockCode Total_Quantity Total_Revenue
## <chr> <dbl> <dbl>
## 1 23843 80995 168470.
## 2 22423 12384 142265.
## 3 85123A 36763 100547.
## 4 85099B 46078 85041.
## 5 23166 77916 81417.
## 6 POST 3120 77804.
## 7 47566 15283 68785.
## 8 84879 35263 56413.
## 9 M 6939 53420.
## 10 23084 27153 51251.
# Segmentasi berdasarkan CustomerID
df %>%
group_by(CustomerID) %>%
summarise(Total_Revenue = sum(Revenue)) %>%
arrange(desc(Total_Revenue)) %>%
head(10)
## # A tibble: 10 × 2
## CustomerID Total_Revenue
## <dbl> <dbl>
## 1 14646 280206.
## 2 18102 259657.
## 3 17450 194391.
## 4 16446 168472.
## 5 14911 143711.
## 6 12415 124915.
## 7 14156 117210.
## 8 17511 91062.
## 9 16029 80851.
## 10 12346 77184.
### 5.1 Produk dengan revenue terbesar
top_product <- df %>%
group_by(StockCode, Description) %>%
summarise(Total_Revenue = sum(Revenue),
Total_Quantity = sum(Quantity)) %>%
arrange(desc(Total_Revenue))
## `summarise()` has grouped output by 'StockCode'. You can override using the
## `.groups` argument.
head(top_product, 10)
## # A tibble: 10 × 4
## # Groups: StockCode [10]
## StockCode Description Total_Revenue Total_Quantity
## <chr> <chr> <dbl> <dbl>
## 1 23843 PAPER CRAFT , LITTLE BIRDIE 168470. 80995
## 2 22423 REGENCY CAKESTAND 3 TIER 142265. 12384
## 3 85123A WHITE HANGING HEART T-LIGHT HOLDER 100392. 36706
## 4 85099B JUMBO BAG RED RETROSPOT 85041. 46078
## 5 23166 MEDIUM CERAMIC TOP STORAGE JAR 81417. 77916
## 6 POST POSTAGE 77804. 3120
## 7 47566 PARTY BUNTING 68785. 15283
## 8 84879 ASSORTED COLOUR BIRD ORNAMENT 56413. 35263
## 9 M Manual 53420. 6939
## 10 23084 RABBIT NIGHT LIGHT 51251. 27153
### 5.2 Analisis fluktuasi quantity untuk identifikasi stabil / tidak
product_fluctuation <- df %>%
group_by(StockCode) %>%
summarise(mean_qty = mean(Quantity),
sd_qty = sd(Quantity),
cv = sd_qty / mean_qty) %>% # coefficient of variation
arrange(desc(cv))
head(product_fluctuation, 10)
## # A tibble: 10 × 4
## StockCode mean_qty sd_qty cv
## <chr> <dbl> <dbl> <dbl>
## 1 23166 394. 5273. 13.4
## 2 22508 6.40 48.3 7.55
## 3 22413 10.4 76.3 7.34
## 4 23461 9.31 65.5 7.03
## 5 22476 10.4 67.7 6.52
## 6 23243 12.1 76.4 6.33
## 7 22920 17.5 111. 6.32
## 8 21175 17.2 109. 6.31
## 9 22350 15.2 93.7 6.15
## 10 21108 52.2 317. 6.07
### 6.1 Pola transaksi harian
daily_trend <- df %>%
mutate(Date = as.Date(InvoiceDate)) %>%
group_by(Date) %>%
summarise(Daily_Revenue = sum(Revenue))
ggplot(daily_trend, aes(Date, Daily_Revenue)) +
geom_line() +
ggtitle("Daily Revenue Trend")
### 6.2 Pola transaksi mingguan
weekly_trend <- df %>%
mutate(Week = floor_date(InvoiceDate, "week")) %>%
group_by(Week) %>%
summarise(Weekly_Revenue = sum(Revenue))
ggplot(weekly_trend, aes(Week, Weekly_Revenue)) +
geom_line() +
ggtitle("Weekly Revenue Trend")
### 6.3 Pola transaksi bulanan
monthly_trend <- df %>%
mutate(Month = floor_date(InvoiceDate, "month")) %>%
group_by(Month) %>%
summarise(Monthly_Revenue = sum(Revenue))
ggplot(monthly_trend, aes(Month, Monthly_Revenue)) +
geom_line() +
ggtitle("Monthly Revenue Trend")
### 6.4 Seasonality (Musiman)
ts_data <- ts(daily_trend$Daily_Revenue, frequency = 7) # mingguan
plot(decompose(ts_data))
model <- lm(Revenue ~ Quantity + UnitPrice, data = df)
summary(model)
##
## Call:
## lm(formula = Revenue ~ Quantity + UnitPrice, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38420 -3 0 5 42304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.610341 0.205134 -7.85 4.16e-15 ***
## Quantity 1.557685 0.001116 1396.07 < 2e-16 ***
## UnitPrice 1.199830 0.009110 131.71 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 127 on 392729 degrees of freedom
## Multiple R-squared: 0.8334, Adjusted R-squared: 0.8334
## F-statistic: 9.824e+05 on 2 and 392729 DF, p-value: < 2.2e-16
df$Predicted <- predict(model, df)
# Root Mean Squared Error
mape <- mean(abs((df$Revenue - df$Predicted) / df$Revenue)) * 100
rmse <- sqrt(mean((df$Revenue - df$Predicted)^2))
mape; rmse
## [1] Inf
## [1] 126.9703
pred_table <- df %>%
select(Revenue, Predicted) %>%
head(20)
print(pred_table)
## # A tibble: 20 × 2
## Revenue Predicted
## <dbl> <dbl>
## 1 15.3 10.8
## 2 20.3 11.8
## 3 22 14.2
## 4 20.3 11.8
## 5 20.3 11.8
## 6 15.3 10.7
## 7 25.5 12.8
## 8 11.1 9.96
## 9 11.1 9.96
## 10 54.1 50.3
## 11 12.6 10.3
## 12 12.6 10.3
## 13 30 15.4
## 14 9.9 9.72
## 15 25.5 12.8
## 16 14.8 9.00
## 17 19.9 13.4
## 18 17.8 10.2
## 19 17.8 10.2
## 20 31.8 14.2