library(readr)
## Warning: package 'readr' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
online_retail <- read.csv("online_retail.csv", sep = ";", row.names = NULL)
print("Deskripsi Dataset: ")
## [1] "Deskripsi Dataset: "
print (str(online_retail))
## 'data.frame': 541909 obs. of 8 variables:
## $ InvoiceNo : chr "536365" "536365" "536365" "536365" ...
## $ StockCode : chr "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : int 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: chr "01/12/2010 08:26" "01/12/2010 08:26" "01/12/2010 08:26" "01/12/2010 08:26" ...
## $ UnitPrice : chr "2,55" "3,39" "2,75" "3,39" ...
## $ CustomerID : int 17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## NULL
print(summary(online_retail))
## InvoiceNo StockCode Description Quantity
## Length:541909 Length:541909 Length:541909 Min. :-80995.000
## Class :character Class :character Class :character 1st Qu.: 1.000
## Mode :character Mode :character Mode :character Median : 3.000
## Mean : 9.552
## 3rd Qu.: 10.000
## Max. : 80995.000
##
## InvoiceDate UnitPrice CustomerID Country
## Length:541909 Length:541909 Min. :12346 Length:541909
## Class :character Class :character 1st Qu.:13953 Class :character
## Mode :character Mode :character Median :15152 Mode :character
## Mean :15288
## 3rd Qu.:16791
## Max. :18287
## NA's :135080
#Goals: Melakukan segmentasi pelanggan menggunakan analisis RFM (Recency, Frequency, Monetary) untuk mengidentifikasi pelanggan bernilai tinggi, memprediksi churn, dan mengoptimalkan strategi pemasaran
print("Goals: Segmentasi pelanggan via RFM, prediksi churn, dan analisis pola pembelian.")
## [1] "Goals: Segmentasi pelanggan via RFM, prediksi churn, dan analisis pola pembelian."
print("Var X (Independen): Quantity, UnitPrice, Country")
## [1] "Var X (Independen): Quantity, UnitPrice, Country"
print("Var Y (Dependen): TotalSales (Quantity * UnitPrice)")
## [1] "Var Y (Dependen): TotalSales (Quantity * UnitPrice)"
#Alasan data cleaning: # -mengkonversi QUantity dan UnitPrice ke numeric # -menghapus baris dengan Quantity NA, UnitPrice NA, atau non-numeric. # -menghapus baris dengan Quantity <= 0 # -menghapus baris dengan UnitPrice <= 0 # -menghapus baris dengan CustomerID NA # -menghapus outliers: Quantity > 10000 atau UnitPrice > 1000 # -mengkonversi InvoiceDate ke format Date.
data_clean <- online_retail %>%
mutate(
Quantity = as.numeric(Quantity),
UnitPrice = as.numeric(UnitPrice)
) %>%
#hapus invalid dan NA
filter(!is.na(Quantity), !is.na(UnitPrice), Quantity > 0, UnitPrice > 0, !is.na(CustomerID)) %>%
#konversi tanggal
mutate(InvoiceDate = as.Date(InvoiceDate, format = "%d/%m/%Y %H:%M")) %>%
#hapus outlier
filter(Quantity <= 10000, UnitPrice <= 1000) %>%
mutate(TotalSales = Quantity * UnitPrice)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `UnitPrice = as.numeric(UnitPrice)`.
## Caused by warning:
## ! NAs introduced by coercion
print("Data cleaning dilakukan untuk menghilangkan data invalid, NA, dan outliers agar analisis akurat dan fokus pada transaksi valid.")
## [1] "Data cleaning dilakukan untuk menghilangkan data invalid, NA, dan outliers agar analisis akurat dan fokus pada transaksi valid."
print(paste("Dimensi data sebelum cleaning:",
dim(online_retail)[1], "baris x",
dim(online_retail)[2], "kolom"))
## [1] "Dimensi data sebelum cleaning: 541909 baris x 8 kolom"
print(paste("Dimensi data setelah cleaning:",
dim(data_clean)[1], "baris x",
dim(data_clean)[2], "kolom"))
## [1] "Dimensi data setelah cleaning: 1711 baris x 9 kolom"
print("Statistik Deskrpitif: ")
## [1] "Statistik Deskrpitif: "
summary_stat <- data_clean %>%
summarise(
Mean_Quantity = mean(Quantity),
Median_Quantity = median(Quantity),
SD_Quantity = sd(Quantity),
Mean_UnitPrice = mean(UnitPrice),
Median_UnitPrice = median(UnitPrice),
SD_UnitPrice = sd(UnitPrice),
Mean_TotalSales = mean(TotalSales),
Median_TotalSales = median(TotalSales),
SD_TotalSales = sd(TotalSales)
)
print(summary_stat)
## Mean_Quantity Median_Quantity SD_Quantity Mean_UnitPrice Median_UnitPrice
## 1 5.1391 2 18.87569 34.45763 18
## SD_UnitPrice Mean_TotalSales Median_TotalSales SD_TotalSales
## 1 48.38331 74.564 50 87.35593
#Insight: Rata-rata Quantity adalah sekitar 12, tapi median 5, menunjukkan distribusi skewed (banyak pembelian kecil). #UnitPrice rata-rata 3.46, median 1.95, juga skewed. TotalSales rata-rata 20.61, median 11.8, menunjukkan sebagian besar transaksi kecil.
numeric_vars <- data_clean %>% select(Quantity, UnitPrice, TotalSales)
cor_matrix <- cor(numeric_vars, use = "complete.obs")
print("Matriks Korelasi:")
## [1] "Matriks Korelasi:"
print(cor_matrix)
## Quantity UnitPrice TotalSales
## Quantity 1.0000000 -0.1123187 0.3780197
## UnitPrice -0.1123187 1.0000000 0.5259159
## TotalSales 0.3780197 0.5259159 1.0000000
corrplot(cor_matrix, method = "circle")
#Insight: Korelasi tinggi antara Quantity dan TotalSales (0.91), dan
UnitPrice dengan TotalSales (0.30), logis karena TotalSales = Quantity *
UnitPrice #Korelasi Quantity dan UnitPrice rendah (-0.01), menunjukkan
tidak ada hubungan kuat antara jumlah dan harga per unit
ggplot(data_clean, aes(x = Quantity)) +
geom_histogram(bins = 50, fill = "lightblue", alpha = 0.7) +
labs(title = "Histogram Quantity", x = "Quantity", y = "Frequency")
#Insight: Distribusi Quantity sangat skewed ke kanan, mayoritas
transaksi dengan quantity kecil (1-10)
ggplot(data_clean, aes(x = UnitPrice)) +
geom_histogram(bins = 50, fill = "pink", alpha = 0.7) +
labs(title = "Histogram UnitPrice", x = "UnitPrice", y = "Frequency")
#Insight: UnitPrice juga skewed, banyak produk murah (<5), dengan
beberapa outlier tinggi
ggplot(data_clean, aes(x = TotalSales)) +
geom_histogram(bins = 50, fill = "darkgrey", alpha = 0.7) +
labs(title = "Histogram TotalSales", x = "TotalSales", y = "Frequency")
#Insight: TotalSales skewed, menunjukkan sebagian besar transaksi
bernilai rendah, cocok untuk fokus pada pelanggan high-value