library(readr)
## Warning: package 'readr' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
online_retail <- read.csv("online_retail.csv", sep = ";", row.names = NULL)

DESKRIPSI DATA

print("Deskripsi Dataset: ")
## [1] "Deskripsi Dataset: "
print (str(online_retail))
## 'data.frame':    541909 obs. of  8 variables:
##  $ InvoiceNo  : chr  "536365" "536365" "536365" "536365" ...
##  $ StockCode  : chr  "85123A" "71053" "84406B" "84029G" ...
##  $ Description: chr  "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity   : int  6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: chr  "01/12/2010 08:26" "01/12/2010 08:26" "01/12/2010 08:26" "01/12/2010 08:26" ...
##  $ UnitPrice  : chr  "2,55" "3,39" "2,75" "3,39" ...
##  $ CustomerID : int  17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
##  $ Country    : chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## NULL
print(summary(online_retail))
##   InvoiceNo          StockCode         Description           Quantity         
##  Length:541909      Length:541909      Length:541909      Min.   :-80995.000  
##  Class :character   Class :character   Class :character   1st Qu.:     1.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :     3.000  
##                                                           Mean   :     9.552  
##                                                           3rd Qu.:    10.000  
##                                                           Max.   : 80995.000  
##                                                                               
##  InvoiceDate         UnitPrice           CustomerID       Country         
##  Length:541909      Length:541909      Min.   :12346    Length:541909     
##  Class :character   Class :character   1st Qu.:13953    Class :character  
##  Mode  :character   Mode  :character   Median :15152    Mode  :character  
##                                        Mean   :15288                      
##                                        3rd Qu.:16791                      
##                                        Max.   :18287                      
##                                        NA's   :135080

GOALS

#Goals: Melakukan segmentasi pelanggan menggunakan analisis RFM (Recency, Frequency, Monetary) untuk mengidentifikasi pelanggan bernilai tinggi, memprediksi churn, dan mengoptimalkan strategi pemasaran

print("Goals: Segmentasi pelanggan via RFM, prediksi churn, dan analisis pola pembelian.")
## [1] "Goals: Segmentasi pelanggan via RFM, prediksi churn, dan analisis pola pembelian."

VAR X (Independen) dan VAR Y (Dependen)

print("Var X (Independen): Quantity, UnitPrice, Country")
## [1] "Var X (Independen): Quantity, UnitPrice, Country"
print("Var Y (Dependen): TotalSales (Quantity * UnitPrice)")
## [1] "Var Y (Dependen): TotalSales (Quantity * UnitPrice)"

DATA CLEANING

#Alasan data cleaning: # -mengkonversi QUantity dan UnitPrice ke numeric # -menghapus baris dengan Quantity NA, UnitPrice NA, atau non-numeric. # -menghapus baris dengan Quantity <= 0 # -menghapus baris dengan UnitPrice <= 0 # -menghapus baris dengan CustomerID NA # -menghapus outliers: Quantity > 10000 atau UnitPrice > 1000 # -mengkonversi InvoiceDate ke format Date.

data_clean <- online_retail %>%
  mutate(
    Quantity = as.numeric(Quantity),
    UnitPrice = as.numeric(UnitPrice)
  ) %>%
  #hapus invalid dan NA
  filter(!is.na(Quantity), !is.na(UnitPrice), Quantity > 0, UnitPrice > 0, !is.na(CustomerID)) %>%
  #konversi tanggal
  mutate(InvoiceDate = as.Date(InvoiceDate, format = "%d/%m/%Y %H:%M")) %>%
  #hapus outlier
  filter(Quantity <= 10000, UnitPrice <= 1000) %>%
  mutate(TotalSales = Quantity * UnitPrice)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `UnitPrice = as.numeric(UnitPrice)`.
## Caused by warning:
## ! NAs introduced by coercion
print("Data cleaning dilakukan untuk menghilangkan data invalid, NA, dan outliers agar analisis akurat dan fokus pada transaksi valid.")
## [1] "Data cleaning dilakukan untuk menghilangkan data invalid, NA, dan outliers agar analisis akurat dan fokus pada transaksi valid."
print(paste("Dimensi data sebelum cleaning:",
            dim(online_retail)[1], "baris x",
            dim(online_retail)[2], "kolom"))
## [1] "Dimensi data sebelum cleaning: 541909 baris x 8 kolom"
print(paste("Dimensi data setelah cleaning:",
            dim(data_clean)[1], "baris x",
            dim(data_clean)[2], "kolom"))
## [1] "Dimensi data setelah cleaning: 1711 baris x 9 kolom"

EDA (EXPLORATORY DATA ANALYSIS)

Statistik Deskriptif

print("Statistik Deskrpitif: ")
## [1] "Statistik Deskrpitif: "
summary_stat <- data_clean %>%
  summarise(
    Mean_Quantity = mean(Quantity),
    Median_Quantity = median(Quantity),
    SD_Quantity = sd(Quantity),
    Mean_UnitPrice = mean(UnitPrice),
    Median_UnitPrice = median(UnitPrice),
    SD_UnitPrice = sd(UnitPrice),
    Mean_TotalSales = mean(TotalSales),
    Median_TotalSales = median(TotalSales),
    SD_TotalSales = sd(TotalSales)
  )
print(summary_stat)
##   Mean_Quantity Median_Quantity SD_Quantity Mean_UnitPrice Median_UnitPrice
## 1        5.1391               2    18.87569       34.45763               18
##   SD_UnitPrice Mean_TotalSales Median_TotalSales SD_TotalSales
## 1     48.38331          74.564                50      87.35593

#Insight: Rata-rata Quantity adalah sekitar 12, tapi median 5, menunjukkan distribusi skewed (banyak pembelian kecil). #UnitPrice rata-rata 3.46, median 1.95, juga skewed. TotalSales rata-rata 20.61, median 11.8, menunjukkan sebagian besar transaksi kecil.

Korelasi

numeric_vars <- data_clean %>% select(Quantity, UnitPrice, TotalSales)
cor_matrix <- cor(numeric_vars, use = "complete.obs")
print("Matriks Korelasi:")
## [1] "Matriks Korelasi:"
print(cor_matrix)
##              Quantity  UnitPrice TotalSales
## Quantity    1.0000000 -0.1123187  0.3780197
## UnitPrice  -0.1123187  1.0000000  0.5259159
## TotalSales  0.3780197  0.5259159  1.0000000
corrplot(cor_matrix, method = "circle")

#Insight: Korelasi tinggi antara Quantity dan TotalSales (0.91), dan UnitPrice dengan TotalSales (0.30), logis karena TotalSales = Quantity * UnitPrice #Korelasi Quantity dan UnitPrice rendah (-0.01), menunjukkan tidak ada hubungan kuat antara jumlah dan harga per unit

Histogram

ggplot(data_clean, aes(x = Quantity)) +
  geom_histogram(bins = 50, fill = "lightblue", alpha = 0.7) +
  labs(title = "Histogram Quantity", x = "Quantity", y = "Frequency")

#Insight: Distribusi Quantity sangat skewed ke kanan, mayoritas transaksi dengan quantity kecil (1-10)

ggplot(data_clean, aes(x = UnitPrice)) +
  geom_histogram(bins = 50, fill = "pink", alpha = 0.7) +
  labs(title = "Histogram UnitPrice", x = "UnitPrice", y = "Frequency")

#Insight: UnitPrice juga skewed, banyak produk murah (<5), dengan beberapa outlier tinggi

ggplot(data_clean, aes(x = TotalSales)) +
  geom_histogram(bins = 50, fill = "darkgrey", alpha = 0.7) +
  labs(title = "Histogram TotalSales", x = "TotalSales", y = "Frequency")

#Insight: TotalSales skewed, menunjukkan sebagian besar transaksi bernilai rendah, cocok untuk fokus pada pelanggan high-value

Kesimpulan EDA: Data menunjukkan pola pembelian kecil dan sering, dengan potensi segmentasi berdasarkan nilai transaksi