Import Library

library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Import Dataset

sales <- read_excel("01_RETAIL_SALES.xlsx")
head(sales)
## # A tibble: 6 × 7
##   transaction_id date                product     quantity price channel store_id
##            <dbl> <dttm>              <chr>          <dbl> <dbl> <chr>   <chr>   
## 1              1 2023-10-01 00:00:00 Sports             7 5  e5 Online  S011    
## 2              2 2024-11-25 00:00:00 Food              10 1  e6 Online  S011    
## 3              3 2024-06-18 00:00:00 Home & Gar…        4 1.5e5 Online  S018    
## 4              4 2024-03-13 00:00:00 Beauty             2 2.5e5 <NA>    S018    
## 5              5 2023-04-24 00:00:00 Electronics        7 1.5e5 <NA>    S011    
## 6              6 2023-04-24 00:00:00 Beauty             6 1  e5 Online  S012

Struktur Dataset

str(sales)
## tibble [1,000 × 7] (S3: tbl_df/tbl/data.frame)
##  $ transaction_id: num [1:1000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ date          : POSIXct[1:1000], format: "2023-10-01" "2024-11-25" ...
##  $ product       : chr [1:1000] "Sports" "Food" "Home & Garden" "Beauty" ...
##  $ quantity      : num [1:1000] 7 10 4 2 7 6 3 6 6 5 ...
##  $ price         : num [1:1000] 500000 1000000 150000 250000 150000 100000 1000000 150000 100000 50000 ...
##  $ channel       : chr [1:1000] "Online" "Online" "Online" NA ...
##  $ store_id      : chr [1:1000] "S011" "S011" "S018" "S018" ...
summary(sales)
##  transaction_id        date                       product         
##  Min.   :   1.0   Min.   :2023-01-04 00:00:00   Length:1000       
##  1st Qu.: 250.8   1st Qu.:2023-06-22 00:00:00   Class :character  
##  Median : 500.5   Median :2023-12-29 00:00:00   Mode  :character  
##  Mean   : 500.5   Mean   :2023-12-24 08:42:43                     
##  3rd Qu.: 750.2   3rd Qu.:2024-06-27 06:00:00                     
##  Max.   :1000.0   Max.   :2024-12-30 00:00:00                     
##                                                                   
##     quantity          price           channel            store_id        
##  Min.   : 1.000   Min.   :  50000   Length:1000        Length:1000       
##  1st Qu.: 3.000   1st Qu.: 100000   Class :character   Class :character  
##  Median : 5.000   Median : 150000   Mode  :character   Mode  :character  
##  Mean   : 6.888   Mean   : 505000                                        
##  3rd Qu.: 8.000   3rd Qu.: 500000                                        
##  Max.   :97.000   Max.   :2000000                                        
##                   NA's   :30

Data Cleaning

-Mengecek Missing Value

colSums(is.na(sales))
## transaction_id           date        product       quantity          price 
##              0              0              0              0             30 
##        channel       store_id 
##             20              0

-Menampilkan Data yang Memiliki Missing Value

sales %>% filter(is.na(price) | is.na(channel))
## # A tibble: 50 × 7
##    transaction_id date                product   quantity  price channel store_id
##             <dbl> <dttm>              <chr>        <dbl>  <dbl> <chr>   <chr>   
##  1              4 2024-03-13 00:00:00 Beauty           2 250000 <NA>    S018    
##  2              5 2023-04-24 00:00:00 Electron…        7 150000 <NA>    S011    
##  3             49 2024-02-04 00:00:00 Sports           2     NA Online  S020    
##  4             77 2024-07-17 00:00:00 Electron…        9     NA Online  S017    
##  5             86 2023-08-26 00:00:00 Food             5 500000 <NA>    S012    
##  6             91 2023-03-29 00:00:00 Food             6 100000 <NA>    S010    
##  7            102 2024-04-09 00:00:00 Electron…        5     NA Offline S008    
##  8            124 2023-03-22 00:00:00 Electron…        1     NA Online  S003    
##  9            138 2024-05-28 00:00:00 electron…        9     NA Online  S008    
## 10            139 2023-09-23 00:00:00 Fashion          4     NA Offline S017    
## # ℹ 40 more rows

-Menangani Missing Value

Channel

sales$channel[is.na(sales$channel)] <- "Offline"

Price

sales$price[is.na(sales$price)] <- mean(sales$price, na.rm = TRUE)

-Cleaning Duplikasi Kategori Product

sales$product <- tools::toTitleCase(tolower(sales$product))

Mengecek Hasil Cleaning

unique(sales$product)
## [1] "Sports"        "Food"          "Home & Garden" "Beauty"       
## [5] "Electronics"   "Fashion"
table(sales$product)
## 
##        Beauty   Electronics       Fashion          Food Home & Garden 
##           174           167           176           163           149 
##        Sports 
##           171

Cek Ulang

colSums(is.na(sales))
## transaction_id           date        product       quantity          price 
##              0              0              0              0              0 
##        channel       store_id 
##              0              0
sum(duplicated(sales))
## [1] 0

Transformasi Data

sales <- sales %>%
  mutate(
    total_sales = quantity * price,
    month = month(date, label = TRUE),
    year = year(date)
  )

Insight Dataset

-Total Penjualan Keseluruhan

sum(sales$total_sales)
## [1] 3756170000

-Produk Paling Banyak Terjual

product_summary <- sales %>%
  group_by(product) %>%
  summarise(total_quantity = sum(quantity)) %>%
  arrange(desc(total_quantity))

product_summary
## # A tibble: 6 × 2
##   product       total_quantity
##   <chr>                  <dbl>
## 1 Beauty                  1398
## 2 Fashion                 1240
## 3 Home & Garden           1223
## 4 Electronics             1065
## 5 Sports                  1023
## 6 Food                     939

-Channel Penjualan Terbanyak

channel_summary <- sales %>%
  group_by(channel) %>%
  summarise(total_sales = sum(total_sales))

channel_summary
## # A tibble: 2 × 2
##   channel total_sales
##   <chr>         <dbl>
## 1 Offline  1333095000
## 2 Online   2423075000

-Store dengan Penjualan Tertinggi

store_summary <- sales %>%
  group_by(store_id) %>%
  summarise(total_sales = sum(total_sales)) %>%
  arrange(desc(total_sales))

head(store_summary)
## # A tibble: 6 × 2
##   store_id total_sales
##   <chr>          <dbl>
## 1 S016       371305000
## 2 S015       311400000
## 3 S007       309125000
## 4 S013       255465000
## 5 S009       250940000
## 6 S014       238110000

Visualisasi Data

-Bar Chart Produk Terlaris

ggplot(product_summary,
       aes(x = reorder(product, total_quantity),
           y = total_quantity,
           fill = product)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Produk Terlaris",
    x = "Produk",
    y = "Total Quantity"
  ) +
  theme_minimal()

Insight: Produk dengan jumlah pembelian tertinggi menunjukkan kategori yang paling diminati pelanggan.

-Pie Chart Channel Penjualan

ggplot(channel_summary,
       aes(x = "", y = total_sales, fill = channel)) +
  geom_col(width = 1) +
  coord_polar("y") +
  labs(title = "Persentase Penjualan Berdasarkan Channel") +
  theme_void()

Insight: Visualisasi ini menunjukkan kontribusi penjualan dari channel Online dan Offline.

-Line Chart Penjualan Bulanan

monthly_sales <- sales %>%
  group_by(month) %>%
  summarise(total_sales = sum(total_sales))

monthly_sales
## # A tibble: 12 × 2
##    month total_sales
##    <ord>       <dbl>
##  1 Jan     504360000
##  2 Feb     224805000
##  3 Mar     372330000
##  4 Apr     272750000
##  5 May     254480000
##  6 Jun     257615000
##  7 Jul     409795000
##  8 Aug     289410000
##  9 Sep     286980000
## 10 Oct     182720000
## 11 Nov     475990000
## 12 Dec     224935000
ggplot(monthly_sales,
       aes(x = month,
           y = total_sales,
           group = 1)) +
  geom_line(size = 1.2, color = "blue") +
  geom_point(size = 3, color = "red") +
  labs(
    title = "Trend Penjualan Bulanan",
    x = "Bulan",
    y = "Total Sales"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Insight: Grafik menunjukkan perubahan penjualan setiap bulan dan membantu mengetahui periode penjualan tertinggi.

-Boxplot Harga Produk

ggplot(sales,
       aes(x = product,
           y = price,
           fill = product)) +
  geom_boxplot() +
  labs(
    title = "Distribusi Harga Produk",
    x = "Produk",
    y = "Harga"
  ) +
  theme_minimal()

Insight: Boxplot digunakan untuk melihat persebaran harga dan mendeteksi outlier.

-Scatter Plot Quantity vs Total Sales

ggplot(sales,
       aes(x = quantity,
           y = total_sales,
           color = channel)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "Hubungan Quantity dan Total Sales",
    x = "Quantity",
    y = "Total Sales"
  ) +
  theme_minimal()

Insight: Semakin besar quantity pembelian maka total sales cenderung meningkat.

-Density Plot

ggplot(sales,
       aes(x = total_sales,
           fill = channel)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Density Plot Total Sales",
    x = "Total Sales"
  ) +
  theme_minimal()

Insight: Membandingkan distribusi sales Online vs Offline.
Menunjukkan pola penjualan tiap channel.

-Violin Plot

ggplot(sales,
       aes(x = channel,
           y = total_sales,
           fill = channel)) +
  geom_violin(trim = FALSE) +
  labs(
    title = "Distribusi Total Sales per Channel",
    x = "Channel",
    y = "Total Sales"
  ) +
  theme_minimal()

Insight: Menunjukkan distribusi data dan kepadatan transaksi.
Membandingkan pola transaksi tiap channel.

-produk berdasarkan channel.

ggplot(sales,
       aes(x = product,
           fill = channel)) +
  geom_bar() +
  labs(
    title = "Jumlah Produk Berdasarkan Channel",
    x = "Produk",
    y = "Jumlah"
  ) +
  theme_minimal()

Insight: Melihat produk lebih sering dibeli lewat Online atau Offline.

-Korelasi

menghitung korelasi dan mengubah matrix

correlation_data <- sales %>%
  select(quantity, price, total_sales)

correlation_matrix <- cor(correlation_data)

correlation_df <- as.data.frame(as.table(correlation_matrix))

colnames(correlation_df) <- c("Var1", "Var2", "Correlation")

correlation_df
##          Var1        Var2 Correlation
## 1    quantity    quantity  1.00000000
## 2       price    quantity  0.04212659
## 3 total_sales    quantity  0.60366996
## 4    quantity       price  0.04212659
## 5       price       price  1.00000000
## 6 total_sales       price  0.43304145
## 7    quantity total_sales  0.60366996
## 8       price total_sales  0.43304145
## 9 total_sales total_sales  1.00000000
ggplot(correlation_df,
       aes(x = Var1,
           y = Var2,
           fill = Correlation)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(Correlation, 2)),
            color = "black",
            size = 5) +
  scale_fill_gradient2(
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 0
  ) +
  labs(
    title = "Heatmap Korelasi Variabel"
  ) +
  theme_minimal()


Warna merah artinya korelasi positif kuat
Warna biru artinya korelasi negatif
Semakin mendekati 1 artinya hubungan semakin kuat
total_sales biasanya sangat berkorelasi dengan quantity

Kesimpulan

Berdasarkan hasil EDA:
1. Dataset memiliki sedikit missing value pada kolom channel.
2. Setelah cleaning, data siap digunakan untuk analisis.
3. Produk dengan quantity tertinggi menjadi kategori paling diminati.
4. Channel dengan total sales terbesar menunjukkan metode belanja favorit pelanggan.
5. Trend penjualan bulanan dapat digunakan untuk strategi bisnis dan promosi.
6. Visualisasi membantu memahami pola data dengan lebih mudah.