library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.5.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
retail <- read_excel("01_RETAIL_SALES.xlsx")
str(retail)
## tibble [1,000 × 7] (S3: tbl_df/tbl/data.frame)
##  $ transaction_id: num [1:1000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ date          : POSIXct[1:1000], format: "2023-10-01" "2024-11-25" ...
##  $ product       : chr [1:1000] "Sports" "Food" "Home & Garden" "Beauty" ...
##  $ quantity      : num [1:1000] 7 10 4 2 7 6 3 6 6 5 ...
##  $ price         : num [1:1000] 500000 1000000 150000 250000 150000 100000 1000000 150000 100000 50000 ...
##  $ channel       : chr [1:1000] "Online" "Online" "Online" NA ...
##  $ store_id      : chr [1:1000] "S011" "S011" "S018" "S018" ...
head(retail)
## # A tibble: 6 × 7
##   transaction_id date                product     quantity price channel store_id
##            <dbl> <dttm>              <chr>          <dbl> <dbl> <chr>   <chr>   
## 1              1 2023-10-01 00:00:00 Sports             7 5  e5 Online  S011    
## 2              2 2024-11-25 00:00:00 Food              10 1  e6 Online  S011    
## 3              3 2024-06-18 00:00:00 Home & Gar…        4 1.5e5 Online  S018    
## 4              4 2024-03-13 00:00:00 Beauty             2 2.5e5 <NA>    S018    
## 5              5 2023-04-24 00:00:00 Electronics        7 1.5e5 <NA>    S011    
## 6              6 2023-04-24 00:00:00 Beauty             6 1  e5 Online  S012
summary(retail)
##  transaction_id        date                       product         
##  Min.   :   1.0   Min.   :2023-01-04 00:00:00   Length:1000       
##  1st Qu.: 250.8   1st Qu.:2023-06-22 00:00:00   Class :character  
##  Median : 500.5   Median :2023-12-29 00:00:00   Mode  :character  
##  Mean   : 500.5   Mean   :2023-12-24 08:42:43                     
##  3rd Qu.: 750.2   3rd Qu.:2024-06-27 06:00:00                     
##  Max.   :1000.0   Max.   :2024-12-30 00:00:00                     
##                                                                   
##     quantity          price           channel            store_id        
##  Min.   : 1.000   Min.   :  50000   Length:1000        Length:1000       
##  1st Qu.: 3.000   1st Qu.: 100000   Class :character   Class :character  
##  Median : 5.000   Median : 150000   Mode  :character   Mode  :character  
##  Mean   : 6.888   Mean   : 505000                                        
##  3rd Qu.: 8.000   3rd Qu.: 500000                                        
##  Max.   :97.000   Max.   :2000000                                        
##                   NA's   :30

Membersihkan Data

Mengecek missing value

colSums(is.na(retail))
## transaction_id           date        product       quantity          price 
##              0              0              0              0             30 
##        channel       store_id 
##             20              0

Menghapus data kosong

retail_clean <- na.omit(retail)
retail_clean
## # A tibble: 950 × 7
##    transaction_id date                product    quantity price channel store_id
##             <dbl> <dttm>              <chr>         <dbl> <dbl> <chr>   <chr>   
##  1              1 2023-10-01 00:00:00 Sports            7 5  e5 Online  S011    
##  2              2 2024-11-25 00:00:00 Food             10 1  e6 Online  S011    
##  3              3 2024-06-18 00:00:00 Home & Ga…        4 1.5e5 Online  S018    
##  4              6 2023-04-24 00:00:00 Beauty            6 1  e5 Online  S012    
##  5              7 2023-02-12 00:00:00 Food              3 1  e6 Online  S005    
##  6              8 2024-09-24 00:00:00 Food              6 1.5e5 Online  S009    
##  7              9 2024-03-14 00:00:00 Electroni…        6 1  e5 Online  S009    
##  8             10 2024-05-31 00:00:00 Fashion           5 5  e4 Offline S010    
##  9             11 2023-01-16 00:00:00 Electroni…        4 1.5e5 Online  S019    
## 10             12 2024-12-09 00:00:00 Sports           10 1  e6 Online  S001    
## # ℹ 940 more rows

Mengecek data duplikat

sum(duplicated(retail_clean))
## [1] 0

Memastikan tipe data benar

retail_clean$date <- as.Date(retail_clean$date)
retail_clean$quantity <- as.numeric(retail_clean$quantity)
retail_clean$price <- as.numeric(retail_clean$price)

Mengecek hasil cleaning

summary(retail_clean)
##  transaction_id        date              product             quantity     
##  Min.   :   1.0   Min.   :2023-01-04   Length:950         Min.   : 1.000  
##  1st Qu.: 251.2   1st Qu.:2023-06-17   Class :character   1st Qu.: 3.000  
##  Median : 498.5   Median :2023-12-28   Mode  :character   Median : 5.000  
##  Mean   : 500.6   Mean   :2023-12-24                      Mean   : 6.811  
##  3rd Qu.: 751.8   3rd Qu.:2024-06-28                      3rd Qu.: 8.000  
##  Max.   :1000.0   Max.   :2024-12-30                      Max.   :97.000  
##      price           channel            store_id        
##  Min.   :  50000   Length:950         Length:950        
##  1st Qu.: 100000   Class :character   Class :character  
##  Median : 150000   Mode  :character   Mode  :character  
##  Mean   : 503553                                        
##  3rd Qu.: 500000                                        
##  Max.   :2000000

Transformasi Data

Membuat kolom total penjualan

retail_clean <- retail_clean %>%
  mutate(total_sales = quantity * price)

Membuat kolom bulan dan tahun

retail_clean <- retail_clean %>%
  mutate(month = month(date, label = TRUE),
         year = year(date))

Melihat hasil transformasi

head(retail_clean)
## # A tibble: 6 × 10
##   transaction_id date       product  quantity price channel store_id total_sales
##            <dbl> <date>     <chr>       <dbl> <dbl> <chr>   <chr>          <dbl>
## 1              1 2023-10-01 Sports          7 5  e5 Online  S011         3500000
## 2              2 2024-11-25 Food           10 1  e6 Online  S011        10000000
## 3              3 2024-06-18 Home & …        4 1.5e5 Online  S018          600000
## 4              6 2023-04-24 Beauty          6 1  e5 Online  S012          600000
## 5              7 2023-02-12 Food            3 1  e6 Online  S005         3000000
## 6              8 2024-09-24 Food            6 1.5e5 Online  S009          900000
## # ℹ 2 more variables: month <ord>, year <dbl>

Insight

A. Total Penjualan per Produk

sales_product <- retail_clean %>%
  group_by(product) %>%
  summarise(total_sales = sum(total_sales)) %>%
  arrange(desc(total_sales))

sales_product
## # A tibble: 12 × 2
##    product       total_sales
##    <chr>               <dbl>
##  1 Beauty          706225000
##  2 Home & Garden   632175000
##  3 Fashion         616025000
##  4 Electronics     594875000
##  5 Food            391175000
##  6 Sports          389075000
##  7 electronics      30650000
##  8 sports           18000000
##  9 beauty           12200000
## 10 fashion          12100000
## 11 food              2725000
## 12 home & garden      450000

B. Penjualan per Bulan

sales_month <- retail_clean %>%
  group_by(month) %>%
  summarise(total_sales = sum(total_sales)) %>%
  arrange(desc(total_sales))

sales_month
## # A tibble: 12 × 2
##    month total_sales
##    <ord>       <dbl>
##  1 Nov     465925000
##  2 Jul     397675000
##  3 Mar     352525000
##  4 Jan     330825000
##  5 Aug     280400000
##  6 Apr     260675000
##  7 Jun     251050000
##  8 May     244400000
##  9 Sep     240425000
## 10 Dec     221175000
## 11 Feb     197900000
## 12 Oct     162700000

C. Penjualan Berdasarkan Channel

sales_channel <- retail_clean %>%
  group_by(channel) %>%
  summarise(total_sales = sum(total_sales))

sales_channel
## # A tibble: 2 × 2
##   channel total_sales
##   <chr>         <dbl>
## 1 Offline  1035625000
## 2 Online   2370050000

Visualisasi Data

A. Bar Chart Penjualan per Produk

ggplot(sales_product, aes(x = reorder(product, total_sales), y = total_sales)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(title = "Total Penjualan per Produk",
       x = "Produk",
       y = "Total Penjualan") +
  theme_minimal()

B. Grafik Penjualan per Bulan

ggplot(sales_month, aes(x = month, y = total_sales, group = 1)) +
  geom_line(color = "pink", linewidth = 1.2) +
  geom_point(color = "purple", size = 2) +
  labs(title = "Tren Penjualan per Bulan",
       x = "Bulan",
       y = "Total Penjualan") +
  theme_minimal()

C. Pie Chart Penjualan Berdasarkan Channel

ggplot(sales_channel, aes(x = "", y = total_sales, fill = channel)) +
  geom_col(width = 1) +
  coord_polar("y", start = 0) +
  labs(title = "Penjualan Berdasarkan Channel") +
  theme_void()