library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.5.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
retail <- read_excel("01_RETAIL_SALES.xlsx")
str(retail)
## tibble [1,000 × 7] (S3: tbl_df/tbl/data.frame)
## $ transaction_id: num [1:1000] 1 2 3 4 5 6 7 8 9 10 ...
## $ date : POSIXct[1:1000], format: "2023-10-01" "2024-11-25" ...
## $ product : chr [1:1000] "Sports" "Food" "Home & Garden" "Beauty" ...
## $ quantity : num [1:1000] 7 10 4 2 7 6 3 6 6 5 ...
## $ price : num [1:1000] 500000 1000000 150000 250000 150000 100000 1000000 150000 100000 50000 ...
## $ channel : chr [1:1000] "Online" "Online" "Online" NA ...
## $ store_id : chr [1:1000] "S011" "S011" "S018" "S018" ...
head(retail)
## # A tibble: 6 × 7
## transaction_id date product quantity price channel store_id
## <dbl> <dttm> <chr> <dbl> <dbl> <chr> <chr>
## 1 1 2023-10-01 00:00:00 Sports 7 5 e5 Online S011
## 2 2 2024-11-25 00:00:00 Food 10 1 e6 Online S011
## 3 3 2024-06-18 00:00:00 Home & Gar… 4 1.5e5 Online S018
## 4 4 2024-03-13 00:00:00 Beauty 2 2.5e5 <NA> S018
## 5 5 2023-04-24 00:00:00 Electronics 7 1.5e5 <NA> S011
## 6 6 2023-04-24 00:00:00 Beauty 6 1 e5 Online S012
summary(retail)
## transaction_id date product
## Min. : 1.0 Min. :2023-01-04 00:00:00 Length:1000
## 1st Qu.: 250.8 1st Qu.:2023-06-22 00:00:00 Class :character
## Median : 500.5 Median :2023-12-29 00:00:00 Mode :character
## Mean : 500.5 Mean :2023-12-24 08:42:43
## 3rd Qu.: 750.2 3rd Qu.:2024-06-27 06:00:00
## Max. :1000.0 Max. :2024-12-30 00:00:00
##
## quantity price channel store_id
## Min. : 1.000 Min. : 50000 Length:1000 Length:1000
## 1st Qu.: 3.000 1st Qu.: 100000 Class :character Class :character
## Median : 5.000 Median : 150000 Mode :character Mode :character
## Mean : 6.888 Mean : 505000
## 3rd Qu.: 8.000 3rd Qu.: 500000
## Max. :97.000 Max. :2000000
## NA's :30
Mengecek missing value
colSums(is.na(retail))
## transaction_id date product quantity price
## 0 0 0 0 30
## channel store_id
## 20 0
Menghapus data kosong
retail_clean <- na.omit(retail)
retail_clean
## # A tibble: 950 × 7
## transaction_id date product quantity price channel store_id
## <dbl> <dttm> <chr> <dbl> <dbl> <chr> <chr>
## 1 1 2023-10-01 00:00:00 Sports 7 5 e5 Online S011
## 2 2 2024-11-25 00:00:00 Food 10 1 e6 Online S011
## 3 3 2024-06-18 00:00:00 Home & Ga… 4 1.5e5 Online S018
## 4 6 2023-04-24 00:00:00 Beauty 6 1 e5 Online S012
## 5 7 2023-02-12 00:00:00 Food 3 1 e6 Online S005
## 6 8 2024-09-24 00:00:00 Food 6 1.5e5 Online S009
## 7 9 2024-03-14 00:00:00 Electroni… 6 1 e5 Online S009
## 8 10 2024-05-31 00:00:00 Fashion 5 5 e4 Offline S010
## 9 11 2023-01-16 00:00:00 Electroni… 4 1.5e5 Online S019
## 10 12 2024-12-09 00:00:00 Sports 10 1 e6 Online S001
## # ℹ 940 more rows
Mengecek data duplikat
sum(duplicated(retail_clean))
## [1] 0
Memastikan tipe data benar
retail_clean$date <- as.Date(retail_clean$date)
retail_clean$quantity <- as.numeric(retail_clean$quantity)
retail_clean$price <- as.numeric(retail_clean$price)
Mengecek hasil cleaning
summary(retail_clean)
## transaction_id date product quantity
## Min. : 1.0 Min. :2023-01-04 Length:950 Min. : 1.000
## 1st Qu.: 251.2 1st Qu.:2023-06-17 Class :character 1st Qu.: 3.000
## Median : 498.5 Median :2023-12-28 Mode :character Median : 5.000
## Mean : 500.6 Mean :2023-12-24 Mean : 6.811
## 3rd Qu.: 751.8 3rd Qu.:2024-06-28 3rd Qu.: 8.000
## Max. :1000.0 Max. :2024-12-30 Max. :97.000
## price channel store_id
## Min. : 50000 Length:950 Length:950
## 1st Qu.: 100000 Class :character Class :character
## Median : 150000 Mode :character Mode :character
## Mean : 503553
## 3rd Qu.: 500000
## Max. :2000000
Membuat kolom total penjualan
retail_clean <- retail_clean %>%
mutate(total_sales = quantity * price)
Membuat kolom bulan dan tahun
retail_clean <- retail_clean %>%
mutate(month = month(date, label = TRUE),
year = year(date))
Melihat hasil transformasi
head(retail_clean)
## # A tibble: 6 × 10
## transaction_id date product quantity price channel store_id total_sales
## <dbl> <date> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1 2023-10-01 Sports 7 5 e5 Online S011 3500000
## 2 2 2024-11-25 Food 10 1 e6 Online S011 10000000
## 3 3 2024-06-18 Home & … 4 1.5e5 Online S018 600000
## 4 6 2023-04-24 Beauty 6 1 e5 Online S012 600000
## 5 7 2023-02-12 Food 3 1 e6 Online S005 3000000
## 6 8 2024-09-24 Food 6 1.5e5 Online S009 900000
## # ℹ 2 more variables: month <ord>, year <dbl>
A. Total Penjualan per Produk
sales_product <- retail_clean %>%
group_by(product) %>%
summarise(total_sales = sum(total_sales)) %>%
arrange(desc(total_sales))
sales_product
## # A tibble: 12 × 2
## product total_sales
## <chr> <dbl>
## 1 Beauty 706225000
## 2 Home & Garden 632175000
## 3 Fashion 616025000
## 4 Electronics 594875000
## 5 Food 391175000
## 6 Sports 389075000
## 7 electronics 30650000
## 8 sports 18000000
## 9 beauty 12200000
## 10 fashion 12100000
## 11 food 2725000
## 12 home & garden 450000
B. Penjualan per Bulan
sales_month <- retail_clean %>%
group_by(month) %>%
summarise(total_sales = sum(total_sales)) %>%
arrange(desc(total_sales))
sales_month
## # A tibble: 12 × 2
## month total_sales
## <ord> <dbl>
## 1 Nov 465925000
## 2 Jul 397675000
## 3 Mar 352525000
## 4 Jan 330825000
## 5 Aug 280400000
## 6 Apr 260675000
## 7 Jun 251050000
## 8 May 244400000
## 9 Sep 240425000
## 10 Dec 221175000
## 11 Feb 197900000
## 12 Oct 162700000
C. Penjualan Berdasarkan Channel
sales_channel <- retail_clean %>%
group_by(channel) %>%
summarise(total_sales = sum(total_sales))
sales_channel
## # A tibble: 2 × 2
## channel total_sales
## <chr> <dbl>
## 1 Offline 1035625000
## 2 Online 2370050000
A. Bar Chart Penjualan per Produk
ggplot(sales_product, aes(x = reorder(product, total_sales), y = total_sales)) +
geom_col(fill = "skyblue") +
coord_flip() +
labs(title = "Total Penjualan per Produk",
x = "Produk",
y = "Total Penjualan") +
theme_minimal()
B. Grafik Penjualan per Bulan
ggplot(sales_month, aes(x = month, y = total_sales, group = 1)) +
geom_line(color = "pink", linewidth = 1.2) +
geom_point(color = "purple", size = 2) +
labs(title = "Tren Penjualan per Bulan",
x = "Bulan",
y = "Total Penjualan") +
theme_minimal()
C. Pie Chart Penjualan Berdasarkan Channel
ggplot(sales_channel, aes(x = "", y = total_sales, fill = channel)) +
geom_col(width = 1) +
coord_polar("y", start = 0) +
labs(title = "Penjualan Berdasarkan Channel") +
theme_void()