library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
sales <- read_excel("01_RETAIL_SALES.xlsx")
head(sales)
## # A tibble: 6 × 7
## transaction_id date product quantity price channel store_id
## <dbl> <dttm> <chr> <dbl> <dbl> <chr> <chr>
## 1 1 2023-10-01 00:00:00 Sports 7 5 e5 Online S011
## 2 2 2024-11-25 00:00:00 Food 10 1 e6 Online S011
## 3 3 2024-06-18 00:00:00 Home & Gar… 4 1.5e5 Online S018
## 4 4 2024-03-13 00:00:00 Beauty 2 2.5e5 <NA> S018
## 5 5 2023-04-24 00:00:00 Electronics 7 1.5e5 <NA> S011
## 6 6 2023-04-24 00:00:00 Beauty 6 1 e5 Online S012
str(sales)
## tibble [1,000 × 7] (S3: tbl_df/tbl/data.frame)
## $ transaction_id: num [1:1000] 1 2 3 4 5 6 7 8 9 10 ...
## $ date : POSIXct[1:1000], format: "2023-10-01" "2024-11-25" ...
## $ product : chr [1:1000] "Sports" "Food" "Home & Garden" "Beauty" ...
## $ quantity : num [1:1000] 7 10 4 2 7 6 3 6 6 5 ...
## $ price : num [1:1000] 500000 1000000 150000 250000 150000 100000 1000000 150000 100000 50000 ...
## $ channel : chr [1:1000] "Online" "Online" "Online" NA ...
## $ store_id : chr [1:1000] "S011" "S011" "S018" "S018" ...
summary(sales)
## transaction_id date product
## Min. : 1.0 Min. :2023-01-04 00:00:00 Length:1000
## 1st Qu.: 250.8 1st Qu.:2023-06-22 00:00:00 Class :character
## Median : 500.5 Median :2023-12-29 00:00:00 Mode :character
## Mean : 500.5 Mean :2023-12-24 08:42:43
## 3rd Qu.: 750.2 3rd Qu.:2024-06-27 06:00:00
## Max. :1000.0 Max. :2024-12-30 00:00:00
##
## quantity price channel store_id
## Min. : 1.000 Min. : 50000 Length:1000 Length:1000
## 1st Qu.: 3.000 1st Qu.: 100000 Class :character Class :character
## Median : 5.000 Median : 150000 Mode :character Mode :character
## Mean : 6.888 Mean : 505000
## 3rd Qu.: 8.000 3rd Qu.: 500000
## Max. :97.000 Max. :2000000
## NA's :30
colSums(is.na(sales))
## transaction_id date product quantity price
## 0 0 0 0 30
## channel store_id
## 20 0
sales %>% filter(is.na(price) | is.na(channel))
## # A tibble: 50 × 7
## transaction_id date product quantity price channel store_id
## <dbl> <dttm> <chr> <dbl> <dbl> <chr> <chr>
## 1 4 2024-03-13 00:00:00 Beauty 2 250000 <NA> S018
## 2 5 2023-04-24 00:00:00 Electron… 7 150000 <NA> S011
## 3 49 2024-02-04 00:00:00 Sports 2 NA Online S020
## 4 77 2024-07-17 00:00:00 Electron… 9 NA Online S017
## 5 86 2023-08-26 00:00:00 Food 5 500000 <NA> S012
## 6 91 2023-03-29 00:00:00 Food 6 100000 <NA> S010
## 7 102 2024-04-09 00:00:00 Electron… 5 NA Offline S008
## 8 124 2023-03-22 00:00:00 Electron… 1 NA Online S003
## 9 138 2024-05-28 00:00:00 electron… 9 NA Online S008
## 10 139 2023-09-23 00:00:00 Fashion 4 NA Offline S017
## # ℹ 40 more rows
sales$channel[is.na(sales$channel)] <- "Offline"
sales$price[is.na(sales$price)] <- mean(sales$price, na.rm = TRUE)
sales$product <- tools::toTitleCase(tolower(sales$product))
unique(sales$product)
## [1] "Sports" "Food" "Home & Garden" "Beauty"
## [5] "Electronics" "Fashion"
table(sales$product)
##
## Beauty Electronics Fashion Food Home & Garden
## 174 167 176 163 149
## Sports
## 171
colSums(is.na(sales))
## transaction_id date product quantity price
## 0 0 0 0 0
## channel store_id
## 0 0
sum(duplicated(sales))
## [1] 0
sales <- sales %>%
mutate(
total_sales = quantity * price,
month = month(date, label = TRUE),
year = year(date)
)
sum(sales$total_sales)
## [1] 3756170000
product_summary <- sales %>%
group_by(product) %>%
summarise(total_quantity = sum(quantity)) %>%
arrange(desc(total_quantity))
product_summary
## # A tibble: 6 × 2
## product total_quantity
## <chr> <dbl>
## 1 Beauty 1398
## 2 Fashion 1240
## 3 Home & Garden 1223
## 4 Electronics 1065
## 5 Sports 1023
## 6 Food 939
channel_summary <- sales %>%
group_by(channel) %>%
summarise(total_sales = sum(total_sales))
channel_summary
## # A tibble: 2 × 2
## channel total_sales
## <chr> <dbl>
## 1 Offline 1333095000
## 2 Online 2423075000
store_summary <- sales %>%
group_by(store_id) %>%
summarise(total_sales = sum(total_sales)) %>%
arrange(desc(total_sales))
head(store_summary)
## # A tibble: 6 × 2
## store_id total_sales
## <chr> <dbl>
## 1 S016 371305000
## 2 S015 311400000
## 3 S007 309125000
## 4 S013 255465000
## 5 S009 250940000
## 6 S014 238110000
ggplot(product_summary,
aes(x = reorder(product, total_quantity),
y = total_quantity,
fill = product)) +
geom_col() +
coord_flip() +
labs(
title = "Produk Terlaris",
x = "Produk",
y = "Total Quantity"
) +
theme_minimal()
Insight: Produk dengan jumlah pembelian tertinggi menunjukkan kategori yang paling diminati pelanggan.
ggplot(channel_summary,
aes(x = "", y = total_sales, fill = channel)) +
geom_col(width = 1) +
coord_polar("y") +
labs(title = "Persentase Penjualan Berdasarkan Channel") +
theme_void()
Insight: Visualisasi ini menunjukkan kontribusi penjualan dari channel Online dan Offline.
monthly_sales <- sales %>%
group_by(month) %>%
summarise(total_sales = sum(total_sales))
monthly_sales
## # A tibble: 12 × 2
## month total_sales
## <ord> <dbl>
## 1 Jan 504360000
## 2 Feb 224805000
## 3 Mar 372330000
## 4 Apr 272750000
## 5 May 254480000
## 6 Jun 257615000
## 7 Jul 409795000
## 8 Aug 289410000
## 9 Sep 286980000
## 10 Oct 182720000
## 11 Nov 475990000
## 12 Dec 224935000
ggplot(monthly_sales,
aes(x = month,
y = total_sales,
group = 1)) +
geom_line(size = 1.2, color = "blue") +
geom_point(size = 3, color = "red") +
labs(
title = "Trend Penjualan Bulanan",
x = "Bulan",
y = "Total Sales"
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Insight: Grafik menunjukkan perubahan penjualan setiap bulan dan membantu mengetahui periode penjualan tertinggi.
ggplot(sales,
aes(x = product,
y = price,
fill = product)) +
geom_boxplot() +
labs(
title = "Distribusi Harga Produk",
x = "Produk",
y = "Harga"
) +
theme_minimal()
Insight: Boxplot digunakan untuk melihat persebaran harga dan mendeteksi outlier.
ggplot(sales,
aes(x = quantity,
y = total_sales,
color = channel)) +
geom_point(size = 3, alpha = 0.7) +
labs(
title = "Hubungan Quantity dan Total Sales",
x = "Quantity",
y = "Total Sales"
) +
theme_minimal()
Insight: Semakin besar quantity pembelian maka total sales cenderung meningkat.
ggplot(sales,
aes(x = total_sales,
fill = channel)) +
geom_density(alpha = 0.5) +
labs(
title = "Density Plot Total Sales",
x = "Total Sales"
) +
theme_minimal()
Insight: Membandingkan distribusi sales Online vs Offline.
Menunjukkan pola penjualan tiap channel.
ggplot(sales,
aes(x = channel,
y = total_sales,
fill = channel)) +
geom_violin(trim = FALSE) +
labs(
title = "Distribusi Total Sales per Channel",
x = "Channel",
y = "Total Sales"
) +
theme_minimal()
Insight: Menunjukkan distribusi data dan kepadatan transaksi.
Membandingkan pola transaksi tiap channel.
ggplot(sales,
aes(x = product,
fill = channel)) +
geom_bar() +
labs(
title = "Jumlah Produk Berdasarkan Channel",
x = "Produk",
y = "Jumlah"
) +
theme_minimal()
Insight: Melihat produk lebih sering dibeli lewat Online atau Offline.
correlation_data <- sales %>%
select(quantity, price, total_sales)
correlation_matrix <- cor(correlation_data)
correlation_df <- as.data.frame(as.table(correlation_matrix))
colnames(correlation_df) <- c("Var1", "Var2", "Correlation")
correlation_df
## Var1 Var2 Correlation
## 1 quantity quantity 1.00000000
## 2 price quantity 0.04212659
## 3 total_sales quantity 0.60366996
## 4 quantity price 0.04212659
## 5 price price 1.00000000
## 6 total_sales price 0.43304145
## 7 quantity total_sales 0.60366996
## 8 price total_sales 0.43304145
## 9 total_sales total_sales 1.00000000
ggplot(correlation_df,
aes(x = Var1,
y = Var2,
fill = Correlation)) +
geom_tile(color = "white") +
geom_text(aes(label = round(Correlation, 2)),
color = "black",
size = 5) +
scale_fill_gradient2(
low = "blue",
mid = "white",
high = "red",
midpoint = 0
) +
labs(
title = "Heatmap Korelasi Variabel"
) +
theme_minimal()
Warna merah artinya korelasi positif kuat
Warna biru artinya
korelasi negatif
Semakin mendekati 1 artinya hubungan semakin kuat
total_sales biasanya sangat berkorelasi dengan quantity
Berdasarkan hasil EDA:
1. Dataset memiliki sedikit missing value
pada kolom channel.
2. Setelah cleaning, data siap digunakan untuk
analisis.
3. Produk dengan quantity tertinggi menjadi kategori
paling diminati.
4. Channel dengan total sales terbesar menunjukkan
metode belanja favorit pelanggan.
5. Trend penjualan bulanan dapat
digunakan untuk strategi bisnis dan promosi.
6. Visualisasi
membantu memahami pola data dengan lebih mudah.