##Eksplorasi data Online Sales
#Set Direnctory
setwd("C:/Users/LENOVO/Downloads")
#Load Data
dat <- read.csv("online_sales_dataset.csv")
#Statistik Deskriptif Data
library("skimr")
## Warning: package 'skimr' was built under R version 4.3.3
summary(dat) #Statistik deskriptif data
## InvoiceNo StockCode Description Quantity
## Min. :100005 Length:49782 Length:49782 Min. :-50.00
## 1st Qu.:324543 Class :character Class :character 1st Qu.: 11.00
## Median :552244 Mode :character Mode :character Median : 23.00
## Mean :550681 Mean : 22.37
## 3rd Qu.:776364 3rd Qu.: 37.00
## Max. :999997 Max. : 49.00
##
## InvoiceDate UnitPrice CustomerID Country
## Length:49782 Min. :-99.98 Min. :10001 Length:49782
## Class :character 1st Qu.: 23.59 1st Qu.:32751 Class :character
## Mode :character Median : 48.92 Median :55165 Mode :character
## Mean : 47.54 Mean :55033
## 3rd Qu.: 74.61 3rd Qu.:77306
## Max. :100.00 Max. :99998
## NA's :4978
## Discount PaymentMethod ShippingCost Category
## Min. :0.0000 Length:49782 Min. : 5.00 Length:49782
## 1st Qu.:0.1300 Class :character 1st Qu.:11.22 Class :character
## Median :0.2600 Mode :character Median :17.50 Mode :character
## Mean :0.2757 Mean :17.49
## 3rd Qu.:0.3800 3rd Qu.:23.72
## Max. :1.9998 Max. :30.00
## NA's :2489
## SalesChannel ReturnStatus ShipmentProvider WarehouseLocation
## Length:49782 Length:49782 Length:49782 Length:49782
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## OrderPriority
## Length:49782
## Class :character
## Mode :character
##
##
##
##
str(dat) #Mengenali struktur data tiap variabel
## 'data.frame': 49782 obs. of 17 variables:
## $ InvoiceNo : int 221958 771155 231932 465838 359178 744167 210268 832180 154886 237337 ...
## $ StockCode : chr "SKU_1964" "SKU_1241" "SKU_1501" "SKU_1760" ...
## $ Description : chr "White Mug" "White Mug" "Headphones" "Desk Lamp" ...
## $ Quantity : int 38 18 49 14 -30 47 25 8 19 40 ...
## $ InvoiceDate : chr "2020-01-01 00:00" "2020-01-01 01:00" "2020-01-01 02:00" "2020-01-01 03:00" ...
## $ UnitPrice : num 1.71 41.25 29.11 76.68 -68.11 ...
## $ CustomerID : num 37039 19144 50472 96586 NA ...
## $ Country : chr "Australia" "Spain" "Germany" "Netherlands" ...
## $ Discount : num 0.47 0.19 0.35 0.14 1.5 ...
## $ PaymentMethod : chr "Bank Transfer" "paypall" "Bank Transfer" "paypall" ...
## $ ShippingCost : num 10.79 9.51 23.03 11.08 NA ...
## $ Category : chr "Apparel" "Electronics" "Electronics" "Accessories" ...
## $ SalesChannel : chr "In-store" "Online" "Online" "Online" ...
## $ ReturnStatus : chr "Not Returned" "Not Returned" "Returned" "Not Returned" ...
## $ ShipmentProvider : chr "UPS" "UPS" "UPS" "Royal Mail" ...
## $ WarehouseLocation: chr "London" "Rome" "Berlin" "Rome" ...
## $ OrderPriority : chr "Medium" "Medium" "High" "Low" ...
Terdapat 44804 transaksi yang tercatat pada dataset ini. Dataset ini memiliki 17 Variabel yang terdiri dari: 1. InvoiceNo (Nomor Transaksi) 2. StockCode (Kode Stock) 3. Description (Nama Produk) 4. Quantity (Jumlah Barang yang di Beli) 5. InvoiceDate (Tanggal Transaksi) 6. UnitPrice (Harga Barang) 7. CustomerID (ID Pembeli) 8. Country (Negara Pembeli) 9. Discount (Potongan Harga) 10. PaymentMethod (Metode Pembayaran) 11. ShippingCost (Biaya Pengiriman) 12. Category (Kategori Produk) 13. SalesChannel (Jalur Penjualan) 14. ReturnStatus (Status Pengembalian) 15. ShipmentProvider (Layanan Pengiriman) 16. WarehouseLocation (Lokasi Gudang) 17. OrderPriority (Priorotas Pembelian).
#Cleaning Data
colSums(is.na(dat)) #Mengidentifikasi jumlah NA pada tiap kolom/variabel
## InvoiceNo StockCode Description Quantity
## 0 0 0 0
## InvoiceDate UnitPrice CustomerID Country
## 0 0 4978 0
## Discount PaymentMethod ShippingCost Category
## 0 0 2489 0
## SalesChannel ReturnStatus ShipmentProvider WarehouseLocation
## 0 0 0 0
## OrderPriority
## 0
Terdapat 4978 missing value pada kolom CustomerID dan 2489 missing value pada kolom ShippingCost. Kemudian akan dilakukan penghapusan baris yang terdapat missing value pada data.
dat <- na.omit(dat) #Menghapus baris yg memiliki NA
str(dat)
## 'data.frame': 44804 obs. of 17 variables:
## $ InvoiceNo : int 221958 771155 231932 465838 744167 210268 832180 154886 237337 621430 ...
## $ StockCode : chr "SKU_1964" "SKU_1241" "SKU_1501" "SKU_1760" ...
## $ Description : chr "White Mug" "White Mug" "Headphones" "Desk Lamp" ...
## $ Quantity : int 38 18 49 14 47 25 8 19 40 49 ...
## $ InvoiceDate : chr "2020-01-01 00:00" "2020-01-01 01:00" "2020-01-01 02:00" "2020-01-01 03:00" ...
## $ UnitPrice : num 1.71 41.25 29.11 76.68 70.16 ...
## $ CustomerID : num 37039 19144 50472 96586 53887 ...
## $ Country : chr "Australia" "Spain" "Germany" "Netherlands" ...
## $ Discount : num 0.47 0.19 0.35 0.14 0.48 0.15 0.04 0.05 0.16 0.19 ...
## $ PaymentMethod : chr "Bank Transfer" "paypall" "Bank Transfer" "paypall" ...
## $ ShippingCost : num 10.79 9.51 23.03 11.08 13.98 ...
## $ Category : chr "Apparel" "Electronics" "Electronics" "Accessories" ...
## $ SalesChannel : chr "In-store" "Online" "Online" "Online" ...
## $ ReturnStatus : chr "Not Returned" "Not Returned" "Returned" "Not Returned" ...
## $ ShipmentProvider : chr "UPS" "UPS" "UPS" "Royal Mail" ...
## $ WarehouseLocation: chr "London" "Rome" "Berlin" "Rome" ...
## $ OrderPriority : chr "Medium" "Medium" "High" "Low" ...
## - attr(*, "na.action")= 'omit' Named int [1:4978] 5 50 63 72 104 145 148 168 174 177 ...
## ..- attr(*, "names")= chr [1:4978] "5" "50" "63" "72" ...
duplicated<- dat[duplicated(dat), ]
print(duplicated)
## [1] InvoiceNo StockCode Description Quantity
## [5] InvoiceDate UnitPrice CustomerID Country
## [9] Discount PaymentMethod ShippingCost Category
## [13] SalesChannel ReturnStatus ShipmentProvider WarehouseLocation
## [17] OrderPriority
## <0 rows> (or 0-length row.names)
Tidak terdapat duplikasi pada data.
#Subset data Invoice Date Tahun 2020-2023
max(dat$InvoiceDate)
## [1] "2025-09-05 05:00"
Tanggal transkasi pada data paling baru adalah 5 September 2025. Akan dilakukan subset data hanya pada transaksi yang berlangsung pada pertengahan Tahun 2024.
dat<-subset(dat, dat$InvoiceDate<"2024-07-01 00:00") #Subset data sebelum 2024
max(dat$InvoiceDate)
## [1] "2024-06-30 23:00"
#Ekpolrasi data : Total Pendapatan tiap Bulan
dat$TotalPrice <- (dat$Quantity*dat$UnitPrice)+(dat$ShippingCost)-dat$Discount
head(dat)
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## 1 221958 SKU_1964 White Mug 38 2020-01-01 00:00 1.71
## 2 771155 SKU_1241 White Mug 18 2020-01-01 01:00 41.25
## 3 231932 SKU_1501 Headphones 49 2020-01-01 02:00 29.11
## 4 465838 SKU_1760 Desk Lamp 14 2020-01-01 03:00 76.68
## 6 744167 SKU_1006 Office Chair 47 2020-01-01 05:00 70.16
## 7 210268 SKU_1087 USB Cable 25 2020-01-01 06:00 85.74
## CustomerID Country Discount PaymentMethod ShippingCost Category
## 1 37039 Australia 0.47 Bank Transfer 10.79 Apparel
## 2 19144 Spain 0.19 paypall 9.51 Electronics
## 3 50472 Germany 0.35 Bank Transfer 23.03 Electronics
## 4 96586 Netherlands 0.14 paypall 11.08 Accessories
## 6 53887 Sweden 0.48 Credit Card 13.98 Electronics
## 7 46567 Belgium 0.15 Bank Transfer 12.92 Stationery
## SalesChannel ReturnStatus ShipmentProvider WarehouseLocation OrderPriority
## 1 In-store Not Returned UPS London Medium
## 2 Online Not Returned UPS Rome Medium
## 3 Online Returned UPS Berlin High
## 4 Online Not Returned Royal Mail Rome Low
## 6 Online Not Returned DHL London Medium
## 7 Online Not Returned FedEx Amsterdam High
## TotalPrice
## 1 75.30
## 2 751.82
## 3 1449.07
## 4 1084.46
## 6 3311.02
## 7 2156.27
dat$InvoiceDate <- as.Date(dat$InvoiceDate, format = "%Y-%m-%d")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.3.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
revenue <- dat %>%
mutate(MonthDate = floor_date(InvoiceDate, "month")) %>%
group_by(MonthDate) %>%
summarise(TotalRevenue = sum(TotalPrice))
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
ggplot(revenue, aes(x = MonthDate, y = TotalRevenue)) +
geom_line(color = "#70c6e8", size = 0.7) +
geom_point(color = "#70c6e8", size = 1.2) +
labs(
title = "Total Revenue Per Month",
x = "Month",
y = "Total Revenue"
) +
theme(plot.title = element_text(
hjust = 0.5,
size = 16,
face = "bold"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#Eksplorasi data : Top 5 Pelanggan
library(janitor)
## Warning: package 'janitor' was built under R version 4.3.3
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
Cust <- data.frame(dat$CustomerID,dat$TotalPrice)
colnames(Cust)<-c("CustomerID", "TotalPrice")
Cust5 <- Cust %>%
arrange(desc(TotalPrice)) %>%
slice_head(n = 5)
print(Cust5)
## CustomerID TotalPrice
## 1 93080 4910.54
## 2 25189 4908.77
## 3 63706 4908.23
## 4 43562 4899.51
## 5 99784 4892.44
Dari tabel tersebut ditampilkan 5 pembeli dengan total pembelian tenbanyak. Pembeli dengan total pemeblian terbanyak adalah pembeli dengan CustomerID 93080.
#Eksplorasi data : Top 5 Produk paling laris
#Top5 Product
Prod <- dat %>%
group_by(Description) %>%
summarise(Total=sum(Quantity)) %>%
arrange(desc(Total)) %>%
slice_head(n = 5)
print(Prod)
## # A tibble: 5 × 2
## Description Total
## <chr> <int>
## 1 Wall Clock 82321
## 2 USB Cable 81495
## 3 White Mug 81494
## 4 Backpack 81458
## 5 Office Chair 81110
Produk yang paling laris di pasaran adalah Wall Clock.