##Eksplorasi data Online Sales

#Set Direnctory

setwd("C:/Users/LENOVO/Downloads")

#Load Data

dat <- read.csv("online_sales_dataset.csv")

#Statistik Deskriptif Data

library("skimr")
## Warning: package 'skimr' was built under R version 4.3.3
summary(dat) #Statistik deskriptif data
##    InvoiceNo       StockCode         Description           Quantity     
##  Min.   :100005   Length:49782       Length:49782       Min.   :-50.00  
##  1st Qu.:324543   Class :character   Class :character   1st Qu.: 11.00  
##  Median :552244   Mode  :character   Mode  :character   Median : 23.00  
##  Mean   :550681                                         Mean   : 22.37  
##  3rd Qu.:776364                                         3rd Qu.: 37.00  
##  Max.   :999997                                         Max.   : 49.00  
##                                                                         
##  InvoiceDate          UnitPrice        CustomerID      Country         
##  Length:49782       Min.   :-99.98   Min.   :10001   Length:49782      
##  Class :character   1st Qu.: 23.59   1st Qu.:32751   Class :character  
##  Mode  :character   Median : 48.92   Median :55165   Mode  :character  
##                     Mean   : 47.54   Mean   :55033                     
##                     3rd Qu.: 74.61   3rd Qu.:77306                     
##                     Max.   :100.00   Max.   :99998                     
##                                      NA's   :4978                      
##     Discount      PaymentMethod       ShippingCost     Category        
##  Min.   :0.0000   Length:49782       Min.   : 5.00   Length:49782      
##  1st Qu.:0.1300   Class :character   1st Qu.:11.22   Class :character  
##  Median :0.2600   Mode  :character   Median :17.50   Mode  :character  
##  Mean   :0.2757                      Mean   :17.49                     
##  3rd Qu.:0.3800                      3rd Qu.:23.72                     
##  Max.   :1.9998                      Max.   :30.00                     
##                                      NA's   :2489                      
##  SalesChannel       ReturnStatus       ShipmentProvider   WarehouseLocation 
##  Length:49782       Length:49782       Length:49782       Length:49782      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  OrderPriority     
##  Length:49782      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
str(dat) #Mengenali struktur data tiap variabel 
## 'data.frame':    49782 obs. of  17 variables:
##  $ InvoiceNo        : int  221958 771155 231932 465838 359178 744167 210268 832180 154886 237337 ...
##  $ StockCode        : chr  "SKU_1964" "SKU_1241" "SKU_1501" "SKU_1760" ...
##  $ Description      : chr  "White Mug" "White Mug" "Headphones" "Desk Lamp" ...
##  $ Quantity         : int  38 18 49 14 -30 47 25 8 19 40 ...
##  $ InvoiceDate      : chr  "2020-01-01 00:00" "2020-01-01 01:00" "2020-01-01 02:00" "2020-01-01 03:00" ...
##  $ UnitPrice        : num  1.71 41.25 29.11 76.68 -68.11 ...
##  $ CustomerID       : num  37039 19144 50472 96586 NA ...
##  $ Country          : chr  "Australia" "Spain" "Germany" "Netherlands" ...
##  $ Discount         : num  0.47 0.19 0.35 0.14 1.5 ...
##  $ PaymentMethod    : chr  "Bank Transfer" "paypall" "Bank Transfer" "paypall" ...
##  $ ShippingCost     : num  10.79 9.51 23.03 11.08 NA ...
##  $ Category         : chr  "Apparel" "Electronics" "Electronics" "Accessories" ...
##  $ SalesChannel     : chr  "In-store" "Online" "Online" "Online" ...
##  $ ReturnStatus     : chr  "Not Returned" "Not Returned" "Returned" "Not Returned" ...
##  $ ShipmentProvider : chr  "UPS" "UPS" "UPS" "Royal Mail" ...
##  $ WarehouseLocation: chr  "London" "Rome" "Berlin" "Rome" ...
##  $ OrderPriority    : chr  "Medium" "Medium" "High" "Low" ...

Terdapat 44804 transaksi yang tercatat pada dataset ini. Dataset ini memiliki 17 Variabel yang terdiri dari: 1. InvoiceNo (Nomor Transaksi) 2. StockCode (Kode Stock) 3. Description (Nama Produk) 4. Quantity (Jumlah Barang yang di Beli) 5. InvoiceDate (Tanggal Transaksi) 6. UnitPrice (Harga Barang) 7. CustomerID (ID Pembeli) 8. Country (Negara Pembeli) 9. Discount (Potongan Harga) 10. PaymentMethod (Metode Pembayaran) 11. ShippingCost (Biaya Pengiriman) 12. Category (Kategori Produk) 13. SalesChannel (Jalur Penjualan) 14. ReturnStatus (Status Pengembalian) 15. ShipmentProvider (Layanan Pengiriman) 16. WarehouseLocation (Lokasi Gudang) 17. OrderPriority (Priorotas Pembelian).

#Cleaning Data

colSums(is.na(dat)) #Mengidentifikasi jumlah NA pada tiap kolom/variabel
##         InvoiceNo         StockCode       Description          Quantity 
##                 0                 0                 0                 0 
##       InvoiceDate         UnitPrice        CustomerID           Country 
##                 0                 0              4978                 0 
##          Discount     PaymentMethod      ShippingCost          Category 
##                 0                 0              2489                 0 
##      SalesChannel      ReturnStatus  ShipmentProvider WarehouseLocation 
##                 0                 0                 0                 0 
##     OrderPriority 
##                 0

Terdapat 4978 missing value pada kolom CustomerID dan 2489 missing value pada kolom ShippingCost. Kemudian akan dilakukan penghapusan baris yang terdapat missing value pada data.

dat <- na.omit(dat) #Menghapus baris yg memiliki NA
str(dat)
## 'data.frame':    44804 obs. of  17 variables:
##  $ InvoiceNo        : int  221958 771155 231932 465838 744167 210268 832180 154886 237337 621430 ...
##  $ StockCode        : chr  "SKU_1964" "SKU_1241" "SKU_1501" "SKU_1760" ...
##  $ Description      : chr  "White Mug" "White Mug" "Headphones" "Desk Lamp" ...
##  $ Quantity         : int  38 18 49 14 47 25 8 19 40 49 ...
##  $ InvoiceDate      : chr  "2020-01-01 00:00" "2020-01-01 01:00" "2020-01-01 02:00" "2020-01-01 03:00" ...
##  $ UnitPrice        : num  1.71 41.25 29.11 76.68 70.16 ...
##  $ CustomerID       : num  37039 19144 50472 96586 53887 ...
##  $ Country          : chr  "Australia" "Spain" "Germany" "Netherlands" ...
##  $ Discount         : num  0.47 0.19 0.35 0.14 0.48 0.15 0.04 0.05 0.16 0.19 ...
##  $ PaymentMethod    : chr  "Bank Transfer" "paypall" "Bank Transfer" "paypall" ...
##  $ ShippingCost     : num  10.79 9.51 23.03 11.08 13.98 ...
##  $ Category         : chr  "Apparel" "Electronics" "Electronics" "Accessories" ...
##  $ SalesChannel     : chr  "In-store" "Online" "Online" "Online" ...
##  $ ReturnStatus     : chr  "Not Returned" "Not Returned" "Returned" "Not Returned" ...
##  $ ShipmentProvider : chr  "UPS" "UPS" "UPS" "Royal Mail" ...
##  $ WarehouseLocation: chr  "London" "Rome" "Berlin" "Rome" ...
##  $ OrderPriority    : chr  "Medium" "Medium" "High" "Low" ...
##  - attr(*, "na.action")= 'omit' Named int [1:4978] 5 50 63 72 104 145 148 168 174 177 ...
##   ..- attr(*, "names")= chr [1:4978] "5" "50" "63" "72" ...
duplicated<- dat[duplicated(dat), ]
print(duplicated)
##  [1] InvoiceNo         StockCode         Description       Quantity         
##  [5] InvoiceDate       UnitPrice         CustomerID        Country          
##  [9] Discount          PaymentMethod     ShippingCost      Category         
## [13] SalesChannel      ReturnStatus      ShipmentProvider  WarehouseLocation
## [17] OrderPriority    
## <0 rows> (or 0-length row.names)

Tidak terdapat duplikasi pada data.

#Subset data Invoice Date Tahun 2020-2023
max(dat$InvoiceDate)
## [1] "2025-09-05 05:00"

Tanggal transkasi pada data paling baru adalah 5 September 2025. Akan dilakukan subset data hanya pada transaksi yang berlangsung pada pertengahan Tahun 2024.

dat<-subset(dat, dat$InvoiceDate<"2024-07-01 00:00") #Subset data sebelum 2024
max(dat$InvoiceDate)
## [1] "2024-06-30 23:00"

#Ekpolrasi data : Total Pendapatan tiap Bulan

dat$TotalPrice <- (dat$Quantity*dat$UnitPrice)+(dat$ShippingCost)-dat$Discount 
head(dat)
##   InvoiceNo StockCode  Description Quantity      InvoiceDate UnitPrice
## 1    221958  SKU_1964    White Mug       38 2020-01-01 00:00      1.71
## 2    771155  SKU_1241    White Mug       18 2020-01-01 01:00     41.25
## 3    231932  SKU_1501   Headphones       49 2020-01-01 02:00     29.11
## 4    465838  SKU_1760    Desk Lamp       14 2020-01-01 03:00     76.68
## 6    744167  SKU_1006 Office Chair       47 2020-01-01 05:00     70.16
## 7    210268  SKU_1087    USB Cable       25 2020-01-01 06:00     85.74
##   CustomerID     Country Discount PaymentMethod ShippingCost    Category
## 1      37039   Australia     0.47 Bank Transfer        10.79     Apparel
## 2      19144       Spain     0.19       paypall         9.51 Electronics
## 3      50472     Germany     0.35 Bank Transfer        23.03 Electronics
## 4      96586 Netherlands     0.14       paypall        11.08 Accessories
## 6      53887      Sweden     0.48   Credit Card        13.98 Electronics
## 7      46567     Belgium     0.15 Bank Transfer        12.92  Stationery
##   SalesChannel ReturnStatus ShipmentProvider WarehouseLocation OrderPriority
## 1     In-store Not Returned              UPS            London        Medium
## 2       Online Not Returned              UPS              Rome        Medium
## 3       Online     Returned              UPS            Berlin          High
## 4       Online Not Returned       Royal Mail              Rome           Low
## 6       Online Not Returned              DHL            London        Medium
## 7       Online Not Returned            FedEx         Amsterdam          High
##   TotalPrice
## 1      75.30
## 2     751.82
## 3    1449.07
## 4    1084.46
## 6    3311.02
## 7    2156.27
dat$InvoiceDate <- as.Date(dat$InvoiceDate, format = "%Y-%m-%d")

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.3.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
revenue <- dat %>%
  mutate(MonthDate = floor_date(InvoiceDate, "month")) %>% 
  group_by(MonthDate) %>%
  summarise(TotalRevenue = sum(TotalPrice))

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.0
## ✔ readr   2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
ggplot(revenue, aes(x = MonthDate, y = TotalRevenue)) +
  geom_line(color = "#70c6e8", size = 0.7) +
  geom_point(color = "#70c6e8", size = 1.2) +
  labs(
    title = "Total Revenue Per Month",
    x = "Month",
    y = "Total Revenue"
  ) +
  theme(plot.title = element_text(
    hjust = 0.5,          
    size = 16,            
    face = "bold")) 
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#Eksplorasi data : Top 5 Pelanggan

library(janitor)
## Warning: package 'janitor' was built under R version 4.3.3
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
Cust <- data.frame(dat$CustomerID,dat$TotalPrice)
colnames(Cust)<-c("CustomerID", "TotalPrice")
Cust5 <- Cust %>%
  arrange(desc(TotalPrice)) %>%
  slice_head(n = 5) 
print(Cust5)
##   CustomerID TotalPrice
## 1      93080    4910.54
## 2      25189    4908.77
## 3      63706    4908.23
## 4      43562    4899.51
## 5      99784    4892.44

Dari tabel tersebut ditampilkan 5 pembeli dengan total pembelian tenbanyak. Pembeli dengan total pemeblian terbanyak adalah pembeli dengan CustomerID 93080.

#Eksplorasi data : Top 5 Produk paling laris

#Top5 Product
Prod <- dat %>%
  group_by(Description) %>%
  summarise(Total=sum(Quantity)) %>%
  arrange(desc(Total)) %>%
  slice_head(n = 5)
print(Prod)
## # A tibble: 5 × 2
##   Description  Total
##   <chr>        <int>
## 1 Wall Clock   82321
## 2 USB Cable    81495
## 3 White Mug    81494
## 4 Backpack     81458
## 5 Office Chair 81110

Produk yang paling laris di pasaran adalah Wall Clock.