DESKRIPSI DATA Dataset ini berisi catatan transaksi penjualan dari sebuah toko ritel online yang menjual berbagai produk rumah tangga dan hadiah. Setiap baris mewakili satu item yang dibeli dalam satu transaksi (invoice). - InvoiceNo: Nomor faktur transaksi. Setiap transaksi memiliki nomor unik. - StockCode: Kode unik untuk setiap produk. - Description: Nama atau deskripsi produk. - Quantity: Jumlah unit produk yang dibeli. - InvoiceDate: Tanggal dan waktu transaksi terjadi. - UnitPrice: Harga satuan produk. - CustomerID: ID unik untuk setiap pelanggan. - Country: Negara asal pelanggan.

Variabel X (independen): faktor yang mempengaruhi sesuatu. Variabel Y (dependen): hasil atau hal yang dipengaruhi.

library(readxl)
online_retail <- read_excel ("Online Retail.xlsx") 
online_retail
## # A tibble: 541,909 × 8
##    InvoiceNo StockCode Description        Quantity InvoiceDate         UnitPrice
##    <chr>     <chr>     <chr>                 <dbl> <dttm>                  <dbl>
##  1 536365    85123A    WHITE HANGING HEA…        6 2010-12-01 08:26:00      2.55
##  2 536365    71053     WHITE METAL LANTE…        6 2010-12-01 08:26:00      3.39
##  3 536365    84406B    CREAM CUPID HEART…        8 2010-12-01 08:26:00      2.75
##  4 536365    84029G    KNITTED UNION FLA…        6 2010-12-01 08:26:00      3.39
##  5 536365    84029E    RED WOOLLY HOTTIE…        6 2010-12-01 08:26:00      3.39
##  6 536365    22752     SET 7 BABUSHKA NE…        2 2010-12-01 08:26:00      7.65
##  7 536365    21730     GLASS STAR FROSTE…        6 2010-12-01 08:26:00      4.25
##  8 536366    22633     HAND WARMER UNION…        6 2010-12-01 08:28:00      1.85
##  9 536366    22632     HAND WARMER RED P…        6 2010-12-01 08:28:00      1.85
## 10 536367    84879     ASSORTED COLOUR B…       32 2010-12-01 08:34:00      1.69
## # ℹ 541,899 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
#Cek apakah ada missing value
colSums(is.na(online_retail))
##   InvoiceNo   StockCode Description    Quantity InvoiceDate   UnitPrice 
##           0           0        1454           0           0           0 
##  CustomerID     Country 
##      135080           0
#Menghapus baris yang punya NA di kolom tertentu
library (dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
online_retail1 <- online_retail %>%
  filter(if_all(c(Description, CustomerID), ~ !is.na(.))) 
online_retail1
## # A tibble: 406,829 × 8
##    InvoiceNo StockCode Description        Quantity InvoiceDate         UnitPrice
##    <chr>     <chr>     <chr>                 <dbl> <dttm>                  <dbl>
##  1 536365    85123A    WHITE HANGING HEA…        6 2010-12-01 08:26:00      2.55
##  2 536365    71053     WHITE METAL LANTE…        6 2010-12-01 08:26:00      3.39
##  3 536365    84406B    CREAM CUPID HEART…        8 2010-12-01 08:26:00      2.75
##  4 536365    84029G    KNITTED UNION FLA…        6 2010-12-01 08:26:00      3.39
##  5 536365    84029E    RED WOOLLY HOTTIE…        6 2010-12-01 08:26:00      3.39
##  6 536365    22752     SET 7 BABUSHKA NE…        2 2010-12-01 08:26:00      7.65
##  7 536365    21730     GLASS STAR FROSTE…        6 2010-12-01 08:26:00      4.25
##  8 536366    22633     HAND WARMER UNION…        6 2010-12-01 08:28:00      1.85
##  9 536366    22632     HAND WARMER RED P…        6 2010-12-01 08:28:00      1.85
## 10 536367    84879     ASSORTED COLOUR B…       32 2010-12-01 08:34:00      1.69
## # ℹ 406,819 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
#Menghapus baris yang punya NA di semua kolom 
online_retail1 <- online_retail %>%
  filter(if_all(everything(), ~ !is.na(.)))
online_retail1
## # A tibble: 406,829 × 8
##    InvoiceNo StockCode Description        Quantity InvoiceDate         UnitPrice
##    <chr>     <chr>     <chr>                 <dbl> <dttm>                  <dbl>
##  1 536365    85123A    WHITE HANGING HEA…        6 2010-12-01 08:26:00      2.55
##  2 536365    71053     WHITE METAL LANTE…        6 2010-12-01 08:26:00      3.39
##  3 536365    84406B    CREAM CUPID HEART…        8 2010-12-01 08:26:00      2.75
##  4 536365    84029G    KNITTED UNION FLA…        6 2010-12-01 08:26:00      3.39
##  5 536365    84029E    RED WOOLLY HOTTIE…        6 2010-12-01 08:26:00      3.39
##  6 536365    22752     SET 7 BABUSHKA NE…        2 2010-12-01 08:26:00      7.65
##  7 536365    21730     GLASS STAR FROSTE…        6 2010-12-01 08:26:00      4.25
##  8 536366    22633     HAND WARMER UNION…        6 2010-12-01 08:28:00      1.85
##  9 536366    22632     HAND WARMER RED P…        6 2010-12-01 08:28:00      1.85
## 10 536367    84879     ASSORTED COLOUR B…       32 2010-12-01 08:34:00      1.69
## # ℹ 406,819 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
#Cek lagi apakah masih ada missing value
colSums(is.na(online_retail1))
##   InvoiceNo   StockCode Description    Quantity InvoiceDate   UnitPrice 
##           0           0           0           0           0           0 
##  CustomerID     Country 
##           0           0
#Cek duplikasi
online_retail2 <- online_retail1 %>%
  filter(duplicated(.))

online_retail3 <- online_retail2 %>%
  group_by(across(everything())) %>%
  filter(n()>1)
online_retail3
## # A tibble: 678 × 8
## # Groups:   InvoiceNo, StockCode, Description, Quantity, InvoiceDate,
## #   UnitPrice, CustomerID, Country [290]
##    InvoiceNo StockCode Description        Quantity InvoiceDate         UnitPrice
##    <chr>     <chr>     <chr>                 <dbl> <dttm>                  <dbl>
##  1 536412    21448     12 DAISY PEGS IN …        2 2010-12-01 11:49:00      1.65
##  2 536412    21448     12 DAISY PEGS IN …        2 2010-12-01 11:49:00      1.65
##  3 536464    22866     HAND WARMER SCOTT…        1 2010-12-01 12:23:00      2.1 
##  4 536464    22866     HAND WARMER SCOTT…        1 2010-12-01 12:23:00      2.1 
##  5 536749    21415     CLAM SHELL SMALL          2 2010-12-02 13:49:00      2.1 
##  6 536749    22174     PHOTO CUBE                2 2010-12-02 13:49:00      1.65
##  7 536749    21415     CLAM SHELL SMALL          2 2010-12-02 13:49:00      2.1 
##  8 536749    22174     PHOTO CUBE                2 2010-12-02 13:49:00      1.65
##  9 536796    21967     PACK OF 12 SKULL …        1 2010-12-02 15:46:00      0.29
## 10 536796    21967     PACK OF 12 SKULL …        1 2010-12-02 15:46:00      0.29
## # ℹ 668 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
#Delete duplicate
online_retail4 <- online_retail3%>%
  distinct()
online_retail4
## # A tibble: 290 × 8
## # Groups:   InvoiceNo, StockCode, Description, Quantity, InvoiceDate,
## #   UnitPrice, CustomerID, Country [290]
##    InvoiceNo StockCode Description        Quantity InvoiceDate         UnitPrice
##    <chr>     <chr>     <chr>                 <dbl> <dttm>                  <dbl>
##  1 536412    21448     12 DAISY PEGS IN …        2 2010-12-01 11:49:00      1.65
##  2 536464    22866     HAND WARMER SCOTT…        1 2010-12-01 12:23:00      2.1 
##  3 536749    21415     CLAM SHELL SMALL          2 2010-12-02 13:49:00      2.1 
##  4 536749    22174     PHOTO CUBE                2 2010-12-02 13:49:00      1.65
##  5 536796    21967     PACK OF 12 SKULL …        1 2010-12-02 15:46:00      0.29
##  6 536874    22866     HAND WARMER SCOTT…        1 2010-12-03 11:35:00      2.1 
##  7 537042    21579     LOLITA  DESIGN  C…        1 2010-12-05 10:45:00      2.25
##  8 537051    22730     ALARM CLOCK BAKEL…        1 2010-12-05 11:12:00      3.75
##  9 537144    22086     PAPER CHAIN KIT 5…        1 2010-12-05 13:00:00      2.95
## 10 537144    21882     SKULLS TAPE               1 2010-12-05 13:00:00      0.65
## # ℹ 280 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
#Cek duplikasi lagi
duplicated (online_retail4)
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE
online_retail5 <- online_retail4 %>%
  mutate(price_class = if_else(UnitPrice > 3, "0", "1"))
#0=mahal, 1=murah
online_retail5
## # A tibble: 290 × 9
## # Groups:   InvoiceNo, StockCode, Description, Quantity, InvoiceDate,
## #   UnitPrice, CustomerID, Country [290]
##    InvoiceNo StockCode Description        Quantity InvoiceDate         UnitPrice
##    <chr>     <chr>     <chr>                 <dbl> <dttm>                  <dbl>
##  1 536412    21448     12 DAISY PEGS IN …        2 2010-12-01 11:49:00      1.65
##  2 536464    22866     HAND WARMER SCOTT…        1 2010-12-01 12:23:00      2.1 
##  3 536749    21415     CLAM SHELL SMALL          2 2010-12-02 13:49:00      2.1 
##  4 536749    22174     PHOTO CUBE                2 2010-12-02 13:49:00      1.65
##  5 536796    21967     PACK OF 12 SKULL …        1 2010-12-02 15:46:00      0.29
##  6 536874    22866     HAND WARMER SCOTT…        1 2010-12-03 11:35:00      2.1 
##  7 537042    21579     LOLITA  DESIGN  C…        1 2010-12-05 10:45:00      2.25
##  8 537051    22730     ALARM CLOCK BAKEL…        1 2010-12-05 11:12:00      3.75
##  9 537144    22086     PAPER CHAIN KIT 5…        1 2010-12-05 13:00:00      2.95
## 10 537144    21882     SKULLS TAPE               1 2010-12-05 13:00:00      0.65
## # ℹ 280 more rows
## # ℹ 3 more variables: CustomerID <dbl>, Country <chr>, price_class <chr>
#Distribusi Harga Produk
library (ggplot2)
ggplot(online_retail5, aes(x = price_class)) +
  geom_bar(fill = "steelblue") +
  ggtitle("Distribusi Harga Produk (>3 vs <=3)") +
  theme_minimal()

online_retail4 %>%
  summarise(
    min_UnitPrice = min(UnitPrice, na.rm = TRUE),
    max_UnitPrice = max(UnitPrice, na.rm = TRUE),
    mean_UnitPrice = mean(UnitPrice, na.rm = TRUE),
    median_UnitPrice = median(UnitPrice, na.rm = TRUE),
    sd_UnitPrice = sd(UnitPrice, na.rm = TRUE),
    .groups = "drop"
)
## # A tibble: 290 × 13
##    InvoiceNo StockCode Description        Quantity InvoiceDate         UnitPrice
##    <chr>     <chr>     <chr>                 <dbl> <dttm>                  <dbl>
##  1 536412    21448     12 DAISY PEGS IN …        2 2010-12-01 11:49:00      1.65
##  2 536464    22866     HAND WARMER SCOTT…        1 2010-12-01 12:23:00      2.1 
##  3 536749    21415     CLAM SHELL SMALL          2 2010-12-02 13:49:00      2.1 
##  4 536749    22174     PHOTO CUBE                2 2010-12-02 13:49:00      1.65
##  5 536796    21967     PACK OF 12 SKULL …        1 2010-12-02 15:46:00      0.29
##  6 536874    22866     HAND WARMER SCOTT…        1 2010-12-03 11:35:00      2.1 
##  7 537042    21579     LOLITA  DESIGN  C…        1 2010-12-05 10:45:00      2.25
##  8 537051    22730     ALARM CLOCK BAKEL…        1 2010-12-05 11:12:00      3.75
##  9 537144    21882     SKULLS TAPE               1 2010-12-05 13:00:00      0.65
## 10 537144    22086     PAPER CHAIN KIT 5…        1 2010-12-05 13:00:00      2.95
## # ℹ 280 more rows
## # ℹ 7 more variables: CustomerID <dbl>, Country <chr>, min_UnitPrice <dbl>,
## #   max_UnitPrice <dbl>, mean_UnitPrice <dbl>, median_UnitPrice <dbl>,
## #   sd_UnitPrice <dbl>
cor(online_retail4$Quantity, online_retail4$UnitPrice, use = "complete.obs")
## [1] -0.162108
ggplot(online_retail4, aes(x = Quantity, y = UnitPrice)) +
  geom_point(color = "steelblue", size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "darkred") +
  labs(
    title = "Scatter Plot: Quantity vs UnitPrice",
    x = "Jumlah Produk Dibeli (Quantity)",
    y = "Harga Satuan (UnitPrice)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#Perbandingan jumlah produk terjual per kategori
library(dplyr)
produk_terjual <- online_retail5 %>%
  group_by(Description) %>%
  summarise(Jumlah = sum(Quantity)) %>%
  arrange(desc(Jumlah)) %>%
  slice_head(n = 10)  #mengambil 10 produk teratas
produk_terjual
## # A tibble: 10 × 2
##    Description                         Jumlah
##    <chr>                                <dbl>
##  1 Manual                                  48
##  2 FANCY FONTS BIRTHDAY WRAP               25
##  3 BLUE STONES ON WIRE FOR CANDLE          24
##  4 DISCO BALL CHRISTMAS DECORATION         24
##  5 HEART FILIGREE DOVE  SMALL              24
##  6 SMALL DOLLY MIX DESIGN ORANGE BOWL      16
##  7 CLEAR DRAWER KNOB ACRYLIC EDWARDIAN     12
##  8 COOK WITH WINE METAL SIGN               12
##  9 HOT BATHS METAL SIGN                    12
## 10 MOROCCAN TEA GLASS                      12
library(ggplot2)
ggplot(produk_terjual, aes(x = reorder(Description, Jumlah), y = Jumlah)) +
  geom_bar(stat = "identity", fill = "pink") +
  coord_flip() +
  labs(title = "Top 10 Produk Paling Banyak Terjual",
       x = "Produk",
       y = "Jumlah Terjual") +
  theme_minimal()

#Perbandingan Jumlah Item Terjual per Pelanggan
pelanggan_teraktif <- online_retail5 %>%
  group_by(CustomerID) %>%
  summarise(Jumlah = sum(Quantity)) %>%
  arrange(desc(Jumlah)) %>%
  slice_head(n = 10)
pelanggan_teraktif
## # A tibble: 10 × 2
##    CustomerID Jumlah
##         <dbl>  <dbl>
##  1      14607     48
##  2      14102     42
##  3      15919     38
##  4      12748     33
##  5      14525     27
##  6      14432     24
##  7      15856     24
##  8      17885     24
##  9      17084     13
## 10      16814     12
ggplot(pelanggan_teraktif, aes(x = as.factor(CustomerID), y = Jumlah, fill = as.factor(CustomerID))) +
  geom_bar(stat = "identity") +
  labs(title = "Top 10 Customer Berdasarkan Jumlah Item Terjual",
       x = "CustomerID",
       y = "Jumlah Item") +
  theme_minimal()

#Histogram Jumlah Item Berdasarkan Negara
ggplot(online_retail5, aes(x = Quantity, fill = Country)) +
  geom_histogram(position = "identity", bins = 30, alpha = 0.6) +
  labs(title = "Distribusi Jumlah Item Berdasarkan Negara",
       x = "Jumlah Item per Baris Transaksi",
       y = "Frekuensi") +
  theme_minimal()