DESKRIPSI DATA Dataset ini berisi catatan transaksi penjualan dari sebuah toko ritel online yang menjual berbagai produk rumah tangga dan hadiah. Setiap baris mewakili satu item yang dibeli dalam satu transaksi (invoice). - InvoiceNo: Nomor faktur transaksi. Setiap transaksi memiliki nomor unik. - StockCode: Kode unik untuk setiap produk. - Description: Nama atau deskripsi produk. - Quantity: Jumlah unit produk yang dibeli. - InvoiceDate: Tanggal dan waktu transaksi terjadi. - UnitPrice: Harga satuan produk. - CustomerID: ID unik untuk setiap pelanggan. - Country: Negara asal pelanggan.
Variabel X (independen): faktor yang mempengaruhi sesuatu. Variabel Y (dependen): hasil atau hal yang dipengaruhi.
Goals 1: Menganalisis distribusi harga produk Pertanyaan: Bagaimana distribusi produk berdasarkan kategori harga (mahal >3 vs murah ≤3)? Variabel: X: price_class Y: Frekuensi produk per kategori
Goals 2: Menganalisis hubungan Quantity dan UnitPrice Pertanyaan: Apakah ada hubungan antara jumlah produk yang dibeli dengan harga satuan produk? Variabel: X: Quantity Y: UnitPrice
Goals 3: Mengidentifikasi produk terlaris Pertanyaan: Produk apa saja yang paling banyak terjual? Variabel: X: Description (nama produk) Y: Jumlah terjual (Quantity)
Goals 4: Identifikasi pelanggan teraktif Pertanyaan: Siapa saja pelanggan dengan jumlah pembelian terbanyak? Variabel: X: CustomerID Y: Jumlah item yang dibeli (Quantity)
Goals 5: Analisis distribusi transaksi berdasarkan negara Pertanyaan: Bagaimana distribusi jumlah item yang dibeli berdasarkan asal negara pelanggan? Variabel: X: Country Y: Quantity
library(readxl)
online_retail <- read_excel ("Online Retail.xlsx")
online_retail
## # A tibble: 541,909 × 8
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536365 85123A WHITE HANGING HEA… 6 2010-12-01 08:26:00 2.55
## 2 536365 71053 WHITE METAL LANTE… 6 2010-12-01 08:26:00 3.39
## 3 536365 84406B CREAM CUPID HEART… 8 2010-12-01 08:26:00 2.75
## 4 536365 84029G KNITTED UNION FLA… 6 2010-12-01 08:26:00 3.39
## 5 536365 84029E RED WOOLLY HOTTIE… 6 2010-12-01 08:26:00 3.39
## 6 536365 22752 SET 7 BABUSHKA NE… 2 2010-12-01 08:26:00 7.65
## 7 536365 21730 GLASS STAR FROSTE… 6 2010-12-01 08:26:00 4.25
## 8 536366 22633 HAND WARMER UNION… 6 2010-12-01 08:28:00 1.85
## 9 536366 22632 HAND WARMER RED P… 6 2010-12-01 08:28:00 1.85
## 10 536367 84879 ASSORTED COLOUR B… 32 2010-12-01 08:34:00 1.69
## # ℹ 541,899 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
#Cek apakah ada missing value
colSums(is.na(online_retail))
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## 0 0 1454 0 0 0
## CustomerID Country
## 135080 0
#Menghapus baris yang punya NA di kolom tertentu
library (dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
online_retail1 <- online_retail %>%
filter(if_all(c(Description, CustomerID), ~ !is.na(.)))
online_retail1
## # A tibble: 406,829 × 8
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536365 85123A WHITE HANGING HEA… 6 2010-12-01 08:26:00 2.55
## 2 536365 71053 WHITE METAL LANTE… 6 2010-12-01 08:26:00 3.39
## 3 536365 84406B CREAM CUPID HEART… 8 2010-12-01 08:26:00 2.75
## 4 536365 84029G KNITTED UNION FLA… 6 2010-12-01 08:26:00 3.39
## 5 536365 84029E RED WOOLLY HOTTIE… 6 2010-12-01 08:26:00 3.39
## 6 536365 22752 SET 7 BABUSHKA NE… 2 2010-12-01 08:26:00 7.65
## 7 536365 21730 GLASS STAR FROSTE… 6 2010-12-01 08:26:00 4.25
## 8 536366 22633 HAND WARMER UNION… 6 2010-12-01 08:28:00 1.85
## 9 536366 22632 HAND WARMER RED P… 6 2010-12-01 08:28:00 1.85
## 10 536367 84879 ASSORTED COLOUR B… 32 2010-12-01 08:34:00 1.69
## # ℹ 406,819 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
#Menghapus baris yang punya NA di semua kolom
online_retail1 <- online_retail %>%
filter(if_all(everything(), ~ !is.na(.)))
online_retail1
## # A tibble: 406,829 × 8
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536365 85123A WHITE HANGING HEA… 6 2010-12-01 08:26:00 2.55
## 2 536365 71053 WHITE METAL LANTE… 6 2010-12-01 08:26:00 3.39
## 3 536365 84406B CREAM CUPID HEART… 8 2010-12-01 08:26:00 2.75
## 4 536365 84029G KNITTED UNION FLA… 6 2010-12-01 08:26:00 3.39
## 5 536365 84029E RED WOOLLY HOTTIE… 6 2010-12-01 08:26:00 3.39
## 6 536365 22752 SET 7 BABUSHKA NE… 2 2010-12-01 08:26:00 7.65
## 7 536365 21730 GLASS STAR FROSTE… 6 2010-12-01 08:26:00 4.25
## 8 536366 22633 HAND WARMER UNION… 6 2010-12-01 08:28:00 1.85
## 9 536366 22632 HAND WARMER RED P… 6 2010-12-01 08:28:00 1.85
## 10 536367 84879 ASSORTED COLOUR B… 32 2010-12-01 08:34:00 1.69
## # ℹ 406,819 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
#Cek lagi apakah masih ada missing value
colSums(is.na(online_retail1))
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## 0 0 0 0 0 0
## CustomerID Country
## 0 0
#Cek duplikasi
online_retail2 <- online_retail1 %>%
filter(duplicated(.))
online_retail3 <- online_retail2 %>%
group_by(across(everything())) %>%
filter(n()>1)
online_retail3
## # A tibble: 678 × 8
## # Groups: InvoiceNo, StockCode, Description, Quantity, InvoiceDate,
## # UnitPrice, CustomerID, Country [290]
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536412 21448 12 DAISY PEGS IN … 2 2010-12-01 11:49:00 1.65
## 2 536412 21448 12 DAISY PEGS IN … 2 2010-12-01 11:49:00 1.65
## 3 536464 22866 HAND WARMER SCOTT… 1 2010-12-01 12:23:00 2.1
## 4 536464 22866 HAND WARMER SCOTT… 1 2010-12-01 12:23:00 2.1
## 5 536749 21415 CLAM SHELL SMALL 2 2010-12-02 13:49:00 2.1
## 6 536749 22174 PHOTO CUBE 2 2010-12-02 13:49:00 1.65
## 7 536749 21415 CLAM SHELL SMALL 2 2010-12-02 13:49:00 2.1
## 8 536749 22174 PHOTO CUBE 2 2010-12-02 13:49:00 1.65
## 9 536796 21967 PACK OF 12 SKULL … 1 2010-12-02 15:46:00 0.29
## 10 536796 21967 PACK OF 12 SKULL … 1 2010-12-02 15:46:00 0.29
## # ℹ 668 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
#Delete duplicate
online_retail4 <- online_retail3%>%
distinct()
online_retail4
## # A tibble: 290 × 8
## # Groups: InvoiceNo, StockCode, Description, Quantity, InvoiceDate,
## # UnitPrice, CustomerID, Country [290]
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536412 21448 12 DAISY PEGS IN … 2 2010-12-01 11:49:00 1.65
## 2 536464 22866 HAND WARMER SCOTT… 1 2010-12-01 12:23:00 2.1
## 3 536749 21415 CLAM SHELL SMALL 2 2010-12-02 13:49:00 2.1
## 4 536749 22174 PHOTO CUBE 2 2010-12-02 13:49:00 1.65
## 5 536796 21967 PACK OF 12 SKULL … 1 2010-12-02 15:46:00 0.29
## 6 536874 22866 HAND WARMER SCOTT… 1 2010-12-03 11:35:00 2.1
## 7 537042 21579 LOLITA DESIGN C… 1 2010-12-05 10:45:00 2.25
## 8 537051 22730 ALARM CLOCK BAKEL… 1 2010-12-05 11:12:00 3.75
## 9 537144 22086 PAPER CHAIN KIT 5… 1 2010-12-05 13:00:00 2.95
## 10 537144 21882 SKULLS TAPE 1 2010-12-05 13:00:00 0.65
## # ℹ 280 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
#Cek duplikasi lagi
duplicated (online_retail4)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE
online_retail5 <- online_retail4 %>%
mutate(price_class = if_else(UnitPrice > 3, "0", "1"))
#0=mahal, 1=murah
online_retail5
## # A tibble: 290 × 9
## # Groups: InvoiceNo, StockCode, Description, Quantity, InvoiceDate,
## # UnitPrice, CustomerID, Country [290]
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536412 21448 12 DAISY PEGS IN … 2 2010-12-01 11:49:00 1.65
## 2 536464 22866 HAND WARMER SCOTT… 1 2010-12-01 12:23:00 2.1
## 3 536749 21415 CLAM SHELL SMALL 2 2010-12-02 13:49:00 2.1
## 4 536749 22174 PHOTO CUBE 2 2010-12-02 13:49:00 1.65
## 5 536796 21967 PACK OF 12 SKULL … 1 2010-12-02 15:46:00 0.29
## 6 536874 22866 HAND WARMER SCOTT… 1 2010-12-03 11:35:00 2.1
## 7 537042 21579 LOLITA DESIGN C… 1 2010-12-05 10:45:00 2.25
## 8 537051 22730 ALARM CLOCK BAKEL… 1 2010-12-05 11:12:00 3.75
## 9 537144 22086 PAPER CHAIN KIT 5… 1 2010-12-05 13:00:00 2.95
## 10 537144 21882 SKULLS TAPE 1 2010-12-05 13:00:00 0.65
## # ℹ 280 more rows
## # ℹ 3 more variables: CustomerID <dbl>, Country <chr>, price_class <chr>
#Distribusi Harga Produk
library (ggplot2)
ggplot(online_retail5, aes(x = price_class)) +
geom_bar(fill = "steelblue") +
ggtitle("Distribusi Harga Produk (>3 vs <=3)") +
theme_minimal()
online_retail4 %>%
summarise(
min_UnitPrice = min(UnitPrice, na.rm = TRUE),
max_UnitPrice = max(UnitPrice, na.rm = TRUE),
mean_UnitPrice = mean(UnitPrice, na.rm = TRUE),
median_UnitPrice = median(UnitPrice, na.rm = TRUE),
sd_UnitPrice = sd(UnitPrice, na.rm = TRUE),
.groups = "drop"
)
## # A tibble: 290 × 13
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536412 21448 12 DAISY PEGS IN … 2 2010-12-01 11:49:00 1.65
## 2 536464 22866 HAND WARMER SCOTT… 1 2010-12-01 12:23:00 2.1
## 3 536749 21415 CLAM SHELL SMALL 2 2010-12-02 13:49:00 2.1
## 4 536749 22174 PHOTO CUBE 2 2010-12-02 13:49:00 1.65
## 5 536796 21967 PACK OF 12 SKULL … 1 2010-12-02 15:46:00 0.29
## 6 536874 22866 HAND WARMER SCOTT… 1 2010-12-03 11:35:00 2.1
## 7 537042 21579 LOLITA DESIGN C… 1 2010-12-05 10:45:00 2.25
## 8 537051 22730 ALARM CLOCK BAKEL… 1 2010-12-05 11:12:00 3.75
## 9 537144 21882 SKULLS TAPE 1 2010-12-05 13:00:00 0.65
## 10 537144 22086 PAPER CHAIN KIT 5… 1 2010-12-05 13:00:00 2.95
## # ℹ 280 more rows
## # ℹ 7 more variables: CustomerID <dbl>, Country <chr>, min_UnitPrice <dbl>,
## # max_UnitPrice <dbl>, mean_UnitPrice <dbl>, median_UnitPrice <dbl>,
## # sd_UnitPrice <dbl>
cor(online_retail4$Quantity, online_retail4$UnitPrice, use = "complete.obs")
## [1] -0.162108
ggplot(online_retail4, aes(x = Quantity, y = UnitPrice)) +
geom_point(color = "steelblue", size = 3) +
geom_smooth(method = "lm", se = FALSE, color = "darkred") +
labs(
title = "Scatter Plot: Quantity vs UnitPrice",
x = "Jumlah Produk Dibeli (Quantity)",
y = "Harga Satuan (UnitPrice)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#Perbandingan jumlah produk terjual per kategori
library(dplyr)
produk_terjual <- online_retail5 %>%
group_by(Description) %>%
summarise(Jumlah = sum(Quantity)) %>%
arrange(desc(Jumlah)) %>%
slice_head(n = 10) #mengambil 10 produk teratas
produk_terjual
## # A tibble: 10 × 2
## Description Jumlah
## <chr> <dbl>
## 1 Manual 48
## 2 FANCY FONTS BIRTHDAY WRAP 25
## 3 BLUE STONES ON WIRE FOR CANDLE 24
## 4 DISCO BALL CHRISTMAS DECORATION 24
## 5 HEART FILIGREE DOVE SMALL 24
## 6 SMALL DOLLY MIX DESIGN ORANGE BOWL 16
## 7 CLEAR DRAWER KNOB ACRYLIC EDWARDIAN 12
## 8 COOK WITH WINE METAL SIGN 12
## 9 HOT BATHS METAL SIGN 12
## 10 MOROCCAN TEA GLASS 12
library(ggplot2)
ggplot(produk_terjual, aes(x = reorder(Description, Jumlah), y = Jumlah)) +
geom_bar(stat = "identity", fill = "pink") +
coord_flip() +
labs(title = "Top 10 Produk Paling Banyak Terjual",
x = "Produk",
y = "Jumlah Terjual") +
theme_minimal()
#Perbandingan Jumlah Item Terjual per Pelanggan
pelanggan_teraktif <- online_retail5 %>%
group_by(CustomerID) %>%
summarise(Jumlah = sum(Quantity)) %>%
arrange(desc(Jumlah)) %>%
slice_head(n = 10)
pelanggan_teraktif
## # A tibble: 10 × 2
## CustomerID Jumlah
## <dbl> <dbl>
## 1 14607 48
## 2 14102 42
## 3 15919 38
## 4 12748 33
## 5 14525 27
## 6 14432 24
## 7 15856 24
## 8 17885 24
## 9 17084 13
## 10 16814 12
ggplot(pelanggan_teraktif, aes(x = as.factor(CustomerID), y = Jumlah, fill = as.factor(CustomerID))) +
geom_bar(stat = "identity") +
labs(title = "Top 10 Customer Berdasarkan Jumlah Item Terjual",
x = "CustomerID",
y = "Jumlah Item") +
theme_minimal()
#Histogram Jumlah Item Berdasarkan Negara
ggplot(online_retail5, aes(x = Quantity, fill = Country)) +
geom_histogram(position = "identity", bins = 30, alpha = 0.6) +
labs(title = "Distribusi Jumlah Item Berdasarkan Negara",
x = "Jumlah Item per Baris Transaksi",
y = "Frekuensi") +
theme_minimal()