# Load library yang dibutuhkan
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'tibble' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'stringr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dsbox)
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
library(dplyr)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.3
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

  1. Eksplorasi Data Awal
     - Tampilkan informasi umum dataset (jumlah data, kolom, tipe data).
     - Lakukan pembersihan data sederhana jika diperlukan (null values, duplikat, dll).
# LEGO sales dataset provided by the dsbox package
data(lego_sales)
lego_data <- lego_sales

# Structural overview: number of rows and columns, plus each column's type
glimpse(lego_data)
## Rows: 620
## Columns: 14
## $ first_name   <chr> "Kimberly", "Neel", "Neel", "Chelsea", "Chelsea", "Chelse…
## $ last_name    <chr> "Beckstead", "Garvin", "Garvin", "Bouchard", "Bouchard", …
## $ age          <dbl> 24, 35, 35, 41, 41, 41, 19, 19, 37, 37, 19, 19, 20, 36, 3…
## $ phone_number <chr> "216-555-2549", "819-555-3189", "819-555-3189", NA, NA, N…
## $ set_id       <dbl> 24701, 25626, 24665, 24695, 25626, 24721, 24797, 24701, 2…
## $ number       <chr> "76062", "70595", "21031", "31048", "70595", "10831", "75…
## $ theme        <chr> "DC Comics Super Heroes", "Ninjago", "Architecture", "Cre…
## $ subtheme     <chr> "Mighty Micros", "Rise of the Villains", NA, NA, "Rise of…
## $ year         <dbl> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 201…
## $ name         <chr> "Robin vs. Bane", "Ultra Stealth Raider", "Burj Khalifa",…
## $ pieces       <dbl> 77, 1093, 333, 368, 1093, 19, 233, 77, 108, NA, 13, 15, 6…
## $ us_price     <dbl> 9.99, 119.99, 39.99, 29.99, 119.99, 9.99, 24.99, 9.99, 9.…
## $ image_url    <chr> "http://images.brickset.com/sets/images/76062-1.jpg", "ht…
## $ quantity     <dbl> 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, …
# Preview the data. Printing the tibble shows its first 10 rows together with
# its true dimensions, without hard-coding the current row count.
lego_data
## # A tibble: 620 × 14
##    first_name last_name      age phone_number set_id number theme subtheme  year
##    <chr>      <chr>        <dbl> <chr>         <dbl> <chr>  <chr> <chr>    <dbl>
##  1 Kimberly   Beckstead       24 216-555-2549  24701 76062  DC C… Mighty …  2018
##  2 Neel       Garvin          35 819-555-3189  25626 70595  Ninj… Rise of…  2018
##  3 Neel       Garvin          35 819-555-3189  24665 21031  Arch… <NA>      2018
##  4 Chelsea    Bouchard        41 <NA>          24695 31048  Crea… <NA>      2018
##  5 Chelsea    Bouchard        41 <NA>          25626 70595  Ninj… Rise of…  2018
##  6 Chelsea    Bouchard        41 <NA>          24721 10831  Duplo <NA>      2018
##  7 Bryanna    Welsh           19 <NA>          24797 75138  Star… Episode…  2018
##  8 Bryanna    Welsh           19 <NA>          24701 76062  DC C… Mighty …  2018
##  9 Caleb      Garcia-Wide…    37 907-555-9236  24730 41115  Frie… <NA>      2018
## 10 Caleb      Garcia-Wide…    37 907-555-9236  25611 21127  Mine… Minifig…  2018
## # ℹ 610 more rows
## # ℹ 5 more variables: name <chr>, pieces <dbl>, us_price <dbl>,
## #   image_url <chr>, quantity <dbl>
# Total number of missing cells across all columns
lego_data %>%
  is.na() %>%
  sum()
## [1] 392
# Remove rows that are exact duplicates of an earlier row
lego_data <- lego_data %>%
  distinct()

# Column names after de-duplication
names(lego_data)
##  [1] "first_name"   "last_name"    "age"          "phone_number" "set_id"      
##  [6] "number"       "theme"        "subtheme"     "year"         "name"        
## [11] "pieces"       "us_price"     "image_url"    "quantity"
# Keep only the numeric columns and drop every row that holds a non-finite
# value. is.finite() is FALSE for NA, NaN, Inf and -Inf, so infinite values are
# removed as well as missing ones (na.omit() alone would keep Inf / -Inf).
clean_data <- lego_data %>%
  select(where(is.numeric)) %>%
  filter(if_all(everything(), is.finite))
  2. Visualisasi Wajib
     Buat minimal 5 visualisasi dari kategori berikut:
     - Tren Penjualan LEGO per Tahun
     - 10 customer dengan jumlah transaksi terbanyak
     - Sebaran Jumlah Pieces dan Harga
     - Komposisi Penjualan Berdasarkan Usia
     - Heatmap Korelasi Antar Variabel Numerik
## Top 10 LEGO customers by total sales value
# dplyr and ggplot2 are already attached at the top of the script, so they are
# not attached again here.

# Add the value of each order line (price x quantity) and a single customer
# identifier built from the first and last name.
lego_data <- lego_data %>%
  mutate(
    sales = us_price * quantity,
    customer = paste(first_name, last_name)
  )

# Total sales per customer (missing sales ignored), keeping the ten largest
top_customers <- lego_data %>%
  group_by(customer) %>%
  summarise(Total_Sales = sum(sales, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(Total_Sales)) %>%
  slice_head(n = 10)

# Horizontal bar chart; reorder() sorts the bars by total sales
ggplot(top_customers, aes(x = reorder(customer, Total_Sales), y = Total_Sales)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "10 Customer LEGO Terpopuler Berdasarkan Penjualan",
       x = "Customer",
       y = "Total Penjualan (USD)") +
  theme_minimal()

## Top 10 customers by transaction volume
# Sum the purchased quantity for every first-name / last-name pair and keep the
# ten largest totals.
# NOTE(review): this totals units bought (`quantity`), not the number of
# purchase rows - confirm that this is the intended meaning of "transaksi".
top_customers_trans <- lego_data %>%
  group_by(first_name, last_name) %>%
  summarise(
    total_transactions = sum(quantity, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_transactions)) %>%
  slice_head(n = 10)

# Horizontal bar chart; the full name is assembled on the fly for the axis
ggplot(top_customers_trans) +
  geom_col(
    aes(
      x = reorder(paste(first_name, last_name), total_transactions),
      y = total_transactions
    ),
    fill = "darkorange"
  ) +
  coord_flip() +
  labs(
    title = "10 Customer dengan Jumlah Transaksi Terbanyak",
    x = "Nama Pelanggan",
    y = "Total Transaksi (Quantity LEGO)"
  ) +
  theme_minimal(base_size = 13)

## Scatter plot: number of pieces versus price
# One point per order line, overlaid with a red linear fit drawn without its
# confidence band. Rows missing `pieces` or `us_price` are dropped by ggplot2
# with a warning.
ggplot(data = lego_data, mapping = aes(x = pieces, y = us_price)) +
  geom_point(size = 3, alpha = 0.7, colour = "steelblue") +
  geom_smooth(method = "lm", colour = "red", se = FALSE) +
  ggtitle("Sebaran Jumlah Pieces vs Harga LEGO") +
  xlab("Jumlah Pieces") +
  ylab("Harga (USD)") +
  theme_minimal(base_size = 13)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 69 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 69 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Sales composition by age group
# Add the value of each order line and assign every customer to an age bracket.
# case_when() evaluates its conditions in order, so each upper bound implies
# the previous lower bound; a missing age falls through to "Unknown".
# age_group is stored as a factor with explicit levels so the brackets are
# always plotted in ascending age order, independent of the locale's sorting.
lego_data <- lego_data %>%
  mutate(
    sales = us_price * quantity,
    age_group = factor(
      case_when(
        age < 18 ~ "<18",
        age < 30 ~ "18-29",
        age < 45 ~ "30-44",
        age < 60 ~ "45-59",
        age >= 60 ~ "60+",
        TRUE ~ "Unknown"
      ),
      levels = c("<18", "18-29", "30-44", "45-59", "60+", "Unknown")
    )
  )

# Total sales per age bracket (missing sales ignored)
sales_by_age <- lego_data %>%
  group_by(age_group) %>%
  summarise(Total_Sales = sum(sales, na.rm = TRUE), .groups = "drop")

# Bar chart with a dollar-formatted label above each bar. The fill colours are
# named so that every bracket, including "Unknown", always has a colour and
# keeps the same colour even when another bracket is absent from the data.
ggplot(sales_by_age, aes(x = age_group, y = Total_Sales, fill = age_group)) +
  geom_col() +
  geom_text(aes(label = scales::dollar(Total_Sales)), vjust = -0.3, size = 5, color = "black") +
  scale_fill_manual(values = c(
    "<18" = "skyblue",
    "18-29" = "lightgreen",
    "30-44" = "lightcoral",
    "45-59" = "lightpink",
    "60+" = "lightyellow",
    "Unknown" = "grey70"
  )) +
  labs(title = "Komposisi Penjualan Berdasarkan Kelompok Usia",
       x = "Kelompok Usia",
       y = "Total Penjualan (USD)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")

## Correlation heatmap of the numeric variables
# Columns whose pairwise correlations are shown. `sales` is not part of this
# analysis, so it is not recomputed here.
numeric_data <- lego_data %>%
  select(pieces, age, us_price, quantity)

# Correlation matrix (cor()'s default method) computed on the rows that are
# complete in all four columns, then reshaped to long format
# (Var1, Var2, value) for plotting.
corr_matrix <- cor(numeric_data, use = "complete.obs")
melted_corr <- melt(corr_matrix)

# Tile plot with the rounded coefficient printed in each cell. The colour scale
# is fixed to the full [-1, 1] range and centred on 0; `limits` is spelled out
# in full instead of relying on partial argument matching.
ggplot(melted_corr, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "green", mid = "white",
                       midpoint = 0, limits = c(-1, 1), space = "Lab",
                       name = "Korelasi") +
  geom_text(aes(label = round(value, 2)), color = "black", size = 4) +
  theme_minimal() +
  labs(title = "Heatmap Korelasi Antar Variabel Numerik (Quantity, US Price, Pieces, Age)",
       x = NULL, y = NULL)

  3. Insight dan Narasi
  1. 10 Customer LEGO Terpopuler Berdasarkan Penjualan: Visualisasi menunjukkan bahwa sebagian besar penjualan berasal dari segelintir customer yang melakukan pembelian dalam jumlah besar. Hal ini menunjukkan adanya pelanggan prioritas, seperti kolektor atau reseller. Insight: Pendapatan sangat dipengaruhi oleh customer dengan volume pembelian tinggi — penting untuk membina hubungan baik dengan mereka.

  2. 10 Customer dengan Jumlah Transaksi Terbanyak: Dari analisis jumlah transaksi, terlihat bahwa 10 pelanggan teratas paling sering melakukan pembelian. Meskipun nilai penjualannya tidak selalu tertinggi, frekuensi pembelian mereka menjadikan mereka penting dalam menjaga stabilitas transaksi. Insight: Pelanggan yang sering bertransaksi adalah target penting untuk program loyalitas dan promosi.

  3. Sebaran Jumlah Pieces dan Harga: Visualisasi ini menunjukkan hubungan antara jumlah pieces dalam set LEGO dan harganya dalam satuan USD. Dari grafik terlihat adanya pola hubungan linier positif, yang mengindikasikan bahwa semakin banyak pieces yang dimiliki oleh sebuah set LEGO, maka harganya pun cenderung lebih tinggi. Ini tercermin dari garis regresi merah yang naik dari kiri bawah ke kanan atas. Namun, terdapat beberapa outlier yang menyimpang dari tren umum ini, kemungkinan karena faktor lain seperti nilai koleksi, tema eksklusif, atau edisi terbatas dari set tersebut. Insight: Terlihat bahwa semakin banyak jumlah pieces LEGO, maka harganya cenderung meningkat. Ini ditunjukkan oleh arah naik dari garis merah regresi.

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.