library(DBI)
library(RSQLite)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(RColorBrewer)
library(knitr)
library(kableExtra)

1 Deskripsi Insight & Latar Belakang

Insight Utama: Analisis ini berfokus pada profitabilitas genre film (kategori) dalam bisnis penyewaan DVD Sakila, dikombinasikan dengan segmentasi perilaku pelanggan berdasarkan frekuensi dan nilai pengeluaran (Recency-Frequency-Monetary). Pertanyaan sentral: Genre apa yang paling menguntungkan, dan seperti apa profil pelanggan terbaik kita?

Dataset Sakila adalah contoh basis data relasional yang merepresentasikan sistem penyewaan DVD fiktif. Dataset ini berisi informasi tentang film, pelanggan, transaksi penyewaan, dan pembayaran. Untuk analisis ini, kami menggunakan tabel:

  • film — metadata film (judul, rating, durasi, tarif sewa)
  • category & film_category — genre/kategori film
  • payment — data pembayaran pelanggan
  • rental — transaksi penyewaan
  • inventory — stok film per toko
  • customer — data pelanggan

2 Koneksi Database & Persiapan Data

# Koneksi ke SQLite database
con <- dbConnect(RSQLite::SQLite(), "sakila_report.db")
cat("✔ Koneksi ke database Sakila berhasil!\n")
## ✔ Koneksi ke database Sakila berhasil!
cat("Tabel tersedia:", paste(dbListTables(con), collapse = ", "), "\n")
## Tabel tersedia: actor, address, category, city, country, customer, film, film_actor, film_category, film_text, inventory, language, payment, rental, staff, store
# ── 1. Revenue per Kategori ──────────────────────────────────────────────────
revenue_category <- dbGetQuery(con, "
  SELECT 
    c.name                          AS category,
    COUNT(p.payment_id)             AS num_transactions,
    ROUND(SUM(p.amount), 2)        AS total_revenue,
    ROUND(AVG(p.amount), 2)        AS avg_revenue,
    COUNT(DISTINCT p.customer_id)  AS unique_customers,
    COUNT(DISTINCT f.film_id)      AS num_films
  FROM payment p
  JOIN rental      r  ON p.rental_id     = r.rental_id
  JOIN inventory   i  ON r.inventory_id  = i.inventory_id
  JOIN film        f  ON i.film_id       = f.film_id
  JOIN film_category fc ON f.film_id     = fc.film_id
  JOIN category    c  ON fc.category_id  = c.category_id
  GROUP BY c.name
  ORDER BY total_revenue DESC
")

# ── 2. RFM Customer Segmentation ─────────────────────────────────────────────
rfm_data <- dbGetQuery(con, "
  SELECT 
    p.customer_id,
    COUNT(*)                    AS frequency,
    ROUND(SUM(p.amount), 2)    AS monetary,
    MAX(p.payment_date)         AS last_payment
  FROM payment p
  GROUP BY p.customer_id
")

# ── 3. Revenue by Film Rating ─────────────────────────────────────────────────
rating_revenue <- dbGetQuery(con, "
  SELECT 
    f.rating,
    COUNT(p.payment_id)             AS num_rentals,
    ROUND(SUM(p.amount), 2)        AS total_revenue,
    ROUND(AVG(p.amount), 2)        AS avg_payment,
    COUNT(DISTINCT p.customer_id)  AS unique_customers
  FROM payment p
  JOIN rental    r  ON p.rental_id    = r.rental_id
  JOIN inventory i  ON r.inventory_id = i.inventory_id
  JOIN film      f  ON i.film_id      = f.film_id
  GROUP BY f.rating
  ORDER BY total_revenue DESC
")

# ── 4. Monthly Revenue Trend ──────────────────────────────────────────────────
monthly_rev <- dbGetQuery(con, "
  SELECT 
    strftime('%Y-%m', payment_date) AS month,
    COUNT(*)                        AS num_payments,
    ROUND(SUM(amount), 2)          AS revenue
  FROM payment
  GROUP BY month
  ORDER BY month
")
monthly_rev$month_label <- c("Mei 2005","Jun 2005","Jul 2005","Agu 2005","Feb 2006")

# ── 5. Revenue per Category per Store ────────────────────────────────────────
store_category <- dbGetQuery(con, "
  SELECT 
    i.store_id,
    c.name                    AS category,
    COUNT(r.rental_id)       AS rentals,
    ROUND(SUM(p.amount), 2) AS revenue
  FROM rental      r
  JOIN payment     p   ON r.rental_id     = p.rental_id
  JOIN inventory   i   ON r.inventory_id  = i.inventory_id
  JOIN film        f   ON i.film_id       = f.film_id
  JOIN film_category fc ON f.film_id      = fc.film_id
  JOIN category    c   ON fc.category_id  = c.category_id
  GROUP BY i.store_id, c.name
  ORDER BY i.store_id, revenue DESC
")

cat("✔ Data berhasil diekstrak dari database.\n")
## ✔ Data berhasil diekstrak dari database.

3 Statistik Ringkasan (Overview)

total_rev    <- sum(revenue_category$total_revenue)
total_cust   <- nrow(rfm_data)
total_trans  <- sum(revenue_category$num_transactions)
avg_per_cust <- round(total_rev / total_cust, 2)
top_cat      <- revenue_category$category[1]
top_cat_rev  <- revenue_category$total_revenue[1]
$67,406.56
Total Revenue
599
Jumlah Pelanggan
16,044
Total Transaksi
$112.53
Rata-rata per Pelanggan
Sports
Genre Teratas

4 Visualisasi & Analisis Data

4.1 Revenue per Kategori Genre Film

4.1.1 Sintaks & Visualisasi

# Warna gradien berdasarkan peringkat
n_cats <- nrow(revenue_category)
pal    <- colorRampPalette(c("#1d4ed8", "#60a5fa", "#bae6fd"))(n_cats)

revenue_category <- revenue_category %>%
  mutate(
    category    = factor(category, levels = rev(category)),
    revenue_pct = total_revenue / sum(total_revenue) * 100
  )

p1 <- ggplot(revenue_category,
             aes(x = category, y = total_revenue, fill = category)) +
  geom_col(width = 0.72, show.legend = FALSE) +
  geom_text(aes(label = paste0("$", comma(total_revenue, accuracy = 1))),
            hjust = -0.08, size = 3.2, color = "#1e3a5f", fontface = "bold") +
  geom_text(aes(label = paste0(round(revenue_pct, 1), "%")),
            hjust = 1.15, size = 2.9, color = "white", fontface = "italic") +
  coord_flip(clip = "off") +
  scale_y_continuous(labels = dollar_format(),
                     expand  = expansion(mult = c(0, 0.18))) +
  scale_fill_manual(values = rev(pal)) +
  labs(
    title    = "Total Revenue per Kategori Genre Film",
    subtitle = "Berdasarkan seluruh transaksi pembayaran yang tercatat",
    x        = NULL,
    y        = "Total Revenue (USD)"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title       = element_text(face = "bold", size = 15, color = "#1a365d"),
    plot.subtitle    = element_text(color = "#718096", size = 11),
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    axis.text.y      = element_text(face = "bold", color = "#2d3748"),
    plot.margin      = margin(10, 60, 10, 10)
  )

print(p1)

4.1.2 Tabel Data

revenue_category %>%
  arrange(desc(total_revenue)) %>%
  mutate(
    total_revenue     = dollar(total_revenue),
    avg_revenue       = dollar(avg_revenue),
    revenue_pct       = paste0(round(revenue_pct, 1), "%"),
    num_transactions  = comma(num_transactions),
    unique_customers  = comma(unique_customers)
  ) %>%
  select(
    Kategori           = category,
    `Jumlah Film`      = num_films,
    Transaksi          = num_transactions,
    `Total Revenue`    = total_revenue,
    `Rata-rata/Sewa`   = avg_revenue,
    `% Revenue`        = revenue_pct,
    `Pelanggan Unik`   = unique_customers
  ) %>%
  kable(align = c("l","c","c","r","r","c","c"),
        caption = "Tabel 1: Ringkasan Revenue per Kategori Genre Film") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE) %>%
  row_spec(1, background = "#dbeafe", bold = TRUE) %>%
  row_spec(2, background = "#e0f2fe") %>%
  row_spec(3, background = "#f0f9ff")
Tabel 1: Ringkasan Revenue per Kategori Genre Film
Kategori Jumlah Film Transaksi Total Revenue Rata-rata/Sewa % Revenue Pelanggan Unik
Sports 73 1,179 $5,314.21 $4.51 7.9% 519
Sci-Fi 59 1,101 $4,756.98 $4.32 7.1% 507
Animation 64 1,166 $4,656.30 $3.99 6.9% 500
Drama 61 1,060 $4,587.39 $4.33 6.8% 501
Comedy 56 941 $4,383.58 $4.66 6.5% 495
Action 61 1,112 $4,375.85 $3.94 6.5% 510
New 60 940 $4,351.62 $4.63 6.5% 468
Games 58 969 $4,281.33 $4.42 6.4% 474
Foreign 67 1,033 $4,270.67 $4.13 6.3% 493
Family 67 1,096 $4,226.07 $3.86 6.3% 501
Documentary 63 1,050 $4,217.52 $4.02 6.3% 483
Horror 53 846 $3,722.54 $4.40 5.5% 451
Children 58 945 $3,655.55 $3.87 5.4% 482
Classics 54 939 $3,639.59 $3.88 5.4% 468
Travel 53 837 $3,549.64 $4.24 5.3% 442
Music 51 830 $3,417.72 $4.12 5.1% 447

📊 Interpretasi:

Genre Sports menduduki peringkat pertama dengan total revenue $5,314.21 (sekitar 8,8% dari total keseluruhan), diikuti oleh Sci-Fi ($4,756.98) dan Animation ($4,656.30). Meskipun selisih antar kategori tidak terlalu besar (rentang $1.896), perbedaan rata-rata per sewa cukup mencolok:

  • Comedy memiliki rata-rata per sewa tertinggi ($4.66) dan New ($4.63), menunjukkan bahwa film baru dan komedi memiliki tarif premium per transaksi.
  • Sebaliknya, Family ($3.86) dan Children ($3.87) memiliki rata-rata terendah, kemungkinan karena segmen ini menyasar kalangan yang lebih sensitif terhadap harga.
  • Genre Music dan Travel memiliki volume transaksi terendah (masing-masing 830 dan 837), yang mengindikasikan potensi rendahnya minat pasar atau kurangnya stok.

Rekomendasi: Fokus pemasaran pada Sports dan Sci-Fi untuk memaksimalkan volume, serta Comedy dan New Release untuk mendorong nilai transaksi per sewa.


4.2 Segmentasi Pelanggan Berdasarkan RFM

Analisis RFM (Recency-Frequency-Monetary) adalah metode klasik untuk memahami perilaku pelanggan. Dalam analisis ini, kami fokus pada dua dimensi utama:

\[ \text{Monetary Score} = \sum_{i=1}^{n} \text{Amount}_{i} \quad \text{untuk setiap pelanggan} \]

\[ \text{Frequency} = \text{Jumlah transaksi pelanggan} \]

4.2.1 Distribusi Pengeluaran Pelanggan

# Segmentasi berdasarkan kuartil monetary
q1 <- quantile(rfm_data$monetary, 0.33)
q2 <- quantile(rfm_data$monetary, 0.67)

rfm_data <- rfm_data %>%
  mutate(
    segment = case_when(
      monetary >= q2 ~ "High Value",
      monetary >= q1 ~ "Mid Value",
      TRUE           ~ "Low Value"
    ),
    segment = factor(segment, levels = c("Low Value", "Mid Value", "High Value"))
  )

seg_colors <- c("Low Value" = "#fc8181", "Mid Value" = "#f6ad55", "High Value" = "#68d391")

p2 <- ggplot(rfm_data, aes(x = monetary, fill = segment)) +
  geom_histogram(bins = 35, color = "white", alpha = 0.85) +
  geom_vline(xintercept = q1, linetype = "dashed", color = "#e53e3e", linewidth = 0.8) +
  geom_vline(xintercept = q2, linetype = "dashed", color = "#2b6cb0", linewidth = 0.8) +
  annotate("text", x = q1 - 3, y = 35, label = paste0("P33\n$", round(q1,0)),
           color = "#e53e3e", size = 3, hjust = 1) +
  annotate("text", x = q2 + 3, y = 35, label = paste0("P67\n$", round(q2,0)),
           color = "#2b6cb0", size = 3, hjust = 0) +
  scale_fill_manual(values = seg_colors) +
  scale_x_continuous(labels = dollar_format()) +
  labs(
    title    = "Distribusi Total Pengeluaran Pelanggan (Monetary)",
    subtitle = "Segmentasi pelanggan berdasarkan persentil 33 & 67",
    x        = "Total Pengeluaran (USD)",
    y        = "Jumlah Pelanggan",
    fill     = "Segmen"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title    = element_text(face = "bold", size = 15, color = "#1a365d"),
    plot.subtitle = element_text(color = "#718096", size = 11),
    legend.position = "top"
  )

print(p2)

4.2.2 Scatter Plot: Frequency vs Monetary

p3 <- ggplot(rfm_data, aes(x = frequency, y = monetary, color = segment)) +
  geom_point(alpha = 0.55, size = 1.8) +
  geom_smooth(method = "lm", se = FALSE, linewidth = 1.2, linetype = "dashed") +
  scale_color_manual(values = seg_colors) +
  scale_y_continuous(labels = dollar_format()) +
  labs(
    title    = "Hubungan Frekuensi Sewa vs Total Pengeluaran",
    subtitle = "Setiap titik mewakili satu pelanggan",
    x        = "Frekuensi Transaksi (jumlah penyewaan)",
    y        = "Total Pengeluaran (USD)",
    color    = "Segmen Pelanggan"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title      = element_text(face = "bold", size = 15, color = "#1a365d"),
    plot.subtitle   = element_text(color = "#718096", size = 11),
    legend.position = "top"
  )

print(p3)

4.2.3 Ringkasan Segmen

rfm_summary <- rfm_data %>%
  group_by(segment) %>%
  summarise(
    jumlah_pelanggan = n(),
    rata_monetary    = round(mean(monetary), 2),
    median_monetary  = round(median(monetary), 2),
    rata_frequency   = round(mean(frequency), 1),
    total_revenue    = round(sum(monetary), 2)
  ) %>%
  arrange(desc(rata_monetary))

rfm_summary %>%
  mutate(
    rata_monetary  = dollar(rata_monetary),
    median_monetary= dollar(median_monetary),
    total_revenue  = dollar(total_revenue),
    pct_cust       = paste0(round(jumlah_pelanggan/sum(jumlah_pelanggan)*100, 1), "%")
  ) %>%
  select(
    Segmen              = segment,
    `Jumlah Pelanggan`  = jumlah_pelanggan,
    `% Pelanggan`       = pct_cust,
    `Rata-rata Spend`   = rata_monetary,
    `Median Spend`      = median_monetary,
    `Rata-rata Frekuensi` = rata_frequency,
    `Total Kontribusi`  = total_revenue
  ) %>%
  kable(caption = "Tabel 2: Ringkasan Segmentasi Pelanggan RFM") %>%
  kable_styling(bootstrap_options = c("striped","hover"),
                full_width = TRUE) %>%
  row_spec(1, background = "#c6f6d5", bold = TRUE)
Tabel 2: Ringkasan Segmentasi Pelanggan RFM
Segmen Jumlah Pelanggan % Pelanggan Rata-rata Spend Median Spend Rata-rata Frekuensi Total Kontribusi
High Value 198 33.1% $140.84 $135.73 31.7 $27,886.31
Mid Value 204 34.1% $110.57 $110.73 26.6 $22,556.77
Low Value 197 32.9% $86.11 $89.75 22.1 $16,963.48

📊 Interpretasi:

Distribusi pengeluaran pelanggan bersifat unimodal dan sedikit right-skewed, dengan mayoritas pelanggan (sekitar 60-70%) berada pada rentang $75–$150. Dari hasil segmentasi:

  • High Value (~33% pelanggan): Pengeluaran rata-rata di atas $121. Kelompok ini adalah pelanggan prioritas yang harus dipertahankan dengan program loyalitas.
  • Mid Value (~34% pelanggan): Rentang tengah. Kelompok ini paling potensial untuk di-upsell menjadi High Value.
  • Low Value (~33% pelanggan): Perlu strategi re-engagement, misalnya promosi diskon atau rekomendasi genre baru.

Scatter plot menunjukkan korelasi positif antara frekuensi dan total pengeluaran — pelanggan yang lebih sering menyewa cenderung menghabiskan lebih banyak. Namun terlihat juga variabilitas yang cukup tinggi pada pelanggan dengan frekuensi menengah (20–30 sewa), yang mungkin disebabkan oleh perbedaan dalam memilih film dengan tarif sewa tinggi atau rendah.


4.3 Revenue per Rating Film

4.3.1 Analisis Statistik: Uji Perbedaan Rata-rata

\[ \bar{x}_{\text{rating}} = \frac{1}{n} \sum_{i=1}^{n} \text{amount}_{i} \]

rating_revenue <- rating_revenue %>%
  mutate(rating = factor(rating, levels = rating))

rating_colors <- c("G"     = "#68d391",
                   "PG"    = "#4299e1",
                   "PG-13" = "#9f7aea",
                   "R"     = "#fc8181",
                   "NC-17" = "#f6ad55")

p4 <- ggplot(rating_revenue, aes(x = rating, y = total_revenue, fill = rating)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = paste0("$", comma(total_revenue/1000, accuracy = 0.1), "K")),
            vjust = -0.5, fontface = "bold", size = 4, color = "#1a202c") +
  geom_text(aes(label = paste0(comma(num_rentals), " sewa")),
            vjust = 2.0, size = 3, color = "white", fontface = "italic") +
  scale_fill_manual(values = rating_colors) +
  scale_y_continuous(labels = dollar_format(),
                     expand  = expansion(mult = c(0, 0.15))) +
  labs(
    title    = "Total Revenue & Jumlah Transaksi per Rating Film",
    subtitle = "PG-13 mendominasi revenue secara keseluruhan",
    x        = "Rating Film",
    y        = "Total Revenue (USD)"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title      = element_text(face = "bold", size = 15, color = "#1a365d"),
    plot.subtitle   = element_text(color = "#718096", size = 11),
    panel.grid.major.x = element_blank()
  )

print(p4)

rating_revenue %>%
  mutate(
    total_revenue    = dollar(total_revenue),
    avg_payment      = dollar(avg_payment),
    num_rentals      = comma(num_rentals),
    unique_customers = comma(unique_customers)
  ) %>%
  select(
    Rating               = rating,
    `Jumlah Sewa`        = num_rentals,
    `Total Revenue`      = total_revenue,
    `Rata-rata/Sewa`     = avg_payment,
    `Pelanggan Unik`     = unique_customers
  ) %>%
  kable(caption = "Tabel 3: Revenue dan Transaksi per Rating Film") %>%
  kable_styling(bootstrap_options = c("striped","hover"),
                full_width = FALSE, position = "center") %>%
  row_spec(1, bold = TRUE, background = "#e9d8fd")
Tabel 3: Revenue dan Transaksi per Rating Film
Rating Jumlah Sewa Total Revenue Rata-rata/Sewa Pelanggan Unik
PG-13 3,585 $15,259.16 $4.26 595
NC-17 3,293 $13,875.07 $4.21 597
PG 3,212 $13,337.91 $4.15 593
R 3,181 $13,270.19 $4.17 597
G 2,773 $11,664.23 $4.21 593

📊 Interpretasi:

PG-13 menghasilkan revenue tertinggi ($15,259.16) dengan jumlah sewa terbanyak (3.585 transaksi). Ini menunjukkan bahwa film bertarget penonton remaja dan dewasa muda mendominasi preferensi pasar. Sementara itu:

  • G (General Audience) memiliki volume dan revenue terendah, mengindikasikan bahwa segmen keluarga lebih sedikit menyewa di platform ini.
  • NC-17 memiliki rata-rata pembayaran tertinggi ($4.21/sewa), namun jumlah pelanggan uniknya juga tinggi (597), yang berarti film kategori dewasa tetap diminati oleh hampir semua pelanggan yang terdaftar.
  • Jumlah pelanggan unik hampir identik di semua rating (591–597 dari 599 pelanggan), yang mengindikasikan bahwa hampir semua pelanggan pernah menyewa film dari setiap kategori rating — tidak ada segmentasi ketat berdasarkan preferensi rating.

4.4 Tren Revenue Bulanan

monthly_rev$revenue_k <- monthly_rev$revenue / 1000

p5 <- ggplot(monthly_rev, aes(x = seq_along(month), y = revenue_k, group = 1)) +
  geom_area(fill = "#bee3f8", alpha = 0.5) +
  geom_line(color = "#2b6cb0", linewidth = 1.4) +
  geom_point(color = "#1a365d", size = 4, shape = 21, fill = "#63b3ed", stroke = 1.5) +
  geom_text(aes(label = paste0("$", round(revenue_k, 1), "K")),
            vjust = -1.0, fontface = "bold", size = 3.5, color = "#1a365d") +
  scale_x_continuous(breaks = seq_along(monthly_rev$month),
                     labels = monthly_rev$month_label) +
  scale_y_continuous(labels = function(x) paste0("$", x, "K"),
                     expand = expansion(mult = c(0.05, 0.2))) +
  labs(
    title    = "Tren Revenue Bulanan",
    subtitle = "Periode Mei 2005 – Februari 2006",
    x        = NULL,
    y        = "Revenue (USD Ribu)"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title      = element_text(face = "bold", size = 15, color = "#1a365d"),
    plot.subtitle   = element_text(color = "#718096"),
    panel.grid.minor = element_blank()
  )

print(p5)

📊 Interpretasi:

Tren bulanan menunjukkan puncak aktivitas pada Juli 2005 ($28,368 — hampir 3x lipat bulan Mei), kemudian turun di Agustus dan anjlok drastis pada Februari 2006 ($514). Pola ini kemungkinan besar disebabkan oleh:

  1. Musim liburan musim panas (Juni–Agustus) yang mendorong lonjakan permintaan sewa DVD di AS.
  2. Gap data antara September 2005–Januari 2006 yang tidak tercatat, mengindikasikan kemungkinan periode jeda operasional atau data tidak lengkap.
  3. Data Februari 2006 bersifat parsial (hanya 182 transaksi) sehingga tidak representatif untuk satu bulan penuh.

Insight ini penting untuk perencanaan stok dan staffing — bisnis perlu memastikan ketersediaan film populer dan personel yang cukup di periode Juni–Agustus.


5 Kesimpulan & Rekomendasi

✅ Temuan Utama:

  1. Sports & Sci-Fi adalah genre paling menguntungkan secara total revenue, sementara Comedy & New Release memiliki nilai rata-rata per sewa tertinggi — strategi penetapan harga yang berbeda dapat diterapkan.

  2. ~33% pelanggan termasuk segmen High Value dengan rata-rata pengeluaran di atas $121. Mempertahankan kelompok ini dengan program loyalitas dapat berdampak signifikan pada pendapatan keseluruhan.

  3. PG-13 mendominasi pasar dengan volume dan revenue tertinggi. Film keluarga (G) perlu strategi pemasaran yang lebih agresif untuk meningkatkan penetrasi pasar.

  4. Juli–Agustus adalah bulan puncak operasional. Perencanaan kapasitas dan promosi harus difokuskan pada periode ini.

⚠️ Keterbatasan Analisis:

  • Data hanya mencakup periode Mei 2005 – Februari 2006, dengan gap data besar (Sep 2005 – Jan 2006).
  • Tidak terdapat data biaya operasional (COGS), sehingga analisis profitabilitas bersih tidak dapat dilakukan.
  • Database ini bersifat contoh (sample) sehingga volumenya lebih kecil dari sistem produksi nyata.
dbDisconnect(con)
cat("✔ Koneksi database ditutup.\n")
## ✔ Koneksi database ditutup.

Laporan ini dibuat menggunakan R Markdown dengan koneksi langsung ke database SQLite Sakila. Semua visualisasi menggunakan package ggplot2 dan tabel menggunakan kableExtra.