library(DBI)
library(RSQLite)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(RColorBrewer)
library(knitr)
library(kableExtra)Insight Utama: Analisis ini berfokus pada profitabilitas genre film (kategori) dalam bisnis penyewaan DVD Sakila, dikombinasikan dengan segmentasi perilaku pelanggan berdasarkan frekuensi dan nilai pengeluaran (Recency-Frequency-Monetary). Pertanyaan sentral: Genre apa yang paling menguntungkan, dan seperti apa profil pelanggan terbaik kita?
Dataset Sakila adalah contoh basis data relasional yang merepresentasikan sistem penyewaan DVD fiktif. Dataset ini berisi informasi tentang film, pelanggan, transaksi penyewaan, dan pembayaran. Untuk analisis ini, kami menggunakan tabel:
film — metadata film (judul, rating,
durasi, tarif sewa)category &
film_category — genre/kategori filmpayment — data pembayaran
pelangganrental — transaksi penyewaaninventory — stok film per tokocustomer — data pelanggan# Koneksi ke SQLite database
con <- dbConnect(RSQLite::SQLite(), "sakila_report.db")
cat("✔ Koneksi ke database Sakila berhasil!\n")## ✔ Koneksi ke database Sakila berhasil!
## Tabel tersedia: actor, address, category, city, country, customer, film, film_actor, film_category, film_text, inventory, language, payment, rental, staff, store
# ── 1. Revenue per Kategori ──────────────────────────────────────────────────
revenue_category <- dbGetQuery(con, "
SELECT
c.name AS category,
COUNT(p.payment_id) AS num_transactions,
ROUND(SUM(p.amount), 2) AS total_revenue,
ROUND(AVG(p.amount), 2) AS avg_revenue,
COUNT(DISTINCT p.customer_id) AS unique_customers,
COUNT(DISTINCT f.film_id) AS num_films
FROM payment p
JOIN rental r ON p.rental_id = r.rental_id
JOIN inventory i ON r.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
JOIN film_category fc ON f.film_id = fc.film_id
JOIN category c ON fc.category_id = c.category_id
GROUP BY c.name
ORDER BY total_revenue DESC
")
# ── 2. RFM Customer Segmentation ─────────────────────────────────────────────
rfm_data <- dbGetQuery(con, "
SELECT
p.customer_id,
COUNT(*) AS frequency,
ROUND(SUM(p.amount), 2) AS monetary,
MAX(p.payment_date) AS last_payment
FROM payment p
GROUP BY p.customer_id
")
# ── 3. Revenue by Film Rating ─────────────────────────────────────────────────
rating_revenue <- dbGetQuery(con, "
SELECT
f.rating,
COUNT(p.payment_id) AS num_rentals,
ROUND(SUM(p.amount), 2) AS total_revenue,
ROUND(AVG(p.amount), 2) AS avg_payment,
COUNT(DISTINCT p.customer_id) AS unique_customers
FROM payment p
JOIN rental r ON p.rental_id = r.rental_id
JOIN inventory i ON r.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
GROUP BY f.rating
ORDER BY total_revenue DESC
")
# ── 4. Monthly Revenue Trend ──────────────────────────────────────────────────
monthly_rev <- dbGetQuery(con, "
SELECT
strftime('%Y-%m', payment_date) AS month,
COUNT(*) AS num_payments,
ROUND(SUM(amount), 2) AS revenue
FROM payment
GROUP BY month
ORDER BY month
")
monthly_rev$month_label <- c("Mei 2005","Jun 2005","Jul 2005","Agu 2005","Feb 2006")
# ── 5. Revenue per Category per Store ────────────────────────────────────────
store_category <- dbGetQuery(con, "
SELECT
i.store_id,
c.name AS category,
COUNT(r.rental_id) AS rentals,
ROUND(SUM(p.amount), 2) AS revenue
FROM rental r
JOIN payment p ON r.rental_id = p.rental_id
JOIN inventory i ON r.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
JOIN film_category fc ON f.film_id = fc.film_id
JOIN category c ON fc.category_id = c.category_id
GROUP BY i.store_id, c.name
ORDER BY i.store_id, revenue DESC
")
cat("✔ Data berhasil diekstrak dari database.\n")## ✔ Data berhasil diekstrak dari database.
total_rev <- sum(revenue_category$total_revenue)
total_cust <- nrow(rfm_data)
total_trans <- sum(revenue_category$num_transactions)
avg_per_cust <- round(total_rev / total_cust, 2)
top_cat <- revenue_category$category[1]
top_cat_rev <- revenue_category$total_revenue[1]# Warna gradien berdasarkan peringkat
n_cats <- nrow(revenue_category)
pal <- colorRampPalette(c("#1d4ed8", "#60a5fa", "#bae6fd"))(n_cats)
revenue_category <- revenue_category %>%
mutate(
category = factor(category, levels = rev(category)),
revenue_pct = total_revenue / sum(total_revenue) * 100
)
p1 <- ggplot(revenue_category,
aes(x = category, y = total_revenue, fill = category)) +
geom_col(width = 0.72, show.legend = FALSE) +
geom_text(aes(label = paste0("$", comma(total_revenue, accuracy = 1))),
hjust = -0.08, size = 3.2, color = "#1e3a5f", fontface = "bold") +
geom_text(aes(label = paste0(round(revenue_pct, 1), "%")),
hjust = 1.15, size = 2.9, color = "white", fontface = "italic") +
coord_flip(clip = "off") +
scale_y_continuous(labels = dollar_format(),
expand = expansion(mult = c(0, 0.18))) +
scale_fill_manual(values = rev(pal)) +
labs(
title = "Total Revenue per Kategori Genre Film",
subtitle = "Berdasarkan seluruh transaksi pembayaran yang tercatat",
x = NULL,
y = "Total Revenue (USD)"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 15, color = "#1a365d"),
plot.subtitle = element_text(color = "#718096", size = 11),
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
axis.text.y = element_text(face = "bold", color = "#2d3748"),
plot.margin = margin(10, 60, 10, 10)
)
print(p1)revenue_category %>%
arrange(desc(total_revenue)) %>%
mutate(
total_revenue = dollar(total_revenue),
avg_revenue = dollar(avg_revenue),
revenue_pct = paste0(round(revenue_pct, 1), "%"),
num_transactions = comma(num_transactions),
unique_customers = comma(unique_customers)
) %>%
select(
Kategori = category,
`Jumlah Film` = num_films,
Transaksi = num_transactions,
`Total Revenue` = total_revenue,
`Rata-rata/Sewa` = avg_revenue,
`% Revenue` = revenue_pct,
`Pelanggan Unik` = unique_customers
) %>%
kable(align = c("l","c","c","r","r","c","c"),
caption = "Tabel 1: Ringkasan Revenue per Kategori Genre Film") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = TRUE) %>%
row_spec(1, background = "#dbeafe", bold = TRUE) %>%
row_spec(2, background = "#e0f2fe") %>%
row_spec(3, background = "#f0f9ff")| Kategori | Jumlah Film | Transaksi | Total Revenue | Rata-rata/Sewa | % Revenue | Pelanggan Unik |
|---|---|---|---|---|---|---|
| Sports | 73 | 1,179 | $5,314.21 | $4.51 | 7.9% | 519 |
| Sci-Fi | 59 | 1,101 | $4,756.98 | $4.32 | 7.1% | 507 |
| Animation | 64 | 1,166 | $4,656.30 | $3.99 | 6.9% | 500 |
| Drama | 61 | 1,060 | $4,587.39 | $4.33 | 6.8% | 501 |
| Comedy | 56 | 941 | $4,383.58 | $4.66 | 6.5% | 495 |
| Action | 61 | 1,112 | $4,375.85 | $3.94 | 6.5% | 510 |
| New | 60 | 940 | $4,351.62 | $4.63 | 6.5% | 468 |
| Games | 58 | 969 | $4,281.33 | $4.42 | 6.4% | 474 |
| Foreign | 67 | 1,033 | $4,270.67 | $4.13 | 6.3% | 493 |
| Family | 67 | 1,096 | $4,226.07 | $3.86 | 6.3% | 501 |
| Documentary | 63 | 1,050 | $4,217.52 | $4.02 | 6.3% | 483 |
| Horror | 53 | 846 | $3,722.54 | $4.40 | 5.5% | 451 |
| Children | 58 | 945 | $3,655.55 | $3.87 | 5.4% | 482 |
| Classics | 54 | 939 | $3,639.59 | $3.88 | 5.4% | 468 |
| Travel | 53 | 837 | $3,549.64 | $4.24 | 5.3% | 442 |
| Music | 51 | 830 | $3,417.72 | $4.12 | 5.1% | 447 |
📊 Interpretasi:
Genre Sports menduduki peringkat pertama dengan total revenue $5,314.21 (sekitar 8,8% dari total keseluruhan), diikuti oleh Sci-Fi ($4,756.98) dan Animation ($4,656.30). Meskipun selisih antar kategori tidak terlalu besar (rentang $1.896), perbedaan rata-rata per sewa cukup mencolok:
Rekomendasi: Fokus pemasaran pada Sports dan Sci-Fi untuk memaksimalkan volume, serta Comedy dan New Release untuk mendorong nilai transaksi per sewa.
Analisis RFM (Recency-Frequency-Monetary) adalah metode klasik untuk memahami perilaku pelanggan. Dalam analisis ini, kami fokus pada dua dimensi utama:
\[ \text{Monetary Score} = \sum_{i=1}^{n} \text{Amount}_{i} \quad \text{untuk setiap pelanggan} \]
\[ \text{Frequency} = \text{Jumlah transaksi pelanggan} \]
# Segmentasi berdasarkan kuartil monetary
q1 <- quantile(rfm_data$monetary, 0.33)
q2 <- quantile(rfm_data$monetary, 0.67)
rfm_data <- rfm_data %>%
mutate(
segment = case_when(
monetary >= q2 ~ "High Value",
monetary >= q1 ~ "Mid Value",
TRUE ~ "Low Value"
),
segment = factor(segment, levels = c("Low Value", "Mid Value", "High Value"))
)
seg_colors <- c("Low Value" = "#fc8181", "Mid Value" = "#f6ad55", "High Value" = "#68d391")
p2 <- ggplot(rfm_data, aes(x = monetary, fill = segment)) +
geom_histogram(bins = 35, color = "white", alpha = 0.85) +
geom_vline(xintercept = q1, linetype = "dashed", color = "#e53e3e", linewidth = 0.8) +
geom_vline(xintercept = q2, linetype = "dashed", color = "#2b6cb0", linewidth = 0.8) +
annotate("text", x = q1 - 3, y = 35, label = paste0("P33\n$", round(q1,0)),
color = "#e53e3e", size = 3, hjust = 1) +
annotate("text", x = q2 + 3, y = 35, label = paste0("P67\n$", round(q2,0)),
color = "#2b6cb0", size = 3, hjust = 0) +
scale_fill_manual(values = seg_colors) +
scale_x_continuous(labels = dollar_format()) +
labs(
title = "Distribusi Total Pengeluaran Pelanggan (Monetary)",
subtitle = "Segmentasi pelanggan berdasarkan persentil 33 & 67",
x = "Total Pengeluaran (USD)",
y = "Jumlah Pelanggan",
fill = "Segmen"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 15, color = "#1a365d"),
plot.subtitle = element_text(color = "#718096", size = 11),
legend.position = "top"
)
print(p2)p3 <- ggplot(rfm_data, aes(x = frequency, y = monetary, color = segment)) +
geom_point(alpha = 0.55, size = 1.8) +
geom_smooth(method = "lm", se = FALSE, linewidth = 1.2, linetype = "dashed") +
scale_color_manual(values = seg_colors) +
scale_y_continuous(labels = dollar_format()) +
labs(
title = "Hubungan Frekuensi Sewa vs Total Pengeluaran",
subtitle = "Setiap titik mewakili satu pelanggan",
x = "Frekuensi Transaksi (jumlah penyewaan)",
y = "Total Pengeluaran (USD)",
color = "Segmen Pelanggan"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 15, color = "#1a365d"),
plot.subtitle = element_text(color = "#718096", size = 11),
legend.position = "top"
)
print(p3)rfm_summary <- rfm_data %>%
group_by(segment) %>%
summarise(
jumlah_pelanggan = n(),
rata_monetary = round(mean(monetary), 2),
median_monetary = round(median(monetary), 2),
rata_frequency = round(mean(frequency), 1),
total_revenue = round(sum(monetary), 2)
) %>%
arrange(desc(rata_monetary))
rfm_summary %>%
mutate(
rata_monetary = dollar(rata_monetary),
median_monetary= dollar(median_monetary),
total_revenue = dollar(total_revenue),
pct_cust = paste0(round(jumlah_pelanggan/sum(jumlah_pelanggan)*100, 1), "%")
) %>%
select(
Segmen = segment,
`Jumlah Pelanggan` = jumlah_pelanggan,
`% Pelanggan` = pct_cust,
`Rata-rata Spend` = rata_monetary,
`Median Spend` = median_monetary,
`Rata-rata Frekuensi` = rata_frequency,
`Total Kontribusi` = total_revenue
) %>%
kable(caption = "Tabel 2: Ringkasan Segmentasi Pelanggan RFM") %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = TRUE) %>%
row_spec(1, background = "#c6f6d5", bold = TRUE)| Segmen | Jumlah Pelanggan | % Pelanggan | Rata-rata Spend | Median Spend | Rata-rata Frekuensi | Total Kontribusi |
|---|---|---|---|---|---|---|
| High Value | 198 | 33.1% | $140.84 | $135.73 | 31.7 | $27,886.31 |
| Mid Value | 204 | 34.1% | $110.57 | $110.73 | 26.6 | $22,556.77 |
| Low Value | 197 | 32.9% | $86.11 | $89.75 | 22.1 | $16,963.48 |
📊 Interpretasi:
Distribusi pengeluaran pelanggan bersifat unimodal dan sedikit right-skewed, dengan mayoritas pelanggan (sekitar 60-70%) berada pada rentang $75–$150. Dari hasil segmentasi:
Scatter plot menunjukkan korelasi positif antara frekuensi dan total pengeluaran — pelanggan yang lebih sering menyewa cenderung menghabiskan lebih banyak. Namun terlihat juga variabilitas yang cukup tinggi pada pelanggan dengan frekuensi menengah (20–30 sewa), yang mungkin disebabkan oleh perbedaan dalam memilih film dengan tarif sewa tinggi atau rendah.
\[ \bar{x}_{\text{rating}} = \frac{1}{n} \sum_{i=1}^{n} \text{amount}_{i} \]
rating_revenue <- rating_revenue %>%
mutate(rating = factor(rating, levels = rating))
rating_colors <- c("G" = "#68d391",
"PG" = "#4299e1",
"PG-13" = "#9f7aea",
"R" = "#fc8181",
"NC-17" = "#f6ad55")
p4 <- ggplot(rating_revenue, aes(x = rating, y = total_revenue, fill = rating)) +
geom_col(width = 0.6, show.legend = FALSE) +
geom_text(aes(label = paste0("$", comma(total_revenue/1000, accuracy = 0.1), "K")),
vjust = -0.5, fontface = "bold", size = 4, color = "#1a202c") +
geom_text(aes(label = paste0(comma(num_rentals), " sewa")),
vjust = 2.0, size = 3, color = "white", fontface = "italic") +
scale_fill_manual(values = rating_colors) +
scale_y_continuous(labels = dollar_format(),
expand = expansion(mult = c(0, 0.15))) +
labs(
title = "Total Revenue & Jumlah Transaksi per Rating Film",
subtitle = "PG-13 mendominasi revenue secara keseluruhan",
x = "Rating Film",
y = "Total Revenue (USD)"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 15, color = "#1a365d"),
plot.subtitle = element_text(color = "#718096", size = 11),
panel.grid.major.x = element_blank()
)
print(p4)rating_revenue %>%
mutate(
total_revenue = dollar(total_revenue),
avg_payment = dollar(avg_payment),
num_rentals = comma(num_rentals),
unique_customers = comma(unique_customers)
) %>%
select(
Rating = rating,
`Jumlah Sewa` = num_rentals,
`Total Revenue` = total_revenue,
`Rata-rata/Sewa` = avg_payment,
`Pelanggan Unik` = unique_customers
) %>%
kable(caption = "Tabel 3: Revenue dan Transaksi per Rating Film") %>%
kable_styling(bootstrap_options = c("striped","hover"),
full_width = FALSE, position = "center") %>%
row_spec(1, bold = TRUE, background = "#e9d8fd")| Rating | Jumlah Sewa | Total Revenue | Rata-rata/Sewa | Pelanggan Unik |
|---|---|---|---|---|
| PG-13 | 3,585 | $15,259.16 | $4.26 | 595 |
| NC-17 | 3,293 | $13,875.07 | $4.21 | 597 |
| PG | 3,212 | $13,337.91 | $4.15 | 593 |
| R | 3,181 | $13,270.19 | $4.17 | 597 |
| G | 2,773 | $11,664.23 | $4.21 | 593 |
📊 Interpretasi:
PG-13 menghasilkan revenue tertinggi ($15,259.16) dengan jumlah sewa terbanyak (3.585 transaksi). Ini menunjukkan bahwa film bertarget penonton remaja dan dewasa muda mendominasi preferensi pasar. Sementara itu:
monthly_rev$revenue_k <- monthly_rev$revenue / 1000
p5 <- ggplot(monthly_rev, aes(x = seq_along(month), y = revenue_k, group = 1)) +
geom_area(fill = "#bee3f8", alpha = 0.5) +
geom_line(color = "#2b6cb0", linewidth = 1.4) +
geom_point(color = "#1a365d", size = 4, shape = 21, fill = "#63b3ed", stroke = 1.5) +
geom_text(aes(label = paste0("$", round(revenue_k, 1), "K")),
vjust = -1.0, fontface = "bold", size = 3.5, color = "#1a365d") +
scale_x_continuous(breaks = seq_along(monthly_rev$month),
labels = monthly_rev$month_label) +
scale_y_continuous(labels = function(x) paste0("$", x, "K"),
expand = expansion(mult = c(0.05, 0.2))) +
labs(
title = "Tren Revenue Bulanan",
subtitle = "Periode Mei 2005 – Februari 2006",
x = NULL,
y = "Revenue (USD Ribu)"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 15, color = "#1a365d"),
plot.subtitle = element_text(color = "#718096"),
panel.grid.minor = element_blank()
)
print(p5)📊 Interpretasi:
Tren bulanan menunjukkan puncak aktivitas pada Juli 2005 ($28,368 — hampir 3x lipat bulan Mei), kemudian turun di Agustus dan anjlok drastis pada Februari 2006 ($514). Pola ini kemungkinan besar disebabkan oleh:
Insight ini penting untuk perencanaan stok dan staffing — bisnis perlu memastikan ketersediaan film populer dan personel yang cukup di periode Juni–Agustus.
✅ Temuan Utama:
Sports & Sci-Fi adalah genre paling menguntungkan secara total revenue, sementara Comedy & New Release memiliki nilai rata-rata per sewa tertinggi — strategi penetapan harga yang berbeda dapat diterapkan.
~33% pelanggan termasuk segmen High Value dengan rata-rata pengeluaran di atas $121. Mempertahankan kelompok ini dengan program loyalitas dapat berdampak signifikan pada pendapatan keseluruhan.
PG-13 mendominasi pasar dengan volume dan revenue tertinggi. Film keluarga (G) perlu strategi pemasaran yang lebih agresif untuk meningkatkan penetrasi pasar.
Juli–Agustus adalah bulan puncak operasional. Perencanaan kapasitas dan promosi harus difokuskan pada periode ini.
⚠️ Keterbatasan Analisis:
## ✔ Koneksi database ditutup.
Laporan ini dibuat menggunakan R Markdown dengan
koneksi langsung ke database SQLite Sakila. Semua visualisasi
menggunakan package ggplot2 dan tabel menggunakan
kableExtra.