Dataset yang digunakan adalah Movies Dataset yang bersumber dari Kaggle - Movies Industry Dataset. Dataset ini memuat informasi 7.668 film dari berbagai genre, tahun rilis, rating, anggaran, dan pendapatan kotor.
Variabel Kategorik:
- rating — Klasifikasi usia film (R, PG-13, PG, G, Not Rated, dll.)
- genre — Genre utama film (Comedy, Action, Drama, dll.)

Variabel Numerik:
- score — Skor IMDb film (skala 1–10)
- runtime — Durasi film (menit)

library(ggplot2)
library(dplyr)
library(readxl)
library(scales)
library(gridExtra)
library(RColorBrewer)
library(DT)
# Read the workbook, keep only the columns used in the analysis, and drop
# every row that has a missing value in one of the four analysed variables.
df <- read_excel("movies.xlsx")
df <- select(df, name, country, year, rating, genre, score, runtime)
df <- filter(
  df,
  !is.na(rating) & !is.na(genre) & !is.na(score) & !is.na(runtime)
)

# Render the cleaned dataset as an interactive, horizontally scrollable table.
DT::datatable(
  df,
  caption = "Dataset: Movies Industry",
  options = list(pageLength = 15, scrollX = TRUE)
)
# One-row summary of the IMDb score: centre, quartiles and spread.
score_stats <- summarise(
  df,
  Mean = mean(score, na.rm = TRUE),
  Median = median(score, na.rm = TRUE),
  # Mode = most frequent score after rounding to one decimal place.
  Modus = as.numeric(
    names(sort(table(round(score, 1)), decreasing = TRUE)[1])
  ),
  Q1 = quantile(score, probs = 0.25, na.rm = TRUE),
  Q3 = quantile(score, probs = 0.75, na.rm = TRUE),
  # diff(range(x)) is max(x) - min(x).
  Range = diff(range(score, na.rm = TRUE)),
  Varians = var(score, na.rm = TRUE),
  Std_Dev = sd(score, na.rm = TRUE)
)

knitr::kable(
  score_stats,
  caption = "Statistik Deskriptif: Score IMDb",
  digits = 4
)
| Mean | Median | Modus | Q1 | Q3 | Range | Varians | Std_Dev |
|---|---|---|---|---|---|---|---|
| 6.3884 | 6.5 | 6.6 | 5.8 | 7.1 | 7.4 | 0.9379 | 0.9685 |
# One-row summary of the runtime column: centre, quartiles and spread.
runtime_stats <- summarise(
  df,
  Mean = mean(runtime, na.rm = TRUE),
  Median = median(runtime, na.rm = TRUE),
  # Mode = most frequent runtime value.
  Modus = as.numeric(
    names(sort(table(runtime), decreasing = TRUE)[1])
  ),
  Q1 = quantile(runtime, probs = 0.25, na.rm = TRUE),
  Q3 = quantile(runtime, probs = 0.75, na.rm = TRUE),
  # diff(range(x)) is max(x) - min(x).
  Range = diff(range(runtime, na.rm = TRUE)),
  Varians = var(runtime, na.rm = TRUE),
  Std_Dev = sd(runtime, na.rm = TRUE)
)

knitr::kable(
  runtime_stats,
  caption = "Statistik Deskriptif: Runtime (Menit)",
  digits = 4
)
| Mean | Median | Modus | Q1 | Q3 | Range | Varians | Std_Dev |
|---|---|---|---|---|---|---|---|
| 107.2713 | 104 | 97 | 95 | 116 | 303 | 345.2565 | 18.5811 |
library(gridExtra)
library(grid)

# Rating frequencies are computed from a fresh read of the workbook, so only
# rows with a missing rating are excluded here.
rating_all <- read_excel("movies.xlsx") %>%
  select(rating) %>%
  filter(!is.na(rating)) %>%
  count(rating, sort = TRUE) %>%
  mutate(pct = n / sum(n))

# One colour per rating, in the same order as rating_all (sorted by n).
# brewer.pal() needs at least 3 colours and "Set3" offers at most 12; beyond
# that the palette is interpolated so no rating ends up with an NA colour.
n_ratings <- nrow(rating_all)
set3_max <- RColorBrewer::brewer.pal.info["Set3", "maxcolors"]
base_colors <- RColorBrewer::brewer.pal(
  min(max(n_ratings, 3), set3_max),
  "Set3"
)
colors <- if (n_ratings > set3_max) {
  grDevices::colorRampPalette(base_colors)(n_ratings)
} else {
  base_colors[seq_len(n_ratings)]
}
names(colors) <- rating_all$rating # bind each colour to its rating by name

# Pie chart without in-slice labels; the table on the right acts as legend.
pie_plot <- ggplot(rating_all, aes(x = "", y = pct, fill = rating)) +
  geom_col(width = 1, color = "white", linewidth = 0.5) +
  coord_polar(theta = "y") +
  scale_fill_manual(values = colors) +
  labs(
    title = "Pie Chart: Proporsi Rating Film",
    caption = "Sumber: Movies Industry Dataset (Kaggle)"
  ) +
  theme_void() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    legend.position = "none"
  )

# Legend table: the Rating cell of each row is shaded with its slice colour.
legend_df <- rating_all %>%
  mutate(
    Persentase = percent(pct, accuracy = 0.1),
    Jumlah = formatC(n, format = "d", big.mark = ",")
  ) %>%
  select(Rating = rating, Persentase, Jumlah)

# Look colours up by rating name so rows and slices always agree.
row_fill <- colors[rating_all$rating]

legend_grob <- tableGrob(
  legend_df,
  rows = NULL,
  theme = ttheme_minimal(
    core = list(
      fg_params = list(fontsize = 10),
      bg_params = list(
        # Column 1 (Rating) uses the slice colour; columns 2-3 stay white.
        fill = matrix(
          c(row_fill, rep("white", n_ratings), rep("white", n_ratings)),
          nrow = n_ratings, ncol = 3
        ),
        # Only the Rating column gets a visible (semi-transparent) fill.
        alpha = matrix(
          c(rep(0.55, n_ratings), rep(0, n_ratings), rep(0, n_ratings)),
          nrow = n_ratings, ncol = 3
        )
      )
    ),
    colhead = list(fg_params = list(fontsize = 11, fontface = "bold"))
  )
)

grid.arrange(pie_plot, legend_grob, ncol = 2, widths = c(2.8, 1.4))
Interpretasi:
# Ten most frequent genres. slice_max() replaces the superseded top_n();
# like top_n() it keeps ties by default.
genre_top <- df %>%
  count(genre, sort = TRUE) %>%
  slice_max(n, n = 10)

# Horizontal bar chart, bars ordered by count, with the count printed
# next to each bar.
ggplot(genre_top, aes(x = reorder(genre, n), y = n, fill = genre)) +
  geom_col(show.legend = FALSE) + # same as geom_bar(stat = "identity")
  geom_text(aes(label = n), hjust = -0.2, size = 3.5) +
  coord_flip() +
  scale_fill_brewer(palette = "Paired") +
  labs(
    title = "Bar Chart: Jumlah Film per Genre (Top 10)",
    x = "Genre",
    y = "Jumlah Film",
    caption = "Sumber: Movies Industry Dataset (Kaggle)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    axis.text = element_text(size = 11)
  ) +
  # 10% headroom so the count labels are not clipped at the bar ends.
  ylim(0, max(genre_top$n) * 1.1)
Interpretasi:
# Histogram of IMDb scores with dashed reference lines at the mean and median.
# The two statistics are computed once and passed as constant xintercepts;
# mapping them inside aes() would draw one overlapping line per data row.
score_mean <- mean(df$score)
score_median <- median(df$score)

ggplot(df, aes(x = score)) +
  geom_histogram(
    binwidth = 0.2, fill = "#2196F3", color = "white", alpha = 0.85
  ) +
  geom_vline(
    xintercept = score_mean,
    color = "red", linetype = "dashed", linewidth = 1
  ) +
  # Same colour as the median label below so line and label match.
  geom_vline(
    xintercept = score_median,
    color = "darkgreen", linetype = "dashed", linewidth = 1
  ) +
  annotate(
    "text",
    x = score_mean + 0.15, y = 600,
    label = paste("Mean =", round(score_mean, 2)),
    color = "red", size = 3.5
  ) +
  annotate(
    "text",
    x = score_median - 0.15, y = 550,
    label = paste("Median =", round(score_median, 2)),
    color = "darkgreen", size = 3.5, hjust = 1
  ) +
  labs(
    title = "Histogram: Distribusi Score IMDb Film",
    x = "Score IMDb",
    y = "Frekuensi",
    caption = "Sumber: Movies Industry Dataset (Kaggle)"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5))
Interpretasi:
# Histogram of runtimes with dashed reference lines at the mean and median.
# The two statistics are computed once and passed as constant xintercepts;
# mapping them inside aes() would draw one overlapping line per data row.
runtime_mean <- mean(df$runtime)
runtime_median <- median(df$runtime)

ggplot(df, aes(x = runtime)) +
  geom_histogram(
    binwidth = 5, fill = "#FF9800", color = "white", alpha = 0.85
  ) +
  geom_vline(
    xintercept = runtime_mean,
    color = "red", linetype = "dashed", linewidth = 1
  ) +
  geom_vline(
    xintercept = runtime_median,
    color = "blue", linetype = "dashed", linewidth = 1
  ) +
  annotate(
    "text",
    x = runtime_mean + 8, y = 900,
    label = paste("Mean =", round(runtime_mean, 1)),
    color = "red", size = 3.5
  ) +
  annotate(
    "text",
    x = runtime_median - 8, y = 820,
    label = paste("Median =", round(runtime_median, 1)),
    color = "blue", size = 3.5, hjust = 1
  ) +
  labs(
    title = "Histogram: Distribusi Durasi Film (Runtime)",
    x = "Runtime (Menit)",
    y = "Frekuensi",
    caption = "Sumber: Movies Industry Dataset (Kaggle)"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5))
Interpretasi:
# Density curve of IMDb scores with dashed lines at the mean (red) and
# median (blue). The lines use constant xintercepts; mapping them inside
# aes() would draw one overlapping line per data row.
ggplot(df, aes(x = score)) +
  geom_density(
    fill = "#9C27B0", alpha = 0.5, color = "#6A1B9A", linewidth = 1
  ) +
  geom_vline(
    xintercept = mean(df$score),
    color = "red", linetype = "dashed", linewidth = 1
  ) +
  geom_vline(
    xintercept = median(df$score),
    color = "blue", linetype = "dashed", linewidth = 1
  ) +
  labs(
    title = "Density Plot: Distribusi Score IMDb Film",
    subtitle = "Garis merah = Mean | Garis biru = Median",
    x = "Score IMDb",
    y = "Densitas",
    caption = "Sumber: Movies Industry Dataset (Kaggle)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, size = 10)
  )
Interpretasi:
# Boxplots of IMDb score per rating class; classes are ordered along the
# x axis by their median score and outliers are drawn in red.
df %>%
  ggplot(
    aes(
      x = reorder(rating, score, FUN = median),
      y = score,
      fill = rating
    )
  ) +
  geom_boxplot(outlier.size = 1.5, outlier.color = "red", alpha = 0.7) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    caption = "Sumber: Movies Industry Dataset (Kaggle)",
    fill = "Rating",
    y = "Score IMDb",
    x = "Rating",
    title = "Boxplot: Distribusi Score IMDb berdasarkan Rating"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  )
Interpretasi:
# Names of the five most frequent genres. slice_max() replaces the
# superseded top_n(); like top_n() it keeps ties by default.
top5_genre <- df %>%
  count(genre, sort = TRUE) %>%
  slice_max(n, n = 5) %>%
  pull(genre)

# Restrict the data to films in those genres.
df_top5 <- df %>%
  filter(genre %in% top5_genre)

# Boxplots of runtime per genre, ordered by median runtime.
ggplot(
  df_top5,
  aes(x = reorder(genre, runtime, FUN = median), y = runtime, fill = genre)
) +
  geom_boxplot(alpha = 0.7, outlier.color = "darkred", outlier.size = 1.5) +
  scale_fill_brewer(palette = "Pastel1") +
  labs(
    title = "Boxplot: Distribusi Runtime berdasarkan Genre (Top 5)",
    x = "Genre",
    y = "Runtime (Menit)",
    fill = "Genre",
    caption = "Sumber: Movies Industry Dataset (Kaggle)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    legend.position = "none"
  )
Interpretasi:
Berdasarkan analisis data film:
Dataset: Movies Industry — Kaggle | Analisis menggunakan R dan ggplot2