Netflix merupakan salah satu platform streaming terbesar di dunia dengan jutaan pelanggan aktif di lebih dari 190 negara. Dataset ini berisi informasi konten Netflix berupa film dan serial TV, mencakup jenis konten, rating, tahun rilis, dan durasi.
Analisis ini menggunakan metode Statistika Deskriptif untuk mengeksplorasi karakteristik data secara menyeluruh, meliputi ukuran pemusatan, penyebaran, dan visualisasi distribusi data.
| Jenis | Variabel | Keterangan |
|---|---|---|
| 🏷️ Kategorik | type | Jenis konten: Movie atau TV Show |
| 🏷️ Kategorik | rating | Rating usia (TV-MA, TV-14, PG-13, dll.) |
| 🔢 Numerik | release_year | Tahun rilis konten |
| 🔢 Numerik | duration_minutes | Durasi film dalam menit |
# Download netflix_titles.csv from:
# https://www.kaggle.com/datasets/shivamb/netflix-shows
# and save it in the same folder as this .Rmd file.
netflix <- read.csv(
  file = "netflix_titles.csv",
  na.strings = c("", "NA"),   # empty cells and the literal "NA" become NA
  stringsAsFactors = FALSE    # keep text columns as character vectors
)
# Lihat struktur data
cat("Dimensi data:", nrow(netflix), "baris x", ncol(netflix), "kolom\n")## Dimensi data: 8807 baris x 12 kolom
# Mode (most frequent value) of a vector.
#
# Missing values are ignored. When several values share the highest
# frequency, the one that appears first in `x` is returned. If nothing is
# left after dropping NA, the result is a single NA.
modus <- function(x) {
  observed <- x[!is.na(x)]
  candidates <- unique(observed)
  # match() maps every observation to the index of its candidate value;
  # tabulate() then counts how often each index occurs.
  frequency <- tabulate(match(observed, candidates))
  candidates[which.max(frequency)]
}
# Clean and transform the data.
#
# Result: `netflix_clean`, the rows of `netflix` that have a non-missing
# type, rating and release_year, plus three derived columns:
#   duration_value   - the digits of `duration` as a number
#   duration_minutes - duration_value for rows with type "Movie", NA otherwise
#   rating_group     - audience group derived from `rating`
#
# Data repair: a few rows of the raw file carry a duration string such as
# "74 min" in the `rating` column. Left alone, those strings show up as
# bogus rating categories in every rating table and chart. They are moved
# to `duration` (only when `duration` itself is missing) and the rating is
# set to NA, so the rows are then handled by the missing-rating filter
# like any other row without a rating.
netflix_clean <- netflix %>%
  mutate(
    # Order matters: `duration` must be repaired while `rating` still holds
    # the misplaced value.
    duration = if_else(
      is.na(duration) & grepl("^[0-9]+ min$", rating),
      rating,
      duration
    ),
    rating = if_else(grepl("^[0-9]+ min$", rating), NA_character_, rating)
  ) %>%
  filter(!is.na(type), !is.na(rating), !is.na(release_year)) %>%
  mutate(
    # Strip everything except digits, e.g. "90 min" -> 90
    duration_value = as.numeric(gsub("[^0-9]", "", duration)),
    # Typed NA keeps the column numeric regardless of the data
    duration_minutes = if_else(type == "Movie", duration_value, NA_real_),
    rating_group = case_when(
      rating %in% c("G", "TV-G", "TV-Y", "TV-Y7", "TV-Y7-FV") ~ "Anak-Anak",
      rating %in% c("PG", "TV-PG", "PG-13", "TV-14") ~ "Remaja",
      rating %in% c("R", "TV-MA", "NC-17", "NR", "UR") ~ "Dewasa",
      TRUE ~ "Lainnya"
    )
  )
cat("✅ Data siap dianalisis:", nrow(netflix_clean), "baris\n")## ✅ Data siap dianalisis: 8803 baris
type — Jenis Kontentabel_type <- netflix_clean %>%
count(type) %>%
mutate(Persentase = round(n / sum(n) * 100, 2)) %>%
rename(Kategori = type, Frekuensi = n)
kable(tabel_type, caption = "Tabel Frekuensi Jenis Konten Netflix",
align = c("l","c","c")) %>%
kable_styling(bootstrap_options = c("striped","hover","bordered"),
full_width = FALSE, position = "center") %>%
row_spec(0, background = "#E50914", color = "white", bold = TRUE)| Kategori | Frekuensi | Persentase |
|---|---|---|
| Movie | 6129 | 69.62 |
| TV Show | 2674 | 30.38 |
ggplot(tabel_type, aes(x = "", y = Frekuensi, fill = Kategori)) +
geom_bar(stat = "identity", width = 1, color = "white", linewidth = 1.2) +
coord_polar("y") +
geom_text(aes(label = paste0(Persentase, "%")),
position = position_stack(vjust = 0.5),
color = "white", size = 7, fontface = "bold") +
scale_fill_manual(values = c("Movie" = "#E50914", "TV Show" = "#221F1F")) +
labs(title = "Proporsi Jenis Konten di Netflix",
fill = "Jenis Konten") +
theme_void() +
theme(
plot.title = element_text(hjust = 0.5, size = 15, face = "bold",
color = "#141414", margin = margin(b=12)),
legend.text = element_text(size = 11),
legend.title = element_text(size = 11, face = "bold"),
plot.background = element_rect(fill = "#fafafa", color = NA)
)🎬 Interpretasi Pie Chart menunjukkan
konten Netflix didominasi oleh Movie (~70%)
dibandingkan TV Show (~30%). Nilai
modus variabel type adalah
“Movie”. Secara inferensial, dominasi film mencerminkan
strategi Netflix yang lebih berfokus pada konten film untuk menarik
pelanggan baru di berbagai pasar global.
rating — Rating Kontentabel_rating <- netflix_clean %>%
count(rating) %>%
arrange(desc(n)) %>%
mutate(Persentase = round(n / sum(n) * 100, 2)) %>%
rename(Rating = rating, Frekuensi = n)
kable(tabel_rating, caption = "Tabel Frekuensi Rating Konten Netflix",
align = c("l","c","c")) %>%
kable_styling(bootstrap_options = c("striped","hover","bordered"),
full_width = FALSE, position = "center") %>%
row_spec(0, background = "#E50914", color = "white", bold = TRUE) %>%
row_spec(1, bold = TRUE, background = "#fff5f5")| Rating | Frekuensi | Persentase |
|---|---|---|
| TV-MA | 3207 | 36.43 |
| TV-14 | 2160 | 24.54 |
| TV-PG | 863 | 9.80 |
| R | 799 | 9.08 |
| PG-13 | 490 | 5.57 |
| TV-Y7 | 334 | 3.79 |
| TV-Y | 307 | 3.49 |
| PG | 287 | 3.26 |
| TV-G | 220 | 2.50 |
| NR | 80 | 0.91 |
| G | 41 | 0.47 |
| TV-Y7-FV | 6 | 0.07 |
| NC-17 | 3 | 0.03 |
| UR | 3 | 0.03 |
| 66 min | 1 | 0.01 |
| 74 min | 1 | 0.01 |
| 84 min | 1 | 0.01 |
netflix_clean %>%
count(rating) %>%
arrange(desc(n)) %>%
ggplot(aes(x = reorder(rating, n), y = n, fill = n)) +
geom_bar(stat = "identity", color = "white", width = 0.75) +
geom_text(aes(label = n), hjust = -0.2, size = 3.5, color = "#333") +
coord_flip() +
scale_fill_gradient(low = "#ffb3b3", high = "#E50914") +
labs(title = "Distribusi Rating Konten di Netflix",
x = "Rating", y = "Jumlah Konten") +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold", color = "#141414"),
panel.grid.major.y = element_blank(),
legend.position = "none",
axis.text = element_text(color = "#444"),
plot.background = element_rect(fill = "#fafafa", color = NA)
)
📊 Interpretasi Bar Chart menunjukkan TV-MA (dewasa) sebagai rating paling dominan, diikuti TV-14 dan TV-PG. Nilai modus = TV-MA. Secara inferensial, Netflix menargetkan penonton dewasa sebagai segmen pasar utama, yang tercermin dari TV-MA sebagai kategori rating terbesar (sekitar 36% dari seluruh konten).
release_year — Tahun Rilisyear_data <- netflix_clean$release_year
# Descriptive statistics of the release year, one row per statistic.
# The values are collected in a named vector first; the names become the
# `Statistik` column. local() keeps the helper objects out of the global
# environment.
stats_year <- local({
  kuartil <- quantile(year_data, probs = c(0.25, 0.75), na.rm = TRUE)
  batas <- range(year_data, na.rm = TRUE)  # c(minimum, maximum)
  nilai <- c(
    "Mean" = round(mean(year_data, na.rm = TRUE), 2),
    "Median" = median(year_data, na.rm = TRUE),
    "Modus" = modus(year_data),
    # [[ ]] drops the "25%" / "75%" names that quantile() attaches
    "Q1 (Kuartil 1)" = kuartil[[1]],
    "Q3 (Kuartil 3)" = kuartil[[2]],
    "Range" = batas[2] - batas[1],
    "Varians" = round(var(year_data, na.rm = TRUE), 2),
    "Standar Deviasi" = round(sd(year_data, na.rm = TRUE), 2),
    "Minimum" = batas[1],
    "Maksimum" = batas[2]
  )
  data.frame(Statistik = names(nilai), Nilai = unname(nilai))
})
kable(stats_year, caption = "Statistik Deskriptif: Tahun Rilis",
align = c("l","c")) %>%
kable_styling(bootstrap_options = c("striped","hover","bordered"),
full_width = FALSE, position = "center") %>%
row_spec(0, background = "#E50914", color = "white", bold = TRUE) %>%
row_spec(c(1,2,3), bold = TRUE, background = "#fff5f5")| Statistik | Nilai |
|---|---|
| Mean | 2014.18 |
| Median | 2017.00 |
| Modus | 2018.00 |
| Q1 (Kuartil 1) | 2013.00 |
| Q3 (Kuartil 3) | 2019.00 |
| Range | 96.00 |
| Varians | 77.81 |
| Standar Deviasi | 8.82 |
| Minimum | 1925.00 |
| Maksimum | 2021.00 |
ggplot(netflix_clean, aes(x = release_year)) +
geom_histogram(binwidth = 3, fill = "#E50914", color = "white", alpha = 0.9) +
geom_vline(xintercept = mean(year_data, na.rm=TRUE),
color = "#2563eb", linetype = "dashed", linewidth = 1.2) +
geom_vline(xintercept = median(year_data, na.rm=TRUE),
color = "#16a34a", linetype = "solid", linewidth = 1.2) +
annotate("text", x = mean(year_data, na.rm=TRUE) - 3, y = 700,
label = paste("Mean =", round(mean(year_data, na.rm=TRUE),1)),
color = "#2563eb", size = 3.8, fontface = "bold") +
annotate("text", x = median(year_data, na.rm=TRUE) + 4, y = 600,
label = paste("Median =", median(year_data, na.rm=TRUE)),
color = "#16a34a", size = 3.8, fontface = "bold") +
labs(title = "Histogram Tahun Rilis Konten Netflix",
x = "Tahun Rilis", y = "Frekuensi") +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold", color = "#141414"),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#fafafa", color = NA)
)📈 Interpretasi Histogram menunjukkan distribusi miring ke kiri (negative skew), dengan konten terbanyak dirilis pada 2015–2021. Nilai Mean < Median mengkonfirmasi kemiringan kiri akibat konten lama sebagai outlier. Secara inferensia, tren ini menunjukkan Netflix terus menambah konten baru secara agresif untuk bersaing di pasar streaming global.
ggplot(netflix_clean, aes(x = release_year, fill = type)) +
geom_density(alpha = 0.65, color = NA) +
scale_fill_manual(values = c("Movie" = "#E50914", "TV Show" = "#221F1F")) +
labs(title = "Density Plot Tahun Rilis Berdasarkan Jenis Konten",
x = "Tahun Rilis", y = "Densitas", fill = "Jenis Konten") +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold", color = "#141414"),
legend.position = "top",
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#fafafa", color = NA)
)📈 Interpretasi Density Plot menunjukkan pola distribusi Movie dan TV Show yang serupa, dengan puncak di sekitar 2018–2020. TV Show memiliki ekor lebih panjang ke kiri. Secara inferensia, kedua jenis konten mengalami lonjakan produksi signifikan dalam satu dekade terakhir seiring meningkatnya popularitas layanan streaming.
duration_minutes — Durasi Filmmovie_data <- netflix_clean %>%
filter(type == "Movie", !is.na(duration_minutes))
cat("🎬 Jumlah data film:", nrow(movie_data), "\n")## 🎬 Jumlah data film: 6126
dur_data <- movie_data$duration_minutes
# Descriptive statistics of movie duration (minutes), one row per statistic.
# The values are collected in a named vector first; the names become the
# `Statistik` column. local() keeps the helper objects out of the global
# environment.
stats_dur <- local({
  kuartil <- quantile(dur_data, probs = c(0.25, 0.75), na.rm = TRUE)
  batas <- range(dur_data, na.rm = TRUE)  # c(minimum, maximum)
  nilai <- c(
    "Mean" = round(mean(dur_data, na.rm = TRUE), 2),
    "Median" = median(dur_data, na.rm = TRUE),
    "Modus" = modus(dur_data),
    # [[ ]] drops the "25%" / "75%" names that quantile() attaches
    "Q1 (Kuartil 1)" = kuartil[[1]],
    "Q3 (Kuartil 3)" = kuartil[[2]],
    "Range" = batas[2] - batas[1],
    "Varians" = round(var(dur_data, na.rm = TRUE), 2),
    "Standar Deviasi" = round(sd(dur_data, na.rm = TRUE), 2),
    "Minimum" = batas[1],
    "Maksimum" = batas[2]
  )
  data.frame(Statistik = names(nilai), Nilai = unname(nilai))
})
kable(stats_dur, caption = "Statistik Deskriptif: Durasi Film (menit)",
align = c("l","c")) %>%
kable_styling(bootstrap_options = c("striped","hover","bordered"),
full_width = FALSE, position = "center") %>%
row_spec(0, background = "#E50914", color = "white", bold = TRUE) %>%
row_spec(c(1,2,3), bold = TRUE, background = "#fff5f5")| Statistik | Nilai |
|---|---|
| Mean | 99.58 |
| Median | 98.00 |
| Modus | 90.00 |
| Q1 (Kuartil 1) | 87.00 |
| Q3 (Kuartil 3) | 114.00 |
| Range | 309.00 |
| Varians | 799.94 |
| Standar Deviasi | 28.28 |
| Minimum | 3.00 |
| Maksimum | 312.00 |
ggplot(movie_data, aes(x = duration_minutes)) +
geom_histogram(binwidth = 10, fill = "#E50914", color = "white", alpha = 0.9) +
geom_vline(xintercept = mean(dur_data, na.rm=TRUE),
color = "#2563eb", linetype = "dashed", linewidth = 1.2) +
geom_vline(xintercept = median(dur_data, na.rm=TRUE),
color = "#16a34a", linetype = "solid", linewidth = 1.2) +
annotate("text", x = mean(dur_data, na.rm=TRUE) + 12, y = 450,
label = paste("Mean =", round(mean(dur_data, na.rm=TRUE),1)),
color = "#2563eb", size = 3.8, fontface = "bold") +
annotate("text", x = median(dur_data, na.rm=TRUE) - 20, y = 380,
label = paste("Median =", median(dur_data, na.rm=TRUE)),
color = "#16a34a", size = 3.8, fontface = "bold") +
labs(title = "Histogram Durasi Film Netflix",
x = "Durasi (Menit)", y = "Frekuensi") +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold", color = "#141414"),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#fafafa", color = NA)
)
🎬 Interpretasi Histogram durasi film menunjukkan distribusi mendekati simetris dengan puncak di 90–100 menit. Terdapat outlier di kedua sisi: film sangat pendek (di bawah 46,5 menit, 248 film) dan film sangat panjang (di atas 154,5 menit, 201 film; kemungkinan dokumenter/film epik). Secara inferensial, durasi rata-rata sesuai standar industri perfilman internasional (90–120 menit).
ggplot(movie_data, aes(x = duration_minutes)) +
geom_density(fill = "#E50914", alpha = 0.65, color = "#b0070f") +
geom_vline(xintercept = mean(dur_data, na.rm=TRUE),
color = "#2563eb", linetype = "dashed", linewidth = 1.2) +
geom_vline(xintercept = median(dur_data, na.rm=TRUE),
color = "#16a34a", linetype = "solid", linewidth = 1.2) +
annotate("text", x = mean(dur_data, na.rm=TRUE) + 15, y = 0.014,
label = paste("Mean =", round(mean(dur_data, na.rm=TRUE),1)),
color = "#2563eb", size = 3.8, fontface = "bold") +
annotate("text", x = median(dur_data, na.rm=TRUE) - 20, y = 0.012,
label = paste("Median =", median(dur_data, na.rm=TRUE)),
color = "#16a34a", size = 3.8, fontface = "bold") +
labs(title = "Density Plot Durasi Film Netflix",
x = "Durasi (Menit)", y = "Densitas") +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold", color = "#141414"),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#fafafa", color = NA)
)🎬 Interpretasi Density Plot memperlihatkan kurva unimodal dengan puncak di sekitar 90 menit. Ekor panjang ke kanan menunjukkan kemiringan positif ringan akibat film berdurasi sangat panjang. Secara inferensia, distribusi ini mendekati normal sehingga uji parametrik dapat diterapkan untuk analisis lebih lanjut.
ggplot(movie_data %>% filter(rating_group != "Lainnya"),
aes(x = rating_group, y = duration_minutes, fill = rating_group)) +
geom_boxplot(outlier.colour = "#E50914", outlier.shape = 16,
outlier.size = 2, alpha = 0.85, width = 0.55) +
stat_summary(fun = mean, geom = "point", shape = 23,
size = 4, fill = "white", color = "black") +
scale_fill_manual(values = c("Anak-Anak" = "#FFD700",
"Remaja" = "#FF8C00",
"Dewasa" = "#E50914")) +
labs(title = "Boxplot Durasi Film Netflix per Kelompok Rating",
subtitle = "◇ Titik putih = Mean | ● Titik merah = Outlier",
x = "Kelompok Rating", y = "Durasi (Menit)") +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold", color = "#141414"),
plot.subtitle = element_text(hjust = 0.5, color = "#666", size = 10),
legend.position = "none",
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#fafafa", color = NA)
)Secara inferensia, perbedaan durasi antar kelompok dapat diuji lanjut dengan uji ANOVA untuk mengetahui apakah perbedaannya signifikan secara statistik.
## ===== Identifikasi Bentuk Sebaran =====
## >> Release Year:
## Mean : 2014.18
## Median : 2017
## Modus : 2018
# Classify the skew of release_year from the ordering of mean and median.
# A tie gets its own branch so that Mean == Median is not reported as
# "Mean > Median".
if (mean(year_data, na.rm = TRUE) < median(year_data, na.rm = TRUE)) {
  cat(" Kesimpulan: Mean < Median -> MIRING KE KIRI (Negative Skew)\n\n")
} else if (mean(year_data, na.rm = TRUE) > median(year_data, na.rm = TRUE)) {
  cat(" Kesimpulan: Mean > Median -> MIRING KE KANAN (Positive Skew)\n\n")
} else {
  cat(" Kesimpulan: Mean = Median -> SIMETRIS\n\n")
}
## Kesimpulan: Mean < Median -> MIRING KE KIRI (Negative Skew)
## >> Duration (menit):
## Mean : 99.58
## Median : 98
## Modus : 90
# Classify the skew of movie duration from the ordering of mean and median.
# A tie gets its own branch so that Mean == Median is not reported as
# "Mean > Median".
if (mean(dur_data, na.rm = TRUE) < median(dur_data, na.rm = TRUE)) {
  cat(" Kesimpulan: Mean < Median -> MIRING KE KIRI (Negative Skew)\n")
} else if (mean(dur_data, na.rm = TRUE) > median(dur_data, na.rm = TRUE)) {
  cat(" Kesimpulan: Mean > Median -> MIRING KE KANAN (Positive Skew)\n")
} else {
  cat(" Kesimpulan: Mean = Median -> SIMETRIS\n")
}
## Kesimpulan: Mean > Median -> MIRING KE KANAN (Positive Skew)
## ===== Deteksi Outlier: Durasi Film =====
# Outlier fences for movie duration using the 1.5 * IQR rule
Q1_dur <- quantile(dur_data, probs = 0.25, na.rm = TRUE)
Q3_dur <- quantile(dur_data, probs = 0.75, na.rm = TRUE)
IQR_dur <- Q3_dur - Q1_dur
BB <- Q1_dur - IQR_dur * 1.5  # lower fence: values below it count as outliers
BA <- Q3_dur + IQR_dur * 1.5  # upper fence: values above it count as outliers
cat("Q1 (Kuartil 1) :", Q1_dur, "menit\n")## Q1 (Kuartil 1) : 87 menit
## Q3 (Kuartil 3) : 114 menit
## IQR : 27 menit
## Batas Bawah : 46.5 menit
## Batas Atas : 154.5 menit
## Outlier bawah : 248 film
## Outlier atas : 201 film
# Side-by-side summary of both numeric variables: one row per variable,
# one column per statistic. Each statistic is written once and applied to
# both vectors; local() keeps the helpers out of the global environment.
rekap <- local({
  variabel <- list(year_data, dur_data)
  # Apply one summary function to every variable -> numeric vector, length 2
  per_variabel <- function(f) vapply(variabel, f, numeric(1))
  data.frame(
    Variabel = c("Release Year", "Duration (menit)"),
    Mean = per_variabel(function(x) round(mean(x, na.rm = TRUE), 2)),
    Median = per_variabel(function(x) median(x, na.rm = TRUE)),
    Modus = per_variabel(modus),
    Q1 = per_variabel(function(x) quantile(x, 0.25, na.rm = TRUE)),
    Q3 = per_variabel(function(x) quantile(x, 0.75, na.rm = TRUE)),
    Range = per_variabel(function(x) diff(range(x, na.rm = TRUE))),
    Varians = per_variabel(function(x) round(var(x, na.rm = TRUE), 2)),
    Std_Dev = per_variabel(function(x) round(sd(x, na.rm = TRUE), 2))
  )
})
kable(rekap, caption = "Rekapitulasi Statistik Deskriptif",
col.names = c("Variabel","Mean","Median","Modus","Q1","Q3","Range","Varians","Std. Deviasi")) %>%
kable_styling(bootstrap_options = c("striped","hover","bordered"),
full_width = TRUE) %>%
row_spec(0, background = "#E50914", color = "white", bold = TRUE) %>%
column_spec(1, bold = TRUE, background = "#fff5f5")| Variabel | Mean | Median | Modus | Q1 | Q3 | Range | Varians | Std. Deviasi |
|---|---|---|---|---|---|---|---|---|
| Release Year | 2014.18 | 2017 | 2018 | 2013 | 2019 | 96 | 77.81 | 8.82 |
| Duration (menit) | 99.58 | 98 | 90 | 87 | 114 | 309 | 799.94 | 28.28 |