Dataset yang digunakan adalah Global YouTube Statistics 2023, berisi data 995 channel YouTube terbesar di dunia.
Sumber: https://www.kaggle.com/datasets/nelgiriyewithana/global-youtube-statistics-2023
library(ggplot2)
library(dplyr)
# Load the raw CSV and keep only usable rows in a single pass:
#   - drop the literal "nan" placeholder in the categorical columns,
#   - drop rows with a missing value in any numeric column used below,
#   - drop rows whose subscriber, view or upload count is not positive.
df <- read.csv("Global YouTube Statistics.csv", fileEncoding = "latin1") %>%
  filter(
    category != "nan",
    channel_type != "nan",
    !is.na(subscribers),
    !is.na(video.views),
    !is.na(uploads),
    !is.na(highest_yearly_earnings),
    subscribers > 0,
    video.views > 0,
    uploads > 0
  )

# Report how many channels remain after cleaning
cat("Jumlah data:", nrow(df), "channel\n")
## Jumlah data: 900 channel
| No | Variabel | Tipe |
|---|---|---|
| 1 | category | Kategorik |
| 2 | channel_type | Kategorik |
| 3 | Country | Kategorik |
| 4 | subscribers | Numerik |
| 5 | video.views | Numerik |
| 6 | highest_yearly_earnings | Numerik |
| 7 | uploads | Numerik |
# Mode of a numeric vector.
#
# Values are rounded to whole numbers first, then tabulated; the value with
# the highest frequency is returned as a number. which.max() picks the first
# maximum, so on ties the first entry of the frequency table wins.
modus <- function(x) {
  frekuensi <- table(round(x))
  terbanyak <- names(frekuensi)[which.max(frekuensi)]
  as.numeric(terbanyak)
}
# Descriptive statistics for the subscriber counts.
# Each cat() call is followed by the output recorded when the report was knit.
sub <- df[["subscribers"]]
cat("===== SUBSCRIBERS =====\n")
## ===== SUBSCRIBERS =====
# Central tendency
cat("Mean :", round(mean(sub)), "\n")
## Mean : 22880889
cat("Median :", round(median(sub)), "\n")
## Median : 17700000
cat("Modus :", modus(sub), "\n")
## Modus : 12500000
# Quartiles
cat("Q1 :", round(quantile(sub, probs = 0.25)), "\n")
## Q1 : 14500000
cat("Q3 :", round(quantile(sub, probs = 0.75)), "\n")
## Q3 : 24600000
# Spread: range is max minus min
cat("Range :", round(diff(range(sub))), "\n")
## Range : 232700000
cat("Varians :", round(var(sub)), "\n")
## Varians : 2.893548e+14
cat("Std Dev :", round(sd(sub)), "\n")
## Std Dev : 17010432
# Descriptive statistics for the total video views.
# Each cat() call is followed by the output recorded when the report was knit.
views <- df[["video.views"]]
cat("===== VIDEO VIEWS =====\n")
## ===== VIDEO VIEWS =====
# Central tendency
cat("Mean :", round(mean(views)), "\n")
## Mean : 11392569621
cat("Median :", round(median(views)), "\n")
## Median : 7891461348
cat("Modus :", modus(views), "\n")
## Modus : 2634
# Quartiles
cat("Q1 :", round(quantile(views, probs = 0.25)), "\n")
## Q1 : 4393400010
cat("Q3 :", round(quantile(views, probs = 0.75)), "\n")
## Q3 : 13947169248
# Spread: range is max minus min
cat("Range :", round(diff(range(views))), "\n")
## Range : 2.28e+11
cat("Varians :", round(var(views)), "\n")
## Varians : 2.139259e+20
cat("Std Dev :", round(sd(views)), "\n")
## Std Dev : 14626205333
# Descriptive statistics for the highest yearly earnings.
# Each cat() call is followed by the output recorded when the report was knit.
earn <- df[["highest_yearly_earnings"]]
cat("===== HIGHEST YEARLY EARNINGS =====\n")
## ===== HIGHEST YEARLY EARNINGS =====
# Central tendency
cat("Mean :", round(mean(earn)), "\n")
## Mean : 7225554
cat("Median :", round(median(earn)), "\n")
## Median : 2900000
cat("Modus :", modus(earn), "\n")
## Modus : 0
# Quartiles
cat("Q1 :", round(quantile(earn, probs = 0.25)), "\n")
## Q1 : 804900
cat("Q3 :", round(quantile(earn, probs = 0.75)), "\n")
## Q3 : 7725000
# Spread: range is max minus min
cat("Range :", round(diff(range(earn))), "\n")
## Range : 110600000
cat("Varians :", round(var(earn)), "\n")
## Varians : 1.658454e+14
cat("Std Dev :", round(sd(earn)), "\n")
## Std Dev : 12878098
# Descriptive statistics for the number of uploads.
# Each cat() call is followed by the output recorded when the report was knit.
upl <- df[["uploads"]]
cat("===== UPLOADS =====\n")
## ===== UPLOADS =====
# Central tendency
cat("Mean :", round(mean(upl)), "\n")
## Mean : 10111
cat("Median :", round(median(upl)), "\n")
## Median : 870
cat("Modus :", modus(upl), "\n")
## Modus : 1
# Quartiles
cat("Q1 :", round(quantile(upl, probs = 0.25)), "\n")
## Q1 : 252
cat("Q3 :", round(quantile(upl, probs = 0.75)), "\n")
## Q3 : 3161
# Spread: range is max minus min
cat("Range :", round(diff(range(upl))), "\n")
## Range : 301307
cat("Varians :", round(var(upl)), "\n")
## Varians : 1280341577
cat("Std Dev :", round(sd(upl)), "\n")
## Std Dev : 35782
# 1. Prepare the per-category counts used by the pie chart
kat <- df %>%
  filter(!is.na(category)) %>%
  count(category) %>%
  mutate(
    # Preliminary share of each category, used only to decide the lumping
    persen = n / sum(n) * 100,
    # Categories with a share below 3% are relabelled "Lainnya"
    category = ifelse(persen < 3, "Lainnya", as.character(category))
  ) %>%
  # Several rows may now share the "Lainnya" label, so re-aggregate
  group_by(category) %>%
  summarise(n = sum(n), .groups = "drop") %>%
  # Final percentage, rounded to one decimal for the slice labels
  mutate(persen = round(n / sum(n) * 100, 1)) %>%
  # Largest slice first
  arrange(desc(n)) %>%
  # Freeze that order as factor levels so ggplot does not sort alphabetically
  mutate(category = factor(category, levels = category))
# 2. Draw the pie chart: a single stacked bar bent into a circle
ggplot(kat, aes(x = "", y = n, fill = category)) +
  # The white slice border is a line, so its width is set with `linewidth`;
  # passing `size` for line widths is deprecated since ggplot2 3.4.0.
  geom_bar(stat = "identity", width = 1, color = "white", linewidth = 0.5) +
  coord_polar("y", start = 0) +
  # Every slice gets a label: small categories were already merged in `kat`
  geom_text(
    aes(label = paste0(persen, "%")),
    position = position_stack(vjust = 0.5),
    size = 3.5, color = "white", fontface = "bold"
  ) +
  labs(
    title = "Distribusi Kategori Channel YouTube",
    fill = "Kategori",
    caption = "Sumber: Global YouTube Statistics 2023 - Kaggle"
  ) +
  theme_void() +
  theme(
    plot.title = element_text(
      hjust = 0.5, face = "bold", size = 14, margin = margin(b = 15)
    ),
    legend.title = element_text(face = "bold")
  )
Interpretasi:
# Ten countries with the most channels (ties at the cut-off are kept).
# slice_max() replaces the superseded top_n(); both keep tied rows by default.
negara <- df %>%
  filter(!is.na(Country), Country != "nan") %>%
  count(Country, sort = TRUE) %>%
  slice_max(n, n = 10) %>%
  # Order the factor by count so the bars are drawn from smallest to largest
  mutate(Country = reorder(Country, n))
# Horizontal bar chart of channel counts per country
ggplot(negara, aes(x = Country, y = n, fill = Country)) +
  geom_col(width = 0.7, show.legend = FALSE) +
  # Print each count just past the end of its bar
  geom_text(aes(label = n), size = 4, fontface = "bold", hjust = -0.2) +
  coord_flip() +
  # Extra room on the count axis so the labels are not cut off
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Top 10 Negara dengan Channel YouTube Terbanyak",
    x = "Negara",
    y = "Jumlah Channel",
    caption = "Sumber: Global YouTube Statistics 2023 - Kaggle"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 13))
Interpretasi:
# Subscribers expressed in millions; this column is reused by later plots
df <- df %>% mutate(sub_juta = subscribers / 1e6)

# Histogram of subscribers with dashed reference lines at the mean (red)
# and the median (dark green)
ggplot(df, aes(x = sub_juta)) +
  geom_histogram(bins = 35, fill = "#3498DB", color = "white", alpha = 0.85) +
  geom_vline(
    xintercept = mean(df$sub_juta),
    color = "red", linewidth = 1, linetype = "dashed"
  ) +
  geom_vline(
    xintercept = median(df$sub_juta),
    color = "darkgreen", linewidth = 1, linetype = "dashed"
  ) +
  # The two reference lines sit close together, so centre-aligned labels on
  # the same row overlap each other. Both labels are left-aligned
  # (hjust = 0) just right of their own line and placed on separate rows
  # (different vjust) so they stay readable.
  annotate(
    "text",
    x = mean(df$sub_juta) + 2, y = Inf,
    label = paste0("Mean: ", round(mean(df$sub_juta), 1), "M"),
    color = "red", hjust = 0, vjust = 2, size = 4
  ) +
  annotate(
    "text",
    x = median(df$sub_juta) + 2, y = Inf,
    label = paste0("Median: ", round(median(df$sub_juta), 1), "M"),
    color = "darkgreen", hjust = 0, vjust = 4, size = 4
  ) +
  labs(
    title = "Distribusi Jumlah Subscribers",
    x = "Subscribers (Juta)", y = "Frekuensi",
    caption = "Sumber: Global YouTube Statistics 2023 - Kaggle"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 13))
Interpretasi:
# Channels with positive yearly earnings, expressed in millions of USD
df_earn <- df %>%
  filter(highest_yearly_earnings > 0) %>%
  mutate(earn_juta = highest_yearly_earnings / 1e6)

# Density curve of earnings with dashed reference lines at the mean (red)
# and the median (dark green)
ggplot(df_earn, aes(x = earn_juta)) +
  geom_density(
    fill = "#9B59B6", alpha = 0.6, color = "#6C3483", linewidth = 1
  ) +
  geom_vline(
    xintercept = mean(df_earn$earn_juta),
    color = "red", linewidth = 1, linetype = "dashed"
  ) +
  geom_vline(
    xintercept = median(df_earn$earn_juta),
    color = "darkgreen", linewidth = 1, linetype = "dashed"
  ) +
  # The two reference lines sit close together, so centre-aligned labels on
  # the same row overlap each other. Both labels are left-aligned
  # (hjust = 0) just right of their own line and placed on separate rows
  # (different vjust) so they stay readable.
  annotate(
    "text",
    x = mean(df_earn$earn_juta) + 0.5, y = Inf,
    label = paste0("Mean: $", round(mean(df_earn$earn_juta), 2), "M"),
    color = "red", hjust = 0, vjust = 2, size = 4
  ) +
  annotate(
    "text",
    x = median(df_earn$earn_juta) + 0.5, y = Inf,
    label = paste0("Median: $", round(median(df_earn$earn_juta), 2), "M"),
    color = "darkgreen", hjust = 0, vjust = 4, size = 4
  ) +
  labs(
    title = "Distribusi Penghasilan Tahunan Tertinggi YouTubers",
    x = "Penghasilan (Juta USD)", y = "Kepadatan",
    caption = "Sumber: Global YouTube Statistics 2023 - Kaggle"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 13))
Interpretasi:
# Names of the seven most frequent channel types (ties at the cut-off kept).
# slice_max() replaces the superseded top_n(); both keep tied rows by default.
top_type <- df %>%
  count(channel_type, sort = TRUE) %>%
  slice_max(n, n = 7) %>%
  pull(channel_type)

# Rows belonging to those types, with the type factor ordered by the median
# of sub_juta so the boxes appear sorted
df_box <- df %>%
  filter(channel_type %in% top_type) %>%
  mutate(channel_type = reorder(channel_type, sub_juta, median))
# Horizontal box plots of subscribers (in millions) per channel type
ggplot(df_box, aes(x = channel_type, y = sub_juta, fill = channel_type)) +
  # Outliers are drawn in semi-transparent red; the fill legend is hidden
  geom_boxplot(
    show.legend = FALSE,
    outlier.color = "red",
    outlier.alpha = 0.5
  ) +
  coord_flip() +
  labs(
    title = "Distribusi Subscribers berdasarkan Tipe Channel",
    x = "Tipe Channel",
    y = "Subscribers (Juta)",
    caption = "Sumber: Global YouTube Statistics 2023 - Kaggle"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 13))
Interpretasi:
Sumber data: Global YouTube Statistics 2023 - Kaggle