This analysis explores YouTube statistics data sourced from Kaggle. The focus is on understanding patterns across country, channel types, and key metrics such as subscribers, video views, and uploads. The analysis includes descriptive and inferential statistics, as well as bootstrap simulation.
library(tidyverse)
library(psych)
library(highcharter)
library(stringr)
# Read CSV
dataset <- read.csv("data/global_youtube_statistics.csv")
# Preview
head(dataset)
# Clean non-alphanumeric characters
dataset$Youtuber <- str_replace_all(dataset$Youtuber, "[^[:alnum:]\\s]", "")
dataset$Title <- str_replace_all(dataset$Title, "[^[:alnum:]\\s]", "")
# Drop unnecessary columns
dataset <- dataset %>%
select(-c(Gross.tertiary.education.enrollment...., Population, Unemployment.rate,
Urban_population, Latitude, Longitude)) %>%
drop_na()
# Check missing values
colSums(is.na(dataset))
## rank Youtuber
## 0 0
## subscribers video.views
## 0 0
## category Title
## 0 0
## uploads Country
## 0 0
## Abbreviation channel_type
## 0 0
## video_views_rank country_rank
## 0 0
## channel_type_rank video_views_for_the_last_30_days
## 0 0
## lowest_monthly_earnings highest_monthly_earnings
## 0 0
## lowest_yearly_earnings highest_yearly_earnings
## 0 0
## subscribers_for_last_30_days created_year
## 0 0
## created_month created_date
## 0 0
country_to_continent <- function(country) {
case_when(
country %in% c("Afghanistan", "Bangladesh", "India", "Pakistan", "Iraq", "Iran", "Saudi Arabia",
"Kuwait", "Jordan", "China", "Indonesia", "Japan", "Malaysia", "Philippines",
"Singapore", "Thailand", "Vietnam") ~ "Asia",
country %in% c("Egypt", "Morocco") ~ "Africa",
country %in% c("Argentina", "Brazil", "Colombia", "Peru", "Venezuela", "Ecuador") ~ "South America",
country %in% c("Canada", "United States", "Mexico", "El Salvador", "Cuba") ~ "North America",
country %in% c("United Kingdom", "France", "Germany", "Italy", "Spain", "Sweden", "Finland",
"Switzerland", "Andorra", "Latvia", "Ukraine", "Netherlands", "Russia", "Turkey") ~ "Europe",
country %in% c("Australia") ~ "Oceania",
TRUE ~ NA_character_
)
}
dataset$continent <- factor(sapply(dataset$Country, country_to_continent))
# Frequency table
table(dataset$continent)
##
## Africa Asia Europe North America Oceania
## 2 210 75 222 2
## South America
## 56
top_country_views <- dataset %>%
mutate(continent = sapply(Country, country_to_continent)) %>%
group_by(Country, continent) %>%
summarise(total_views = sum(video.views), .groups = "drop") %>%
arrange(desc(total_views)) %>%
group_by(continent) %>%
slice_max(total_views, n = 10)
ggplot(top_country_views, aes(x = continent, y = total_views, fill = Country)) +
geom_bar(stat = "identity") +
labs(title = "Top 10 Countries by Total Views per Continent",
x = "Continent", y = "Total Views") +
theme_minimal() +
theme(legend.position = "bottom", axis.text.x = element_text(angle = 45))
dataset$channel_type <- factor(dataset$channel_type)
# Frequency table
table(dataset$channel_type)
##
## Animals Autos Comedy Education Entertainment
## 3 2 30 39 194
## Film Games Howto Music News
## 30 51 14 129 26
## Nonprofit People Sports Tech
## 2 51 10 13
channel_summary <- dataset %>%
group_by(channel_type) %>%
summarise(total_views = sum(video.views), .groups = "drop")
highchart() %>%
hc_chart(type = "column", options3d = list(enabled = TRUE, alpha = 20, beta = 15)) %>%
hc_xAxis(categories = channel_summary$channel_type) %>%
hc_add_series(data = channel_summary$total_views, name = "Total Views") %>%
hc_title(text = "Total Views by Channel Type")
describe(dataset$subscribers)
describe(dataset$video.views)
bubble_data <- dataset %>%
select(Youtuber, subscribers, video.views, category, Country) %>%
mutate(
subscribers = as.numeric(subscribers),
video.views = as.numeric(video.views)
)
highchart() %>%
hc_chart(type = "bubble", zoomType = "xy") %>%
hc_title(text = "Subscribers vs. Video Views") %>%
hc_xAxis(
title = list(text = "Subscribers"),
labels = list(format = "{value / 1000000}M")
) %>%
hc_yAxis(title = list(text = "Video Views")) %>%
hc_add_series(
data = bubble_data,
type = "bubble",
hcaes(x = subscribers, y = video.views, z = subscribers, name = Youtuber),
dataLabels = list(enabled = TRUE, format = '{point.name}'),
colorByPoint = TRUE,
name = "Youtuber"
) %>%
hc_tooltip(
useHTML = TRUE,
formatter = JS("
function() {
var views = this.y;
if (views >= 1e9) views = (views / 1e9).toFixed(1) + 'B';
else if (views >= 1e6) views = (views / 1e6).toFixed(1) + 'M';
else views = views.toFixed(0);
return '<b>' + this.point.name + '</b><br>' +
'Subscribers: ' + Highcharts.numberFormat(this.x, 0) + '<br>' +
'Video Views: ' + views + '<br>' +
'Country: ' + this.point.Country + '<br>' +
'Category: ' + this.point.category;
}
")
)
shapiro.test(dataset$subscribers)
##
## Shapiro-Wilk normality test
##
## data: dataset$subscribers
## W = 0.54011, p-value < 2.2e-16
shapiro.test(dataset$uploads)
##
## Shapiro-Wilk normality test
##
## data: dataset$uploads
## W = 0.34965, p-value < 2.2e-16
wilcox.test(dataset$uploads, dataset$subscribers, paired = TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: dataset$uploads and dataset$subscribers
## V = 0, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
cor.test(dataset$uploads, dataset$subscribers, method = "spearman")
## Warning in cor.test.default(dataset$uploads, dataset$subscribers, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: dataset$uploads and dataset$subscribers
## S = 32844092, p-value = 0.1459
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.05973471
set.seed(123)
boot_results <- replicate(100, {
sample_data <- dataset %>% slice_sample(n = nrow(dataset), replace = TRUE)
model <- lm(subscribers ~ uploads, data = sample_data)
c(coef(model), summary(model)$r.squared)
})
boot_df <- as.data.frame(t(boot_results))
colnames(boot_df) <- c("intercept", "slope", "r_squared")
summary_stats <- boot_df %>%
summarise(
mean_slope = mean(slope),
sd_slope = sd(slope),
mean_intercept = mean(intercept),
sd_intercept = sd(intercept),
mean_r_squared = mean(r_squared),
sd_r_squared = sd(r_squared)
)
summary_stats