Introduction

This analysis explores YouTube statistics data sourced from Kaggle. The focus is on understanding patterns across country, channel types, and key metrics such as subscribers, video views, and uploads. The analysis includes descriptive and inferential statistics, as well as bootstrap simulation.

1. Load Packages and Dataset

library(tidyverse)
library(psych)
library(highcharter)
library(stringr)

# Read CSV
dataset <- read.csv("data/global_youtube_statistics.csv")

# Preview
head(dataset)

2. Data Cleaning

# Clean non-alphanumeric characters
dataset$Youtuber <- str_replace_all(dataset$Youtuber, "[^[:alnum:]\\s]", "")
dataset$Title <- str_replace_all(dataset$Title, "[^[:alnum:]\\s]", "")

# Drop unnecessary columns
dataset <- dataset %>% 
  select(-c(Gross.tertiary.education.enrollment...., Population, Unemployment.rate,
            Urban_population, Latitude, Longitude)) %>%
  drop_na()

# Check missing values
colSums(is.na(dataset))
##                             rank                         Youtuber 
##                                0                                0 
##                      subscribers                      video.views 
##                                0                                0 
##                         category                            Title 
##                                0                                0 
##                          uploads                          Country 
##                                0                                0 
##                     Abbreviation                     channel_type 
##                                0                                0 
##                 video_views_rank                     country_rank 
##                                0                                0 
##                channel_type_rank video_views_for_the_last_30_days 
##                                0                                0 
##          lowest_monthly_earnings         highest_monthly_earnings 
##                                0                                0 
##           lowest_yearly_earnings          highest_yearly_earnings 
##                                0                                0 
##     subscribers_for_last_30_days                     created_year 
##                                0                                0 
##                    created_month                     created_date 
##                                0                                0

3. Exploratory Data Analysis (EDA)

3.1 Country and Continent Analysis

country_to_continent <- function(country) {
  case_when(
    country %in% c("Afghanistan", "Bangladesh", "India", "Pakistan", "Iraq", "Iran", "Saudi Arabia",
                   "Kuwait", "Jordan", "China", "Indonesia", "Japan", "Malaysia", "Philippines",
                   "Singapore", "Thailand", "Vietnam") ~ "Asia",
    country %in% c("Egypt", "Morocco") ~ "Africa",
    country %in% c("Argentina", "Brazil", "Colombia", "Peru", "Venezuela", "Ecuador") ~ "South America",
    country %in% c("Canada", "United States", "Mexico", "El Salvador", "Cuba") ~ "North America",
    country %in% c("United Kingdom", "France", "Germany", "Italy", "Spain", "Sweden", "Finland",
                   "Switzerland", "Andorra", "Latvia", "Ukraine", "Netherlands", "Russia", "Turkey") ~ "Europe",
    country %in% c("Australia") ~ "Oceania",
    TRUE ~ NA_character_
  )
}

dataset$continent <- factor(sapply(dataset$Country, country_to_continent))

# Frequency table
table(dataset$continent)
## 
##        Africa          Asia        Europe North America       Oceania 
##             2           210            75           222             2 
## South America 
##            56

Visualization: Top 10 Countries by Views

top_country_views <- dataset %>%
  mutate(continent = sapply(Country, country_to_continent)) %>%
  group_by(Country, continent) %>%
  summarise(total_views = sum(video.views), .groups = "drop") %>%
  arrange(desc(total_views)) %>%
  group_by(continent) %>%
  slice_max(total_views, n = 10)

ggplot(top_country_views, aes(x = continent, y = total_views, fill = Country)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 10 Countries by Total Views per Continent",
       x = "Continent", y = "Total Views") +
  theme_minimal() +
  theme(legend.position = "bottom", axis.text.x = element_text(angle = 45))

3.2 Channel Type

dataset$channel_type <- factor(dataset$channel_type)

# Frequency table
table(dataset$channel_type)
## 
##       Animals         Autos        Comedy     Education Entertainment 
##             3             2            30            39           194 
##          Film         Games         Howto         Music          News 
##            30            51            14           129            26 
##     Nonprofit        People        Sports          Tech 
##             2            51            10            13

Visualization: 3D Column Chart

channel_summary <- dataset %>%
  group_by(channel_type) %>%
  summarise(total_views = sum(video.views), .groups = "drop")

highchart() %>%
  hc_chart(type = "column", options3d = list(enabled = TRUE, alpha = 20, beta = 15)) %>%
  hc_xAxis(categories = channel_summary$channel_type) %>%
  hc_add_series(data = channel_summary$total_views, name = "Total Views") %>%
  hc_title(text = "Total Views by Channel Type")

3.3 Numerical: Subscribers & Video Views

describe(dataset$subscribers)
describe(dataset$video.views)

Bubble Chart

bubble_data <- dataset %>%
  select(Youtuber, subscribers, video.views, category, Country) %>%
  mutate(
    subscribers = as.numeric(subscribers),
    video.views = as.numeric(video.views)
  )

highchart() %>%
  hc_chart(type = "bubble", zoomType = "xy") %>%
  hc_title(text = "Subscribers vs. Video Views") %>%
  hc_xAxis(
    title = list(text = "Subscribers"),
    labels = list(format = "{value / 1000000}M")
  ) %>%
  hc_yAxis(title = list(text = "Video Views")) %>%
  hc_add_series(
    data = bubble_data,
    type = "bubble",
    hcaes(x = subscribers, y = video.views, z = subscribers, name = Youtuber),
    dataLabels = list(enabled = TRUE, format = '{point.name}'),
    colorByPoint = TRUE,
    name = "Youtuber"
  ) %>%
  hc_tooltip(
    useHTML = TRUE,
    formatter = JS("
      function() {
        var views = this.y;
        if (views >= 1e9) views = (views / 1e9).toFixed(1) + 'B';
        else if (views >= 1e6) views = (views / 1e6).toFixed(1) + 'M';
        else views = views.toFixed(0);

        return '<b>' + this.point.name + '</b><br>' +
               'Subscribers: ' + Highcharts.numberFormat(this.x, 0) + '<br>' +
               'Video Views: ' + views + '<br>' +
               'Country: ' + this.point.Country + '<br>' +
               'Category: ' + this.point.category;
      }
    ")
  )

4. Inferential Statistics

shapiro.test(dataset$subscribers)
## 
##  Shapiro-Wilk normality test
## 
## data:  dataset$subscribers
## W = 0.54011, p-value < 2.2e-16
shapiro.test(dataset$uploads)
## 
##  Shapiro-Wilk normality test
## 
## data:  dataset$uploads
## W = 0.34965, p-value < 2.2e-16
wilcox.test(dataset$uploads, dataset$subscribers, paired = TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  dataset$uploads and dataset$subscribers
## V = 0, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
cor.test(dataset$uploads, dataset$subscribers, method = "spearman")
## Warning in cor.test.default(dataset$uploads, dataset$subscribers, method =
## "spearman"): Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  dataset$uploads and dataset$subscribers
## S = 32844092, p-value = 0.1459
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.05973471

5. Bootstrap Simulation

set.seed(123)

boot_results <- replicate(100, {
  sample_data <- dataset %>% slice_sample(n = nrow(dataset), replace = TRUE)
  model <- lm(subscribers ~ uploads, data = sample_data)
  c(coef(model), summary(model)$r.squared)
})

boot_df <- as.data.frame(t(boot_results))
colnames(boot_df) <- c("intercept", "slope", "r_squared")

summary_stats <- boot_df %>%
  summarise(
    mean_slope = mean(slope),
    sd_slope = sd(slope),
    mean_intercept = mean(intercept),
    sd_intercept = sd(intercept),
    mean_r_squared = mean(r_squared),
    sd_r_squared = sd(r_squared)
  )

summary_stats