HW07

Author

Xiangzhe Li

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

twitch_data <- read_csv("https://raw.githubusercontent.com/vaiseys/dav-course/refs/heads/main/Data/twitchdata-update.csv")

Rows: 1000 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Channel, Language
dbl (7): Watch time(Minutes), Stream time(minutes), Peak viewers, Average vi...
lgl (2): Partnered, Mature

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

library(janitor)


Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test

twitch_data <- clean_names(twitch_data)
colnames(twitch_data)

 [1] "channel"             "watch_time_minutes"  "stream_time_minutes"
 [4] "peak_viewers"        "average_viewers"     "followers"          
 [7] "followers_gained"    "views_gained"        "partnered"          
[10] "mature"              "language"

Question 1

twitch_data |>
  select(channel, average_viewers, followers) |>
  slice_sample(n = 5)

# A tibble: 5 × 3
  channel       average_viewers followers
  <chr>                   <dbl>     <dbl>
1 RocketBeansTV             696    416018
2 CaMaK                    1759    151886
3 Nmplol                   6767    455444
4 hexy                     2092    336093
5 Xayoo_                   6091    572789

It looks like there isn’t a clear connection between followers and viewers.

twitch_data |>
  select(followers, average_viewers) |>
  summary()

   followers       average_viewers 
 Min.   :   3660   Min.   :   235  
 1st Qu.: 170546   1st Qu.:  1458  
 Median : 318063   Median :  2425  
 Mean   : 570054   Mean   :  4781  
 3rd Qu.: 624332   3rd Qu.:  4786  
 Max.   :8938903   Max.   :147643

I noticed that the maximum and minimum values of followers and views are very far from their Q1 and Q3, so they are probably outliers.

twitch_data |>
  ggplot(aes(x = followers, y = average_viewers)) +
  geom_point(alpha = 0.5) +
  labs(x = "Followers",
       y = "Average Viewers",
       title = "Followers vs. Average Viewers for Twitch Streamers")

I noticed that most dots are gathered at the left bottom corner of the graph, with a few at left top, bottom middle and bottom right.

twitch_data |>
  ggplot(aes(x = followers, y = average_viewers)) +
  geom_point(alpha = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  labs(x = "Followers (log scale)",
       y = "Average Viewers (log scale)",
       title = "Followers vs. Average Viewers for Twitch Streamers")

The graph is “zoomed in” a lot, which makes the relationship (now it looks like there is a strong postive correlations between the variables) clearer.

twitch_data <- twitch_data |> 
  mutate(log_viewers = log10(average_viewers), 
         log_followers = log10(followers))

Question 2

fit1 <- lm(log_viewers ~ log_followers, data = twitch_data)
summary(fit1)


Call:
lm(formula = log_viewers ~ log_followers, data = twitch_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.17259 -0.20163 -0.01883  0.18637  1.54533 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)     0.1978     0.1253   1.579    0.115    
log_followers   0.5885     0.0226  26.042   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3087 on 998 degrees of freedom
Multiple R-squared:  0.4046,    Adjusted R-squared:  0.404 
F-statistic: 678.2 on 1 and 998 DF,  p-value: < 2.2e-16

library(broom)
tidy(fit1)

# A tibble: 2 × 5
  term          estimate std.error statistic   p.value
  <chr>            <dbl>     <dbl>     <dbl>     <dbl>
1 (Intercept)      0.198    0.125       1.58 1.15e-  1
2 log_followers    0.588    0.0226     26.0  1.69e-114

(1.1^0.5884958 - 1) * 100

[1] 5.769249

A 10% increase in followers is associated with a 5.8% increase in the average number of viewers, which comes from the average number of viewers by a factor of 1.1^0.5885 = 1.0577.

Question 3

pred_data <- augment(fit1)
glimpse(pred_data)

Rows: 1,000
Columns: 8
$ log_viewers   <dbl> 4.442731, 4.408410, 4.040444, 3.887280, 4.471321, 4.6275…
$ log_followers <dbl> 6.511388, 6.725108, 6.247393, 6.596030, 6.951284, 6.1940…
$ .fitted       <dbl> 4.029761, 4.155534, 3.874400, 4.079572, 4.288638, 3.8430…
$ .resid        <dbl> 0.4129697, 0.2528757, 0.1660436, -0.1922928, 0.1826833, …
$ .hat          <dbl> 0.006194481, 0.008694557, 0.003782169, 0.007126066, 0.01…
$ .sigma        <dbl> 0.3085580, 0.3087321, 0.3087919, 0.3087764, 0.3087820, 0…
$ .cooksd       <dbl> 0.0056128779, 0.0029688873, 0.0005513456, 0.0014026033, …
$ .std.resid    <dbl> 1.3420109, 0.8227954, 0.5389316, -0.6251793, 0.5953620, …

pred_data |> 
  ggplot(aes(x = log_followers, 
             y = log_viewers)) +
  geom_jitter(alpha = 0.4) + 
  geom_line(aes(x = log_followers, 
                y = .fitted), 
            col = "orange") + 
  theme_minimal() +
  labs(subtitle = "Fitted Model and Raw Data", 
       title = "Followers & Average Viewership", 
       x = "log(followers)", 
       y = "log(viewers)")

The model is indeed pretty good, as it captures the basic trend of the scatterplot and passes through were the dots are most clustered.

pred_data |> 
  ggplot(aes(x = log_followers, y = .resid)) +
  geom_point(alpha = 0.4) +
  theme_minimal() +
  labs(title = "Residuals vs. log(followers)",
       x = "log(followers)",
       y = "Residuals")

Most of the points are dense around residual = 0, which is good. But I do see some big residuals above 1. The largest residuals appear when x is between 5 and 6. Also, in the range of x from 6 to 7, the residuals are generally larger than before, even though they don’t go over 1.

Question 4

Raw Data:

twitch_data |>
  select(language, average_viewers) |>
  slice_sample(n = 10)

# A tibble: 10 × 2
   language   average_viewers
   <chr>                <dbl>
 1 Portuguese            1537
 2 French                2268
 3 Russian               4473
 4 Portuguese            1284
 5 English               1470
 6 English                297
 7 English               1782
 8 Spanish               9301
 9 German                 696
10 French                2311

twitch_data |>
  summarise(num_languages = n_distinct(language))

# A tibble: 1 × 1
  num_languages
          <int>
1            21

Summarize Data:

twitch_data |>
  count(language, sort = TRUE)

# A tibble: 21 × 2
   language       n
   <chr>      <int>
 1 English      485
 2 Korean        77
 3 Russian       74
 4 Spanish       68
 5 French        66
 6 Portuguese    61
 7 German        49
 8 Chinese       30
 9 Turkish       22
10 Italian       17
# ℹ 11 more rows

twitch_data |>
  select(average_viewers) |>
  summary()

 average_viewers 
 Min.   :   235  
 1st Qu.:  1458  
 Median :  2425  
 Mean   :  4781  
 3rd Qu.:  4786  
 Max.   :147643

twitch_data |>
  group_by(language) |>
  summarize(mean_viewers = mean(average_viewers, na.rm = TRUE),
            median_viewers = median(average_viewers, na.rm = TRUE),
            n = n()) |>
  arrange(desc(mean_viewers))

# A tibble: 21 × 4
   language   mean_viewers median_viewers     n
   <chr>             <dbl>          <dbl> <int>
 1 Russian           6594.          2934.    74
 2 Spanish           6450.          3918.    68
 3 Arabic            5682.          5459      5
 4 English           5113.          2483    485
 5 Japanese          4763.          2907     10
 6 Turkish           4761.          3093     22
 7 German            4278.          2181     49
 8 Korean            3914.          2202     77
 9 Portuguese        3801.          1914     61
10 French            3507.          2212     66
# ℹ 11 more rows

Visualization:

twitch_data |>
  ggplot(aes(x = language, y = average_viewers)) +
  geom_boxplot() +
  scale_y_log10(labels = scales::comma) +  
  labs(x = "Language",
       y = "Average Viewers",
       title = "Distribution of Average Viewers by Language") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

twitch_data |>
  group_by(language) |>
  summarize(mean_viewers = mean(average_viewers, na.rm = TRUE)) |>
  ggplot(aes(x = reorder(language, -mean_viewers), y = mean_viewers)) +
  geom_col() +
  scale_y_log10() +
  labs(x = "Language",
       y = "Mean Average Viewers (log scale)",
       title = "Mean Average Viewers by Language") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Question 5

twitch_data1 <- twitch_data |> 
  mutate(language = as.factor(language), 
         language = relevel(language, ref = "English"))
fit1 <- lm(average_viewers ~ language, data = twitch_data1)
summary(fit1)


Call:
lm(formula = average_viewers ~ language, data = twitch_data1)

Residuals:
   Min     1Q Median     3Q    Max 
 -5745  -3317  -2047    173 142530 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)          5112.8      384.6  13.292   <2e-16 ***
languageArabic        569.4     3807.8   0.150    0.881    
languageChinese     -1688.0     1593.7  -1.059    0.290    
languageCzech       -3285.1     3479.6  -0.944    0.345    
languageFinnish     -4085.8     8479.7  -0.482    0.630    
languageFrench      -1606.3     1111.4  -1.445    0.149    
languageGerman       -834.6     1269.8  -0.657    0.511    
languageGreek       -3151.8     8479.7  -0.372    0.710    
languageHungarian   -2972.3     6002.2  -0.495    0.621    
languageItalian     -2907.3     2090.2  -1.391    0.165    
languageJapanese     -350.2     2706.2  -0.129    0.897    
languageKorean      -1198.9     1039.2  -1.154    0.249    
languageOther       -3963.8     8479.7  -0.467    0.640    
languagePolish      -2207.6     2475.4  -0.892    0.373    
languagePortuguese  -1311.9     1150.8  -1.140    0.255    
languageRussian      1481.4     1057.2   1.401    0.161    
languageSlovak      -3040.8     8479.7  -0.359    0.720    
languageSpanish      1336.9     1096.9   1.219    0.223    
languageSwedish     -4085.8     8479.7  -0.482    0.630    
languageThai        -3101.5     2582.9  -1.201    0.230    
languageTurkish      -352.0     1846.5  -0.191    0.849    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8471 on 979 degrees of freedom
Multiple R-squared:  0.01602,   Adjusted R-squared:  -0.004082 
F-statistic: 0.7969 on 20 and 979 DF,  p-value: 0.7194

According to the model, the estimated standard deviations for Russian, Spanish and Arabic are higher than that of English. While English is indeed very popular, it may not be the most dominant, as also suggested by the average viewer bar graph in Question 4.

Question 6

library(broom)

fit_lang <- lm(average_viewers ~ language, data = twitch_data)
pred_lang <- augment(fit_lang, data = twitch_data)
glimpse(pred_lang)

Rows: 1,000
Columns: 19
$ channel             <chr> "xQcOW", "summit1g", "Gaules", "ESL_CSGO", "Tfue",…
$ watch_time_minutes  <dbl> 6196161750, 6091677300, 5644590915, 3970318140, 36…
$ stream_time_minutes <dbl> 215250, 211845, 515280, 517740, 123660, 82260, 136…
$ peak_viewers        <dbl> 222720, 310998, 387315, 300575, 285644, 263720, 11…
$ average_viewers     <dbl> 27716, 25610, 10976, 7714, 29602, 42414, 24181, 18…
$ followers           <dbl> 3246298, 5310163, 1767635, 3944850, 8938903, 15634…
$ followers_gained    <dbl> 1734810, 1370184, 1023779, 703986, 2068424, 554201…
$ views_gained        <dbl> 93036735, 89705964, 102611607, 106546942, 78998587…
$ partnered           <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
$ mature              <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FA…
$ language            <chr> "English", "English", "Portuguese", "English", "En…
$ log_viewers         <dbl> 4.442731, 4.408410, 4.040444, 3.887280, 4.471321, …
$ log_followers       <dbl> 6.511388, 6.725108, 6.247393, 6.596030, 6.951284, …
$ .fitted             <dbl> 5112.814, 5112.814, 3800.934, 5112.814, 5112.814, …
$ .resid              <dbl> 22603.186, 20497.186, 7175.066, 2601.186, 24489.18…
$ .hat                <dbl> 0.002061856, 0.002061856, 0.016393443, 0.002061856…
$ .sigma              <dbl> 8444.313, 8449.818, 8472.094, 8474.843, 8438.923, …
$ .cooksd             <dbl> 7.019555e-04, 5.772431e-04, 5.788931e-04, 9.296361…
$ .std.resid          <dbl> 2.6710814, 2.4222095, 0.8540523, 0.3073893, 2.8939…

pred_lang |>
  drop_na(.std.resid) |>
  ggplot(aes(x = language, y = .std.resid)) +
  geom_point(alpha = 0.4) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Standardized Residuals vs. Language",
    x = "Language",
    y = "Standardized Residuals"
  ) +
  coord_cartesian(ylim = c(-0.5, 20))

The points miss-captured:

pred_lang |> 
  slice_max(abs(.resid), n = 5) |> 
  select(channel, language, average_viewers, .fitted, .resid)

# A tibble: 5 × 5
  channel    language average_viewers .fitted  .resid
  <chr>      <chr>              <dbl>   <dbl>   <dbl>
1 dota2ti    English           147643   5113. 142530.
2 dota2ti_ru Russian           126232   6594. 119638.
3 auronplay  Spanish            53986   6450.  47536.
4 LCS        English            46459   5113.  41346.
5 Asmongold  English            42414   5113.  37301.