# Set the default knitr chunk option `echo = TRUE` for the chunks that follow,
# so each chunk's source code is shown alongside its output.
knitr::opts_chunk$set(echo = TRUE)



library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(knitr)
## Warning: package 'knitr' was built under R version 4.5.3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.3
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(scales)
## Warning: package 'scales' was built under R version 4.5.3
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(RColorBrewer)
 
 

# 1. INTRODUCTION - WHY TWITCH, WHAT MAKES IT UNIQUE

 
# Headline platform figures, one row per metric. Values are kept as display
# strings because they carry suffixes such as "M+" and "M".
twitch_facts <- tribble(
  ~Metric,                     ~Value,
  "Monthly active users",      "140M+",
  "Avg. concurrent viewers",   "2.5M",
  "Active streamers / month",  "8M",
  "Hours watched / month (B)", "1.6",
  "Avg. session length (min)", "95"
)
print(twitch_facts)
## # A tibble: 5 × 2
##   Metric                    Value
##   <chr>                     <chr>
## 1 Monthly active users      140M+
## 2 Avg. concurrent viewers   2.5M 
## 3 Active streamers / month  8M   
## 4 Hours watched / month (B) 1.6  
## 5 Avg. session length (min) 95
# Hours watched (in millions) per content category, one row per category.
content_mix <- tribble(
  ~Category,           ~Hours_M,
  "Just Chatting",     420,
  "League of Legends", 180,
  "GTA V",             150,
  "Valorant",          130,
  "Minecraft",         110,
  "Fortnite",          95,
  "Music",             70,
  "Other",             380
)

# Horizontal bar chart with bars ordered by hours watched. The legend is
# switched off because the axis labels already name every category.
content_mix %>%
  ggplot(aes(x = reorder(Category, Hours_M), y = Hours_M, fill = Category)) +
  geom_col(show.legend = FALSE) +
  scale_fill_brewer(palette = "Set2") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Hours Watched by Top Twitch Categories",
       x = NULL, y = "Hours watched (millions)")

# 2. SCENARIO DESIGN - USER PERSPECTIVE

 
# One row per viewing scenario: the user's goal, the signals the recommender
# draws on, and the metric used to judge success. Each row spans two lines.
user_scenarios <- tribble(
  ~Scenario, ~User_Goal, ~Signal_Used, ~Success_Metric,
  "Returning fan", "Catch favorite streamer live",
    "Follow list, watch history", "CTR to followed channel",
  "Bored browser", "Find anything entertaining",
    "Watch history, time-of-day", "Watch time > 5 min",
  "Game-focused viewer", "Watch a specific game",
    "Category page, language", "Time on category page",
  "New user onboarding", "Discover the platform",
    "Sign-up survey, popularity", "Day-7 retention",
  "Mobile, short session", "Quick 10-min watch",
    "Device type, short clips", "Clip completion rate"
)
print(user_scenarios)
## # A tibble: 5 × 4
##   Scenario              User_Goal                    Signal_Used  Success_Metric
##   <chr>                 <chr>                        <chr>        <chr>         
## 1 Returning fan         Catch favorite streamer live Follow list… CTR to follow…
## 2 Bored browser         Find anything entertaining   Watch histo… Watch time > …
## 3 Game-focused viewer   Watch a specific game        Category pa… Time on categ…
## 4 New user onboarding   Discover the platform        Sign-up sur… Day-7 retenti…
## 5 Mobile, short session Quick 10-min watch           Device type… Clip completi…
# Simulate daily watch minutes over `days` for two viewer archetypes.
# The seed makes the noise reproducible. The new-user noise is drawn first
# and the returning-fan noise second, so the order of the two rnorm() calls
# determines which random numbers each series receives.
set.seed(42)
days <- 1:30
journey <- tibble(
  day        = rep(days, times = 2),
  engagement = c(
    # New user: exponential decay starting near 100 minutes, noise sd = 3.
    100 * exp(-0.08 * days) + rnorm(length(days), mean = 0, sd = 3),
    # Returning fan: sinusoid around 60 minutes, noise sd = 4.
    60 + 20 * sin(days / 3) + rnorm(length(days), mean = 0, sd = 4)
  ),
  user_type  = rep(c("New user", "Returning fan"), each = length(days))
)

ggplot(journey, aes(day, engagement, color = user_type)) +
  # `linewidth` is the line-thickness argument; using `size` for lines is
  # deprecated since ggplot2 3.4.0.
  geom_line(linewidth = 1.1) +
  geom_point() +
  labs(title = "Simulated Engagement Patterns by User Type",
       x = "Day", y = "Daily watch minutes", color = "User type") +
  theme_minimal() +
  # Named values tie each colour to its user type explicitly instead of
  # relying on the order of the levels.
  scale_color_manual(values = c("New user"      = "#9146FF",
                                "Returning fan" = "#00B5AD"))

# 3. SCENARIO DESIGN - BUSINESS PERSPECTIVE

 
# One row per business objective: the KPI that tracks it and the
# recommender lever intended to move that KPI. Each row spans two lines.
biz_objectives <- tribble(
  ~Objective, ~KPI, ~Lever,
  "Maximize watch time", "Avg. minutes/session",
    "Personalized 'Live now' rail",
  "Grow subs/Bits", "Conversion to paid",
    "Promote sub-only perks",
  "Support emerging streamers", "% views to <1k follower channels",
    "Diversity boost in feed",
  "Reduce churn", "Day-30 retention",
    "Re-engagement notifications",
  "Brand safety", "% flagged streams surfaced",
    "Filter low-trust signals"
)
print(biz_objectives)
## # A tibble: 5 × 3
##   Objective                  KPI                              Lever             
##   <chr>                      <chr>                            <chr>             
## 1 Maximize watch time        Avg. minutes/session             Personalized 'Liv…
## 2 Grow subs/Bits             Conversion to paid               Promote sub-only …
## 3 Support emerging streamers % views to <1k follower channels Diversity boost i…
## 4 Reduce churn               Day-30 retention                 Re-engagement not…
## 5 Brand safety               % flagged streams surfaced       Filter low-trust …
# Illustrative scores per recommender strategy on two axes:
# short-term watch time and long-term ecosystem health.
tradeoff <- tribble(
  ~Strategy,              ~Watch_Time, ~Ecosystem_Health,
  "Pure popularity",      0.85,        0.30,
  "Pure personalization", 0.78,        0.50,
  "Diversity-aware",      0.72,        0.85,
  "Exploration boost",    0.65,        0.90,
  "Hybrid (current)",     0.80,        0.75
)

# Scatter plot with one labelled point per strategy. The negative vjust
# places each label above its point.
tradeoff %>%
  ggplot(aes(x = Watch_Time, y = Ecosystem_Health, label = Strategy)) +
  geom_point(size = 5, color = "#9146FF") +
  geom_text(vjust = -1.2, size = 3.8) +
  xlim(0.5, 0.95) +
  ylim(0.2, 1.0) +
  theme_minimal() +
  labs(title = "Recommender Strategy Tradeoffs",
       x = "Short-term watch time",
       y = "Long-term ecosystem health")

# 4. REVERSE ENGINEERING - SIGNALS, ALGORITHM, LIVE CHALLENGE

# NOTE: written without `$` access in the scoring function to make
# this script robust to copy-paste through PDF/Word editors that
# sometimes strip the dollar sign.
 
# Seed the random number generator for this section.
set.seed(123)

# Candidate streamers, one row each.
streamers <- tribble(
  ~streamer_id, ~name,        ~category,       ~language, ~avg_viewers, ~is_live,
  "S1",         "xQc",        "Just Chatting", "EN",      70000,        TRUE,
  "S2",         "Pokimane",   "Just Chatting", "EN",      25000,        TRUE,
  "S3",         "shroud",     "Valorant",      "EN",      30000,        FALSE,
  "S4",         "Ninja",      "Fortnite",      "EN",      20000,        TRUE,
  "S5",         "Asmongold",  "WoW",           "EN",      60000,        TRUE,
  "S6",         "Valkyrae",   "GTA V",         "EN",      18000,        FALSE,
  "S7",         "summit1g",   "GTA V",         "EN",      22000,        TRUE,
  "S8",         "TommyInnit", "Minecraft",     "EN",      40000,        TRUE
)

# Users with their preferred category and language.
users <- tribble(
  ~user_id, ~fav_category,   ~fav_language,
  "U1",     "Just Chatting", "EN",
  "U2",     "Valorant",      "EN",
  "U3",     "Minecraft",     "EN",
  "U4",     "GTA V",         "EN",
  "U5",     "Just Chatting", "EN"
)

# User x streamer watch matrix, filled row by row: rows are named by
# user_id and columns by streamer_id.
watch_matrix <- matrix(
  c(120,  60,   0,  10,  80,   0,   5,   0,
      0,   5, 200,   0,   0,   0,  20,   0,
     10,   0,   0,  30,   5,   0,   0, 240,
      0,  15,  10,   0,   0, 180, 150,   0,
     90,  70,   0,   0, 110,   0,   0,  20),
  nrow = 5, byrow = TRUE,
  dimnames = list(pull(users, user_id), pull(streamers, streamer_id))
)
print(watch_matrix)
##     S1 S2  S3 S4  S5  S6  S7  S8
## U1 120 60   0 10  80   0   5   0
## U2   0  5 200  0   0   0  20   0
## U3  10  0   0 30   5   0   0 240
## U4   0 15  10  0   0 180 150   0
## U5  90 70   0  0 110   0   0  20
# Hybrid scoring function (uses pull() instead of $ to avoid copy issues).
#
# Ranks every streamer for one user by blending three component scores:
#   cf         - the user's watch_matrix row divided by its maximum; the
#                divisor is floored at 1 so an all-zero row stays at 0.
#   content    - 1.0 for a category match plus 0.3 for a language match,
#                divided by 1.3 and capped at 1.
#   popularity - avg_viewers relative to the largest value, multiplied by
#                0.2 for streamers that are not live.
#
# Reads the global objects `users`, `streamers` and `watch_matrix`.
#
# Args:
#   uid:       A single user id. It must match exactly one row of `users`
#              and be a row name of `watch_matrix`.
#   w_cf:      Weight of the collaborative-filtering component.
#   w_content: Weight of the content-match component.
#   w_pop:     Weight of the popularity component.
#
# Returns:
#   `streamers` with the added columns `cf`, `content`, `popularity`
#   (rounded to 2 decimals) and `final_score` (rounded to 3 decimals),
#   sorted by descending `final_score`.
#
# Errors:
#   Stops when `uid` is not a single non-missing value, or when it does not
#   identify exactly one user present in `watch_matrix`.
score_streamers <- function(uid, w_cf = 0.5, w_content = 0.3, w_pop = 0.2) {

  if (length(uid) != 1L || is.na(uid)) {
    stop("`uid` must be a single, non-missing user id.", call. = FALSE)
  }

  user_row <- filter(users, user_id == uid)
  if (nrow(user_row) != 1L || !(uid %in% rownames(watch_matrix))) {
    stop("Unknown user id: ", uid, call. = FALSE)
  }

  fav_cat      <- pull(user_row, fav_category)
  fav_lang     <- pull(user_row, fav_language)

  s_ids        <- pull(streamers, streamer_id)
  s_category   <- pull(streamers, category)
  s_language   <- pull(streamers, language)
  s_viewers    <- pull(streamers, avg_viewers)
  s_is_live    <- pull(streamers, is_live)

  # Collaborative-filtering proxy. Columns are selected by streamer id so the
  # vector lines up with the rows of `streamers` whatever the column order.
  cf_raw <- watch_matrix[uid, s_ids]
  cf_raw <- cf_raw / max(cf_raw, 1)

  # Content match
  content_raw <- (s_category == fav_cat) * 1.0 + (s_language == fav_lang) * 0.3
  content_raw <- pmin(content_raw / 1.3, 1)

  # Popularity / live boost
  pop_raw <- s_viewers / max(s_viewers)
  pop_raw <- pop_raw * ifelse(s_is_live, 1, 0.2)

  # Blend the unrounded components before mutate(). Inside mutate() a name
  # that matches a newly created column refers to that (rounded) column, so
  # blending there would mix rounded and unrounded inputs.
  blended <- w_cf * cf_raw + w_content * content_raw + w_pop * pop_raw

  streamers %>%
    mutate(cf          = round(cf_raw, 2),
           content     = round(content_raw, 2),
           popularity  = round(pop_raw, 2),
           final_score = round(blended, 3)) %>%
    arrange(desc(final_score))
}
 
# Score and rank all streamers for user "U1" with the default weights,
# then print the ranked table.
recs_u1 <- score_streamers(uid = "U1")
print(recs_u1)
## # A tibble: 8 × 10
##   streamer_id name       category     language avg_viewers is_live    cf content
##   <chr>       <chr>      <chr>        <chr>          <dbl> <lgl>   <dbl>   <dbl>
## 1 S1          xQc        Just Chatti… EN             70000 TRUE     1       1   
## 2 S2          Pokimane   Just Chatti… EN             25000 TRUE     0.5     1   
## 3 S5          Asmongold  WoW          EN             60000 TRUE     0.67    0.23
## 4 S8          TommyInnit Minecraft    EN             40000 TRUE     0       0.23
## 5 S4          Ninja      Fortnite     EN             20000 TRUE     0.08    0.23
## 6 S7          summit1g   GTA V        EN             22000 TRUE     0.04    0.23
## 7 S3          shroud     Valorant     EN             30000 FALSE    0       0.23
## 8 S6          Valkyrae   GTA V        EN             18000 FALSE    0       0.23
## # ℹ 2 more variables: popularity <dbl>, final_score <dbl>
# Horizontal bar chart of U1's hybrid scores, ordered by score and coloured
# by whether the streamer is live.
recs_u1 %>%
  ggplot(aes(x = reorder(name, final_score), y = final_score,
             fill = is_live)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(
    values = c("TRUE" = "#9146FF", "FALSE" = "grey70"),
    labels = c("TRUE" = "Live now", "FALSE" = "Offline")
  ) +
  theme_minimal() +
  labs(title = "Final Recommendation Scores for User U1",
       x = NULL, y = "Hybrid score", fill = NULL)

# 5. COMPARISON WITH AMAZON / NYT

 
# Intensity scores (1-5) per dimension and platform in long format,
# listed platform by platform.
comp_scores <- tribble(
  ~Dimension,               ~Platform, ~Score,
  "Item churn",             "Twitch",  5,
  "Personalization weight", "Twitch",  4,
  "Cold-start severity",    "Twitch",  5,
  "Real-time pressure",     "Twitch",  5,
  "Diversity need",         "Twitch",  4,
  "Item churn",             "Amazon",  2,
  "Personalization weight", "Amazon",  5,
  "Cold-start severity",    "Amazon",  3,
  "Real-time pressure",     "Amazon",  2,
  "Diversity need",         "Amazon",  3,
  "Item churn",             "NYT",     5,
  "Personalization weight", "NYT",     3,
  "Cold-start severity",    "NYT",     5,
  "Real-time pressure",     "NYT",     4,
  "Diversity need",         "NYT",     5
)

# Grouped bar chart: one cluster per dimension, one bar per platform.
# theme() is added after theme_minimal() so the tilted x labels are kept.
comp_scores %>%
  ggplot(aes(x = Dimension, y = Score, fill = Platform)) +
  geom_col(position = "dodge") +
  labs(title = "Recommender Pressure Profile by Platform",
       y = "Intensity (1-5)", x = NULL) +
  scale_fill_manual(values = c("Twitch" = "#9146FF",
                               "Amazon" = "#FF9900",
                               "NYT"    = "#326891")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 20, hjust = 1))

# 6. RECOMMENDATIONS FOR IMPROVEMENT

 
# Proposed improvements with their listing number N and a relative
# expected-impact score.
improvements <- tribble(
  ~N, ~Recommendation,                          ~Expected_Impact,
  1L, "Better explanation of recs",             4,
  2L, "Diversity boost for emerging streamers", 9,
  3L, "Smarter mid-stream re-categorization",   6,
  4L, "Cross-device session continuity",        3
)

# Horizontal bar chart; bars are ordered by the listing number N, not by
# impact. The legend is hidden because the axis already names each bar.
improvements %>%
  ggplot(aes(x = reorder(Recommendation, N), y = Expected_Impact,
             fill = Recommendation)) +
  geom_col(show.legend = FALSE) +
  scale_fill_brewer(palette = "Set2") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Estimated Relative Impact of Recommendations",
       x = NULL, y = "Relative impact")

# 7. CONCLUSION

 
# Print the three takeaway sentences, one per line.
# paste() joins the two fragments of each sentence with a single space, and
# cat() is called with sep = "" so that no separator space is written after
# the "\n" that ends each sentence (the default sep = " " would indent the
# second and third lines by one space).
cat(
  paste("Twitch's recommender must blend collaborative, content, and",
        "live-popularity signals.\n"),
  paste("It differs most from Amazon and NYT on item lifetime and",
        "real-time pressure.\n"),
  paste("Key improvement areas: explainability, diversity, mid-stream",
        "re-tagging, cross-device.\n"),
  sep = ""
)
## Twitch's recommender must blend collaborative, content, and live-popularity signals.
## It differs most from Amazon and NYT on item lifetime and real-time pressure.
## Key improvement areas: explainability, diversity, mid-stream re-tagging, cross-device.
# 8. REFERENCES

# Twitch developer docs:        https://dev.twitch.tv/docs
# Linden, Smith, York (2003)    Amazon.com Recommendations: Item-to-Item Collaborative Filtering
# Covington et al. (2016)       Deep Neural Networks for YouTube Recommendations
# Spangher (2015)               Building the Next New York Times Recommendation Engine
# Gomez-Uribe & Hunt (2015)     The Netflix Recommender System: Algorithms, Business Value, and Innovation
# Ricci et al. (2022)           Recommender Systems Handbook, 3rd ed.