knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## Warning: пакет 'tidyverse' был собран под R версии 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(knitr)
## Warning: пакет 'knitr' был собран под R версии 4.5.3
library(kableExtra)
## Warning: пакет 'kableExtra' был собран под R версии 4.5.3
##
## Присоединяю пакет: 'kableExtra'
##
## Следующий объект скрыт от 'package:dplyr':
##
## group_rows
library(scales)
## Warning: пакет 'scales' был собран под R версии 4.5.3
##
## Присоединяю пакет: 'scales'
##
## Следующий объект скрыт от 'package:purrr':
##
## discard
##
## Следующий объект скрыт от 'package:readr':
##
## col_factor
library(RColorBrewer)
# 1. INTRODUCTION - WHY TWITCH, WHAT MAKES IT UNIQUE
# Headline platform figures, one row per metric. Values stay as character
# strings because they mix suffixes ("M", "M+") with bare numbers.
twitch_facts <- tribble(
  ~Metric,                     ~Value,
  "Monthly active users",      "140M+",
  "Avg. concurrent viewers",   "2.5M",
  "Active streamers / month",  "8M",
  "Hours watched / month (B)", "1.6",
  "Avg. session length (min)", "95"
)
print(twitch_facts)
## # A tibble: 5 × 2
## Metric Value
## <chr> <chr>
## 1 Monthly active users 140M+
## 2 Avg. concurrent viewers 2.5M
## 3 Active streamers / month 8M
## 4 Hours watched / month (B) 1.6
## 5 Avg. session length (min) 95
# Hours watched (in millions) for each content category.
content_mix <- tribble(
  ~Category,           ~Hours_M,
  "Just Chatting",     420,
  "League of Legends", 180,
  "GTA V",             150,
  "Valorant",          130,
  "Minecraft",         110,
  "Fortnite",          95,
  "Music",             70,
  "Other",             380
)
# Horizontal bar chart with categories ordered by hours watched; the legend
# is suppressed because the axis labels already name each bar.
ggplot(
  data = content_mix,
  mapping = aes(
    x = reorder(Category, Hours_M),
    y = Hours_M,
    fill = Category
  )
) +
  geom_col(show.legend = FALSE) +
  scale_fill_brewer(palette = "Set2") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Hours Watched by Top Twitch Categories",
    x = NULL,
    y = "Hours watched (millions)"
  )

# 2. SCENARIO DESIGN - USER PERSPECTIVE
# User-side scenarios: one row per scenario, giving the user's goal, the
# signals a recommender would lean on, and the metric that indicates success.
user_scenarios <- tribble(
  ~Scenario, ~User_Goal, ~Signal_Used, ~Success_Metric,
  "Returning fan", "Catch favorite streamer live",
  "Follow list, watch history", "CTR to followed channel",
  "Bored browser", "Find anything entertaining",
  "Watch history, time-of-day", "Watch time > 5 min",
  "Game-focused viewer", "Watch a specific game",
  "Category page, language", "Time on category page",
  "New user onboarding", "Discover the platform",
  "Sign-up survey, popularity", "Day-7 retention",
  "Mobile, short session", "Quick 10-min watch",
  "Device type, short clips", "Clip completion rate"
)
print(user_scenarios)
## # A tibble: 5 × 4
## Scenario User_Goal Signal_Used Success_Metric
## <chr> <chr> <chr> <chr>
## 1 Returning fan Catch favorite streamer live Follow list… CTR to follow…
## 2 Bored browser Find anything entertaining Watch histo… Watch time > …
## 3 Game-focused viewer Watch a specific game Category pa… Time on categ…
## 4 New user onboarding Discover the platform Sign-up sur… Day-7 retenti…
## 5 Mobile, short session Quick 10-min watch Device type… Clip completi…
# Simulate daily watch minutes over a 30-day window for two user archetypes:
#   * "New user"      - exponential decay from ~100 minutes plus noise (sd 3)
#   * "Returning fan" - sinusoidal pattern around 60 minutes plus noise (sd 4)
# The seed is fixed so the simulated series (and the plot) are reproducible.
set.seed(42)
days <- 1:30
n_days <- length(days)
# Draw order matters for reproducibility: new-user noise is sampled first,
# returning-fan noise second.
new_user_minutes <- 100 * exp(-0.08 * days) + rnorm(n_days, mean = 0, sd = 3)
returning_fan_minutes <- 60 + 20 * sin(days / 3) +
  rnorm(n_days, mean = 0, sd = 4)
journey <- tibble(
  day = rep(days, times = 2),
  engagement = c(new_user_minutes, returning_fan_minutes),
  user_type = rep(c("New user", "Returning fan"), each = n_days)
)
ggplot(journey, aes(day, engagement, color = user_type)) +
  # `linewidth` replaces the `size` aesthetic for lines (deprecated in
  # ggplot2 3.4.0), which removes the deprecation warning.
  geom_line(linewidth = 1.1) +
  geom_point() +
  labs(title = "Simulated Engagement Patterns by User Type",
       x = "Day", y = "Daily watch minutes", color = "User type") +
  theme_minimal() +
  # Colours are keyed by level name so the mapping does not depend on
  # factor/alphabetical ordering.
  scale_color_manual(values = c("New user" = "#9146FF",
                                "Returning fan" = "#00B5AD"))

# 3. SCENARIO DESIGN - BUSINESS PERSPECTIVE
# Business-side objectives: each row pairs an objective with the KPI used to
# track it and the recommender lever that can move it.
biz_objectives <- tribble(
  ~Objective, ~KPI, ~Lever,
  "Maximize watch time", "Avg. minutes/session",
  "Personalized 'Live now' rail",
  "Grow subs/Bits", "Conversion to paid",
  "Promote sub-only perks",
  "Support emerging streamers", "% views to <1k follower channels",
  "Diversity boost in feed",
  "Reduce churn", "Day-30 retention",
  "Re-engagement notifications",
  "Brand safety", "% flagged streams surfaced",
  "Filter low-trust signals"
)
print(biz_objectives)
## # A tibble: 5 × 3
## Objective KPI Lever
## <chr> <chr> <chr>
## 1 Maximize watch time Avg. minutes/session Personalized 'Liv…
## 2 Grow subs/Bits Conversion to paid Promote sub-only …
## 3 Support emerging streamers % views to <1k follower channels Diversity boost i…
## 4 Reduce churn Day-30 retention Re-engagement not…
## 5 Brand safety % flagged streams surfaced Filter low-trust …
# Illustrative positioning of recommender strategies on two axes: short-term
# watch time versus long-term ecosystem health.
tradeoff <- tribble(
  ~Strategy,              ~Watch_Time, ~Ecosystem_Health,
  "Pure popularity",      0.85,        0.30,
  "Pure personalization", 0.78,        0.50,
  "Diversity-aware",      0.72,        0.85,
  "Exploration boost",    0.65,        0.90,
  "Hybrid (current)",     0.80,        0.75
)
# Labelled scatter plot; each label is nudged above its point via `vjust`.
ggplot(
  data = tradeoff,
  mapping = aes(x = Watch_Time, y = Ecosystem_Health, label = Strategy)
) +
  geom_point(color = "#9146FF", size = 5) +
  geom_text(size = 3.8, vjust = -1.2) +
  xlim(0.5, 0.95) +
  ylim(0.2, 1.0) +
  theme_minimal() +
  labs(
    title = "Recommender Strategy Tradeoffs",
    x = "Short-term watch time",
    y = "Long-term ecosystem health"
  )

# 4. REVERSE ENGINEERING - SIGNALS, ALGORITHM, LIVE CHALLENGE
# NOTE: this section deliberately avoids `$` column access (pull() is used
# instead) so the script survives copy-paste through PDF/Word editors that
# sometimes strip the dollar sign.
set.seed(123)
# Streamer catalogue: one row per streamer with category, language, average
# viewer count, and whether the streamer is currently live.
streamers <- tribble(
  ~streamer_id, ~name, ~category, ~language, ~avg_viewers, ~is_live,
  "S1", "xQc",        "Just Chatting", "EN", 70000, TRUE,
  "S2", "Pokimane",   "Just Chatting", "EN", 25000, TRUE,
  "S3", "shroud",     "Valorant",      "EN", 30000, FALSE,
  "S4", "Ninja",      "Fortnite",      "EN", 20000, TRUE,
  "S5", "Asmongold",  "WoW",           "EN", 60000, TRUE,
  "S6", "Valkyrae",   "GTA V",         "EN", 18000, FALSE,
  "S7", "summit1g",   "GTA V",         "EN", 22000, TRUE,
  "S8", "TommyInnit", "Minecraft",     "EN", 40000, TRUE
)
# User profiles: favourite category and preferred language per user.
users <- tribble(
  ~user_id, ~fav_category,   ~fav_language,
  "U1",     "Just Chatting", "EN",
  "U2",     "Valorant",      "EN",
  "U3",     "Minecraft",     "EN",
  "U4",     "GTA V",         "EN",
  "U5",     "Just Chatting", "EN"
)
# User x streamer watch matrix: rows are users, columns are streamers, both
# labelled with their ids so rows can be looked up by user id.
watch_matrix <- matrix(
  c(
    120, 60,   0, 10,  80,   0,   5,   0,
      0,  5, 200,  0,   0,   0,  20,   0,
     10,  0,   0, 30,   5,   0,   0, 240,
      0, 15,  10,  0,   0, 180, 150,   0,
     90, 70,   0,  0, 110,   0,   0,  20
  ),
  nrow = 5,
  byrow = TRUE,
  dimnames = list(pull(users, user_id), pull(streamers, streamer_id))
)
print(watch_matrix)
## S1 S2 S3 S4 S5 S6 S7 S8
## U1 120 60 0 10 80 0 5 0
## U2 0 5 200 0 0 0 20 0
## U3 10 0 0 30 5 0 0 240
## U4 0 15 10 0 0 180 150 0
## U5 90 70 0 0 110 0 0 20
#' Rank every streamer for one user with a hybrid recommendation score.
#'
#' Blends three signals, each scaled to the range 0-1:
#'   * collaborative-filtering proxy: the user's row of `watch_matrix`,
#'     divided by the larger of its maximum and 1;
#'   * content match: 1.0 for a favourite-category match plus 0.3 for a
#'     language match, divided by 1.3 and capped at 1;
#'   * popularity: average viewers relative to the largest streamer,
#'     multiplied by 0.2 when the streamer is not live.
#'
#' Reads the script-level objects `users`, `streamers` and `watch_matrix`.
#' Uses pull() instead of `$` so the code survives copy-paste through editors
#' that strip the dollar sign.
#'
#' @param uid A single user id (character) present in `users` and in the row
#'   names of `watch_matrix`.
#' @param w_cf Weight of the collaborative-filtering signal.
#' @param w_content Weight of the content-match signal.
#' @param w_pop Weight of the popularity signal.
#' @return `streamers` with added columns `cf`, `content`, `popularity`
#'   (rounded to 2 decimals, for display) and `final_score` (rounded to 3
#'   decimals), sorted by `final_score` in descending order.
score_streamers <- function(uid, w_cf = 0.5, w_content = 0.3, w_pop = 0.2) {
  # Fail early with a clear message instead of an opaque subscript or
  # column-size error further down.
  if (!is.character(uid) || length(uid) != 1L || is.na(uid)) {
    stop("`uid` must be a single non-missing character string.",
         call. = FALSE)
  }
  user_row <- filter(users, user_id == uid)
  if (nrow(user_row) != 1L || !(uid %in% rownames(watch_matrix))) {
    stop("Unknown or ambiguous user id: ", uid, call. = FALSE)
  }

  fav_cat <- pull(user_row, fav_category)
  fav_lang <- pull(user_row, fav_language)
  s_category <- pull(streamers, category)
  s_language <- pull(streamers, language)
  s_viewers <- pull(streamers, avg_viewers)
  s_is_live <- pull(streamers, is_live)

  # Collaborative-filtering proxy. max(., 1) avoids dividing by zero for a
  # user with no watch history.
  cf_signal <- watch_matrix[uid, ]
  cf_signal <- cf_signal / max(cf_signal, 1)

  # Content match
  content_signal <- (s_category == fav_cat) * 1.0 +
    (s_language == fav_lang) * 0.3
  content_signal <- pmin(content_signal / 1.3, 1)

  # Popularity / live boost
  pop_signal <- s_viewers / max(s_viewers)
  pop_signal <- pop_signal * ifelse(s_is_live, 1, 0.2)

  # Blend the full-precision signals BEFORE any rounding. Computing this
  # inside mutate() from names that match the new display columns would pick
  # up the already-rounded columns and mix rounded with unrounded inputs.
  blended <- w_cf * cf_signal + w_content * content_signal + w_pop * pop_signal

  streamers %>%
    mutate(cf = round(cf_signal, 2),
           content = round(content_signal, 2),
           popularity = round(pop_signal, 2),
           final_score = round(blended, 3)) %>%
    arrange(desc(final_score))
}
# Score all streamers for user U1 and show the ranked table.
recs_u1 <- score_streamers(uid = "U1")
print(recs_u1)
## # A tibble: 8 × 10
## streamer_id name category language avg_viewers is_live cf content
## <chr> <chr> <chr> <chr> <dbl> <lgl> <dbl> <dbl>
## 1 S1 xQc Just Chatti… EN 70000 TRUE 1 1
## 2 S2 Pokimane Just Chatti… EN 25000 TRUE 0.5 1
## 3 S5 Asmongold WoW EN 60000 TRUE 0.67 0.23
## 4 S8 TommyInnit Minecraft EN 40000 TRUE 0 0.23
## 5 S4 Ninja Fortnite EN 20000 TRUE 0.08 0.23
## 6 S7 summit1g GTA V EN 22000 TRUE 0.04 0.23
## 7 S3 shroud Valorant EN 30000 FALSE 0 0.23
## 8 S6 Valkyrae GTA V EN 18000 FALSE 0 0.23
## # ℹ 2 more variables: popularity <dbl>, final_score <dbl>
# Horizontal bars of the hybrid score, ordered by score and coloured by
# whether the streamer is live.
ggplot(
  data = recs_u1,
  mapping = aes(
    x = reorder(name, final_score),
    y = final_score,
    fill = is_live
  )
) +
  geom_col() +
  scale_fill_manual(
    values = c("TRUE" = "#9146FF", "FALSE" = "grey70"),
    labels = c("TRUE" = "Live now", "FALSE" = "Offline")
  ) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Final Recommendation Scores for User U1",
    x = NULL,
    y = "Hybrid score",
    fill = NULL
  )

# 5. COMPARISON WITH AMAZON / NYT
# Intensity scores (1-5) per dimension and platform, in long format:
# five dimensions for each of Twitch, Amazon and NYT.
comp_scores <- tribble(
  ~Dimension,               ~Platform, ~Score,
  "Item churn",             "Twitch",  5,
  "Personalization weight", "Twitch",  4,
  "Cold-start severity",    "Twitch",  5,
  "Real-time pressure",     "Twitch",  5,
  "Diversity need",         "Twitch",  4,
  "Item churn",             "Amazon",  2,
  "Personalization weight", "Amazon",  5,
  "Cold-start severity",    "Amazon",  3,
  "Real-time pressure",     "Amazon",  2,
  "Diversity need",         "Amazon",  3,
  "Item churn",             "NYT",     5,
  "Personalization weight", "NYT",     3,
  "Cold-start severity",    "NYT",     5,
  "Real-time pressure",     "NYT",     4,
  "Diversity need",         "NYT",     5
)
# Grouped bars: one cluster per dimension, one bar per platform.
ggplot(
  data = comp_scores,
  mapping = aes(x = Dimension, y = Score, fill = Platform)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    values = c(
      "Twitch" = "#9146FF",
      "Amazon" = "#FF9900",
      "NYT" = "#326891"
    )
  ) +
  labs(
    title = "Recommender Pressure Profile by Platform",
    x = NULL,
    y = "Intensity (1-5)"
  ) +
  theme_minimal() +
  # Tilt the dimension labels so the longer names do not overlap.
  theme(axis.text.x = element_text(angle = 20, hjust = 1))

# 6. RECOMMENDATIONS FOR IMPROVEMENT
# Proposed improvements with a relative impact estimate; `N` is the
# presentation order used to sort the bars.
improvements <- tribble(
  ~N, ~Recommendation,                          ~Expected_Impact,
  1L, "Better explanation of recs",             4,
  2L, "Diversity boost for emerging streamers", 9,
  3L, "Smarter mid-stream re-categorization",   6,
  4L, "Cross-device session continuity",        3
)
# Horizontal bars ordered by `N`; the legend is suppressed because the axis
# labels already name each recommendation.
ggplot(
  data = improvements,
  mapping = aes(
    x = reorder(Recommendation, N),
    y = Expected_Impact,
    fill = Recommendation
  )
) +
  geom_col(show.legend = FALSE) +
  scale_fill_brewer(palette = "Set2") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Estimated Relative Impact of Recommendations",
    x = NULL,
    y = "Relative impact"
  )

# 7. CONCLUSION
# Print the three-sentence conclusion, one sentence per line.
# sep = "" is required: with cat()'s default sep = " ", the separator is also
# emitted after each "\n", so the second and third lines would start with a
# stray space. Word spacing is therefore written into the strings themselves.
cat("Twitch's recommender must blend collaborative, content, and ",
    "live-popularity signals.\n",
    "It differs most from Amazon and NYT on item lifetime and ",
    "real-time pressure.\n",
    "Key improvement areas: explainability, diversity, mid-stream ",
    "re-tagging, cross-device.\n",
    sep = "")
## Twitch's recommender must blend collaborative, content, and live-popularity signals.
## It differs most from Amazon and NYT on item lifetime and real-time pressure.
## Key improvement areas: explainability, diversity, mid-stream re-tagging, cross-device.
# 8. REFERENCES
# Twitch developer docs: https://dev.twitch.tv/docs
# Linden, Smith, York (2003) Item-to-item collaborative filtering
# Covington et al. (2016) Deep NN for YouTube Recommendations
# Spangher (2015) Building NYT recommendation engine
# Gomez-Uribe & Hunt (2015) The Netflix Recommender System
# Ricci et al. (2022) Recommender Systems Handbook, 3rd ed.