# Data source:
# https://www.kaggle.com/datasets/datasnaek/chess?resource=download
#
# The rest of the script uses paths relative to the project folder
# ("games.csv" and the "./*.png" icons). Only switch to the author's folder
# when it exists on this machine; otherwise keep the current working
# directory so the script also runs from a copy of the project elsewhere.
project_dir <- "~/Desktop/DSCI_304/Final_Project"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
# Fail early with an actionable message rather than read.csv()'s generic one.
if (!file.exists("games.csv")) {
  stop(
    "games.csv not found in ", getwd(),
    "; run this script from the project folder.",
    call. = FALSE
  )
}
games <- read.csv("games.csv")
# View(games)
library(ggplot2)
# Overlaid density curves of game length (turns), filled by winner.
ggplot(data = games) +
  geom_density(mapping = aes(x = turns, fill = winner), alpha = 0.6) +
  ggtitle("Density of Game Lengths by Winner") +
  xlab("Number of Turns") +
  ylab("Density") +
  labs(fill = "Winner") +
  theme_minimal()

library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Names of the 15 most frequently played openings
top_openings <- games %>%
  count(opening_name, sort = TRUE) %>%
  pull(opening_name) %>%
  head(15)

# Keep only the games that used one of those openings
games_top <- filter(games, opening_name %in% top_openings)

# Games per (opening, winner), then each opening's total and the share of
# that total taken by each winner
summary_df <- games_top %>%
  count(opening_name, winner, name = "n") %>%
  group_by(opening_name) %>%
  mutate(
    total = sum(n),
    pct = n / total
  ) %>%
  ungroup()
# Interactive stacked bars: one bar per opening, split by winner.
# The x factor is reordered on -total; the hover text shows the winner, the
# game count and the winner's share for that opening.
summary_df %>%
  plot_ly(
    type = "bar",
    x = ~reorder(opening_name, -total),
    y = ~n,
    color = ~winner,
    hoverinfo = "text",
    text = ~paste0(
      "Winner: ", winner, "<br>",
      "Games: ", n, "<br>",
      "Win %: ", scales::percent(pct, accuracy = 0.1)
    )
  ) %>%
  layout(
    title = "Number of Games by Opening and Winner",
    barmode = "stack",
    xaxis = list(title = "Opening Name"),
    yaxis = list(title = "Number of Games")
  )
library(dplyr)
library(stringr)
library(interactions)
library(ggplot2)
# Preprocess time controls and bin them into categories.
# increment_code contains things like "10+0", "5+5", "3+2": the leading
# digits become base_time and the digits after "+" become increment.
# Unparseable pieces are set to 0, so rows without a usable base time are
# removed by the base_time > 0 filter.
games2 <- games %>%
  mutate(
    avg_rating = (white_rating + black_rating) / 2,
    base_time = coalesce(
      as.numeric(str_extract(increment_code, "^[0-9]+")),
      0
    ),
    increment = coalesce(
      as.numeric(str_extract(increment_code, "(?<=\\+)[0-9]+")),
      0
    )
  ) %>%
  filter(base_time > 0, turns > 0, avg_rating > 0) %>%
  # Convert base_time → time control category. Intervals are right-closed:
  # (.., 3], (3, 10], (10, 25], (25, ..), with levels in that order.
  mutate(
    base_time_cat = cut(
      base_time,
      breaks = c(-Inf, 3, 10, 25, Inf),
      labels = c(
        "Bullet (1–3 min)",
        "Blitz (4–10 min)",
        "Rapid (11–25 min)",
        "Classical"
      )
    )
  )
# Interaction model
# Linear regression of game length (turns) on average rating, time-control
# category, and their interaction: `*` in the formula expands to both main
# effects plus the avg_rating:base_time_cat terms.
m <- lm(turns ~ avg_rating * base_time_cat, data = games2)
# Print the coefficient table and fit statistics.
summary(m)
##
## Call:
## lm(formula = turns ~ avg_rating * base_time_cat, data = games2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72.648 -23.588 -5.183 18.087 275.067
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 20.6787824 16.6932873 1.239
## avg_rating 0.0196658 0.0103690 1.897
## base_time_catBlitz (4–10 min) 8.5802388 16.7890298 0.511
## base_time_catRapid (11–25 min) 7.3006811 16.9198757 0.431
## base_time_catClassical -4.5051994 17.4845420 -0.258
## avg_rating:base_time_catBlitz (4–10 min) 0.0004847 0.0104273 0.046
## avg_rating:base_time_catRapid (11–25 min) 0.0004847 0.0105135 0.046
## avg_rating:base_time_catClassical 0.0045423 0.0108676 0.418
## Pr(>|t|)
## (Intercept) 0.2155
## avg_rating 0.0579 .
## base_time_catBlitz (4–10 min) 0.6093
## base_time_catRapid (11–25 min) 0.6661
## base_time_catClassical 0.7967
## avg_rating:base_time_catBlitz (4–10 min) 0.9629
## avg_rating:base_time_catRapid (11–25 min) 0.9632
## avg_rating:base_time_catClassical 0.6760
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33.06 on 20005 degrees of freedom
## Multiple R-squared: 0.02926, Adjusted R-squared: 0.02892
## F-statistic: 86.13 on 7 and 20005 DF, p-value: < 2.2e-16
# Interaction plot: model predictions across avg_rating, moderated by
# base_time_cat, with interval bands of width 0.8 and no raw points.
interact_plot(
  m,
  pred = avg_rating,
  modx = base_time_cat,
  plot.points = FALSE,
  interval = TRUE,
  int.width = 0.8
) +
  ggtitle("Interaction: Rating × Time Control → Game Length") +
  xlab("Average Player Rating") +
  ylab("Predicted Game Length (Turns)") +
  theme_minimal(base_size = 14)

library(dplyr)
library(ggplot2)
library(ggimage)
# Categorize games by time control.
# base_minutes is the part of increment_code before the "+". Thresholds are
# checked in ascending order, so each game lands in the first bin it fits;
# a base_minutes of NA matches none of them and falls through to "Other".
# NOTE(review): Rapid/Classical split at 60 here, while the base_time_cat
# bins used for the regression split at 25 - confirm this is intended.
games_time <- games %>%
  mutate(
    base_minutes = as.numeric(sub("\\+.*", "", increment_code)),
    format = case_when(
      base_minutes <= 3 ~ "Bullet",
      base_minutes <= 10 ~ "Blitz",
      base_minutes <= 60 ~ "Rapid",
      base_minutes > 60 ~ "Classical",
      TRUE ~ "Other"
    )
  )

# Count games per format
format_counts <- games_time %>%
  group_by(format) %>%
  tally(name = "n")
# Add icons: look up each format's image file by name. Formats without an
# entry (e.g. "Other") get NA.
icon_paths <- c(
  Classical = "./Classical.png",
  Rapid = "./Rapid.png",
  Blitz = "./Blitz.png",
  Bullet = "./Bullet.png"
)
format_counts$img <- unname(icon_paths[as.character(format_counts$format)])

# Define icon value: each icon stands for this many games
icon_value <- 3500

# Expand the data: one row per icon.
# Formats without an image are left out of the icon layer (they still get a
# bar), so no NA path is handed to geom_image().
icon_rows <- format_counts[!is.na(format_counts$img), ]
# Icons per format, rounded up so every format with an image shows at least one.
n_icons <- ceiling(icon_rows$n / icon_value)
# Vectorised expansion (replaces the superseded rowwise() + do()): repeat each
# format n_icons times and centre the k-th icon at (k - 0.5) * icon_value.
format_icons <- data.frame(
  format = rep(icon_rows$format, times = n_icons),
  icon_y = (sequence(n_icons) - 0.5) * icon_value,
  img = rep(icon_rows$img, times = n_icons)
)
# Combined bar + pictogram: bars give the exact count per format, and the
# icons from format_icons are drawn on top of them at their icon_y heights.
ggplot(mapping = aes(x = format)) +
  geom_col(
    data = format_counts,
    mapping = aes(y = n),
    fill = "steelblue"
  ) +
  geom_image(
    data = format_icons,
    mapping = aes(y = icon_y, image = img),
    size = 0.25
  ) +
  ggtitle("Distribution of Chess Formats (Pictogram + Bars)") +
  xlab("Format") +
  ylab("Number of Games") +
  theme_minimal()

library(dplyr)
library(ggplot2)
# Categorize by format.
# base_minutes is the text before "+" in increment_code, as a number. Rows
# where it is NA become "Other"; the rest are binned by ascending cut-off,
# with everything above 60 labelled "Classical".
games_time <- games %>%
  mutate(base_minutes = as.numeric(sub("\\+.*", "", increment_code))) %>%
  mutate(
    format = case_when(
      is.na(base_minutes) ~ "Other",
      base_minutes <= 3 ~ "Bullet",
      base_minutes <= 10 ~ "Blitz",
      base_minutes <= 60 ~ "Rapid",
      TRUE ~ "Classical"
    )
  )
# Distribution of opening_ply for each time-control format, with the median
# marked; the fill legend is hidden because it repeats the x axis.
ggplot(
  data = games_time,
  mapping = aes(x = format, y = opening_ply, fill = format)
) +
  # violin shows distribution
  geom_violin(alpha = 0.6, trim = FALSE) +
  # median line
  stat_summary(
    geom = "crossbar",
    fun = median,
    color = "black",
    width = 0.3
  ) +
  ggtitle("Opening Depth by Time Control") +
  xlab("Time Control") +
  ylab("Opening Ply (Book Moves)") +
  theme_minimal() +
  theme(legend.position = "none")
