# Data source (Kaggle chess games dataset):
# https://www.kaggle.com/datasets/datasnaek/chess?resource=download
#
# Switch to the project folder only when it exists on this machine. A bare
# setwd() on a hard-coded, user-specific path aborts the whole script anywhere
# else; with the guard, the script falls back to the current working directory.
project_dir <- "~/Desktop/DSCI_304/Final_Project"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}

# Fail early with a message that says where the file was expected.
if (!file.exists("games.csv")) {
  stop("games.csv not found in ", getwd(), call. = FALSE)
}
games <- read.csv("games.csv")
# View(games)
library(ggplot2)
# Overlaid, semi-transparent density curves of game length (turns), one curve
# per value of `winner`.
ggplot(data = games, mapping = aes(x = turns, fill = winner)) +
  geom_density(alpha = 0.6) +
  theme_minimal() +
  ggtitle("Density of Game Lengths by Winner") +
  xlab("Number of Turns") +
  ylab("Density") +
  labs(fill = "Winner")

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Top 15 openings: names of the most frequently played openings, most common
# first
top_openings <- games %>%
  count(opening_name, sort = TRUE) %>%
  pull(opening_name) %>%
  head(15)

# Filter to top 15: keep only games whose opening is in that list
games_top <- filter(games, opening_name %in% top_openings)

# Count number of games by opening and winner, then add, per opening, the
# total number of games and each winner's share of that total
summary_df <- games_top %>%
  count(opening_name, winner, name = "n") %>%
  group_by(opening_name) %>%
  mutate(total = sum(n)) %>%
  mutate(pct = n / total) %>%
  ungroup()

# Interactive stacked bar chart: one bar per opening, ordered by total number
# of games (largest first) and stacked by winner.
#
# The tooltip is supplied as `hovertext` rather than `text`. For bar traces
# `text` is also eligible to be drawn on the bars themselves, which would
# print the multi-line HTML label inside every segment; `hovertext` is only
# ever shown on hover, and `hoverinfo = "text"` restricts the tooltip to it.
plot_ly(
  summary_df,
  x = ~reorder(opening_name, -total),
  y = ~n,
  color = ~winner,
  type = "bar",
  hovertext = ~paste0(
    "Winner: ", winner, "<br>",
    "Games: ", n, "<br>",
    "Win %: ", scales::percent(pct, accuracy = 0.1)
  ),
  hoverinfo = "text"
) %>%
  layout(
    barmode = "stack",
    title = "Number of Games by Opening and Winner",
    xaxis = list(title = "Opening Name"),
    yaxis = list(title = "Number of Games")
  )
library(dplyr)
library(stringr)
library(interactions)
library(ggplot2)


# Preprocess time controls
#
# increment_code contains things like "10+0", "5+5", "3+2": the leading digits
# become base_time and the digits after the "+" become increment. Anything
# that fails to parse is set to 0, and rows without a positive base_time,
# turns and avg_rating are then dropped.
games2 <- games %>%
  mutate(
    avg_rating = (white_rating + black_rating) / 2,
    base_time = coalesce(
      as.numeric(str_extract(increment_code, "^[0-9]+")),
      0
    ),
    increment = coalesce(
      as.numeric(str_extract(increment_code, "(?<=\\+)[0-9]+")),
      0
    )
  ) %>%
  filter(base_time > 0 & turns > 0 & avg_rating > 0)


# Convert base_time → time control category
#
# cut() bins on right-closed intervals, (-Inf, 3], (3, 10], (10, 25] and
# (25, Inf), and returns a factor whose levels follow the order of `labels`.
# base_time is non-missing and positive at this point (missing values were
# replaced by 0 and then filtered out), so every row falls into a bin.
games2 <- games2 %>%
  mutate(
    base_time_cat = cut(
      base_time,
      breaks = c(-Inf, 3, 10, 25, Inf),
      labels = c(
        "Bullet (1–3 min)",
        "Blitz (4–10 min)",
        "Rapid (11–25 min)",
        "Classical"
      ),
      right = TRUE
    )
  )


# Interaction model
#
# Linear regression of game length (turns) on average rating, time-control
# category and their interaction. In an R formula `a * b` expands to
# a + b + a:b, so each non-reference category gets its own intercept shift and
# its own adjustment to the rating slope.


m <- lm(turns ~ avg_rating * base_time_cat, data = games2)

# Print the coefficient table and overall fit statistics
summary(m)
## 
## Call:
## lm(formula = turns ~ avg_rating * base_time_cat, data = games2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -72.648 -23.588  -5.183  18.087 275.067 
## 
## Coefficients:
##                                             Estimate Std. Error t value
## (Intercept)                               20.6787824 16.6932873   1.239
## avg_rating                                 0.0196658  0.0103690   1.897
## base_time_catBlitz (4–10 min)              8.5802388 16.7890298   0.511
## base_time_catRapid (11–25 min)             7.3006811 16.9198757   0.431
## base_time_catClassical                    -4.5051994 17.4845420  -0.258
## avg_rating:base_time_catBlitz (4–10 min)   0.0004847  0.0104273   0.046
## avg_rating:base_time_catRapid (11–25 min)  0.0004847  0.0105135   0.046
## avg_rating:base_time_catClassical          0.0045423  0.0108676   0.418
##                                           Pr(>|t|)  
## (Intercept)                                 0.2155  
## avg_rating                                  0.0579 .
## base_time_catBlitz (4–10 min)               0.6093  
## base_time_catRapid (11–25 min)              0.6661  
## base_time_catClassical                      0.7967  
## avg_rating:base_time_catBlitz (4–10 min)    0.9629  
## avg_rating:base_time_catRapid (11–25 min)   0.9632  
## avg_rating:base_time_catClassical           0.6760  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33.06 on 20005 degrees of freedom
## Multiple R-squared:  0.02926,    Adjusted R-squared:  0.02892 
## F-statistic: 86.13 on 7 and 20005 DF,  p-value: < 2.2e-16
# Interaction plot
#
# Model-predicted game length against average rating, one line per
# time-control category, with interval bands (int.width = 0.8) and without the
# raw data points.
interact_plot(
  m,
  pred = avg_rating,
  modx = base_time_cat,
  plot.points = FALSE,
  interval = TRUE,
  int.width = 0.8
) +
  theme_minimal(base_size = 14) +
  ggtitle("Interaction: Rating × Time Control → Game Length") +
  xlab("Average Player Rating") +
  ylab("Predicted Game Length (Turns)")

library(dplyr)
library(ggplot2)
library(ggimage)

# Categorize games by time control
#
# base_minutes is the part of increment_code before the "+". case_when() tries
# the conditions top to bottom and the first TRUE one wins, so each upper
# bound only needs to be stated once; values that fail to parse (NA) match
# none of the comparisons and end up as "Other".
# NOTE(review): the 3 / 10 / 60 cut-offs here differ from the 3 / 10 / 25
# cut-offs used for base_time_cat in the regression section — confirm this is
# intended.
games_time <- games %>%
  mutate(base_minutes = as.numeric(sub("\\+.*", "", increment_code))) %>%
  mutate(
    format = case_when(
      base_minutes <= 3 ~ "Bullet",
      base_minutes <= 10 ~ "Blitz",
      base_minutes <= 60 ~ "Rapid",
      base_minutes > 60 ~ "Classical",
      TRUE ~ "Other"
    )
  )

# Count games per format
format_counts <- games_time %>%
  count(format, name = "n") %>%
  as_tibble()

# Add icons: look each format up in a name -> image path table; a format with
# no entry in the table gets NA
format_counts$img <- unname(
  c(
    Classical = "./Classical.png",
    Rapid = "./Rapid.png",
    Blitz = "./Blitz.png",
    Bullet = "./Bullet.png"
  )[format_counts$format]
)

# Define icon value: each pictogram icon stands for this many games
icon_value <- 3500

# Expand the data: one row per icon.
#
# Each format gets ceiling(n / icon_value) icons, placed at the mid-height of
# successive icon_value-sized slices (icon_value / 2, 1.5 * icon_value, ...).
# This is built with vectorised rep() / sequence() instead of rowwise() + do(),
# which dplyr has superseded; it also copes with a zero count, where
# seq(icon_value, 0, by = icon_value) would raise an error.
# Formats without an icon path (img is NA) are skipped so that geom_image() is
# never handed a missing file name.
format_icons <- local({
  with_icon <- format_counts[!is.na(format_counts$img), ]
  n_icons <- ceiling(with_icon$n / icon_value)
  data.frame(
    format = rep(with_icon$format, times = n_icons),
    icon_y = sequence(n_icons) * icon_value - icon_value / 2,
    img = rep(with_icon$img, times = n_icons)
  )
})

# Combined bar + pictogram: the bars show the game count per format and the
# icons are drawn on top of them at the precomputed icon_y heights
ggplot(format_counts, aes(x = format)) +
  geom_col(aes(y = n), fill = "steelblue") +
  geom_image(
    data = format_icons,
    aes(y = icon_y, image = img),
    size = 0.25
  ) +
  theme_minimal() +
  ggtitle("Distribution of Chess Formats (Pictogram + Bars)") +
  xlab("Format") +
  ylab("Number of Games")

library(dplyr)
library(ggplot2)

# Categorize by format
#
# base_minutes is the part of increment_code before the "+". Unparseable
# values (NA) are labelled "Other" up front; the remaining conditions are
# tried in order, so each one only needs its upper bound and everything above
# 60 falls through to "Classical".
games_time <- mutate(
  games,
  base_minutes = as.numeric(sub("\\+.*", "", increment_code)),
  format = case_when(
    is.na(base_minutes) ~ "Other",
    base_minutes <= 3 ~ "Bullet",
    base_minutes <= 10 ~ "Blitz",
    base_minutes <= 60 ~ "Rapid",
    TRUE ~ "Classical"
  )
)

# One violin per time control showing the distribution of opening_ply
# (untrimmed tails), with a black bar at each group's median. The legend is
# hidden because fill is mapped to the same variable as the x axis.
ggplot(
  data = games_time,
  mapping = aes(x = format, y = opening_ply, fill = format)
) +
  geom_violin(alpha = 0.6, trim = FALSE) +
  stat_summary(geom = "crossbar", fun = median, color = "black", width = 0.3) +
  theme_minimal() +
  theme(legend.position = "none") +
  ggtitle("Opening Depth by Time Control") +
  xlab("Time Control") +
  ylab("Opening Ply (Book Moves)")