assignment_5b

Approach

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.1     ✔ tibble    3.3.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

url <- "https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/assignment_5/chess_player.csv"
df <- tibble(read_csv(url))

Rows: 64 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): name, state
dbl (5): pair, pre_rating, post_rating, total_pts, op_avg_rating

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

df

# A tibble: 64 × 7
    pair name               state pre_rating post_rating total_pts op_avg_rating
   <dbl> <chr>              <chr>      <dbl>       <dbl>     <dbl>         <dbl>
 1     1 GARY HUA           ON          1794        1817       6           1605.
 2     2 DAKSHESH DARURI    MI          1553        1663       6           1469.
 3     3 ADITYA BAJAJ       MI          1384        1640       6           1564.
 4     4 PATRICK H SCHILLI… MI          1716        1744       5.5         1574.
 5     5 HANSHI ZUO         MI          1655        1690       5.5         1501.
 6     6 HANSEN SONG        OH          1686        1687       5           1519.
 7     7 GARY DEE SWATHELL  MI          1649        1673       5           1372.
 8     8 EZEKIEL HOUGHTON   MI          1641        1657       5           1468.
 9     9 STEFANO LEE        ON          1411        1564       5           1523.
10    10 ANVIT RAO          MI          1365        1544       5           1554.
# ℹ 54 more rows

I realize that I won’t be able to calculate the ELO correctly, unless I parsed the dataset to also include W/L/D/H/U… It wasn’t clear whether or not we can approximate the ELO based off the avg rating of each opponent or not.

I think I will just do it correctly, where we calculate ELO per match because it would be good practice, which means I have to redo the Project 1 grep equations. Then for each W/L/D/H/U we will have to assign a map to figure out how that will effect the scoring.

We will also use the ELO rating from the video by singingbanana: A = 1 / (1 + 10^((rating_B - rating_A) / 400))

However, I wouldn’t have to completely redo my project 1 code, as the most useful part is the mapping of opponent ID to their pre_rating scores per player row. Which I can make into a pivot_longer(), where each player will have 7 rows each (unless the opponent is missing / no match) calculate the expected ELO from the rating system, them sum everything up.

Disclaimer: I kind of accidentally did the codebase prior to the approach. I was playing around with grabbing the code from Project 1 and then just started to get to work.

Equation: A = 1 / (1 + 10^((rating_B - rating_A) / 400)) Difference would be expected vs result.

Codebase

library(tidyverse)

#start
url <- "https://raw.githubusercontent.com/Siganz/CUNY_Assignments/refs/heads/main/607/assignment_5/tournamentinfo.txt"
df_raw <- read_lines(url)
df_raw_t <- tibble(df_raw)
var_1 <- c("pair", "name", "total_pts", as.vector(rbind(paste0("dec_", 1:7), paste0("opp_", 1:7))))
var_2 <- c("state", "match", "pre_rating", "post_rating")
var_3 <- c("pair", "name", "state", "pre_rating", "post_rating", "total_pts", as.vector(rbind(paste0("dec_", 1:7), paste0("opp_", 1:7))))
var_4 <- c("name", "state", "pre_rating", "post_rating", "total_pts")
var_5 <- c(paste0("d_", 1:7))

#Working
df <- tidyr::tibble(raw = df_raw) |>
  filter(
    !str_detect(raw, "^-"),
    !str_detect(raw, "\\s*Pair"),
    !str_detect(raw, "\\s*Num")
  ) |>
  mutate(index = row_number()) |>
  mutate(
    raw = str_trim(raw),
    raw = str_replace(raw, "\\|$", ""),
    raw = str_replace_all(raw, "\\|[BXN]\\s*", "|"),
    raw = str_replace_all(raw, "\\s*\\|\\s*", "|"),
    raw = str_replace_all(raw, "\\|:\\d\\s*", "|"),
    raw = str_replace_all(raw, "P\\d+", ""),
    raw = str_replace(raw, "->", "|"),
    raw = str_replace(raw, "R:", ""),
    raw = str_replace(raw, "/", "|"),
    raw = str_replace(raw, "\\|*\\|$", ""),
    raw = str_replace_all(raw, "\\|([WDL])\\s+(\\d+)", "|\\1|\\2"),
    raw = str_replace_all(raw, "\\|([HU])\\s*(?=\\|)", "|\\1|0"),
    raw = str_trim(raw)
    )

df1 <- df |> filter(index %% 2 != 0) |>
  separate(raw, sep = "\\|", into = var_1, extra = "drop", fill = "right") |>
  mutate(across(everything(), str_trim)) |>
  mutate(across(everything(), ~ na_if(.x, ""))) |>
  mutate(across(everything(), ~ na_if(.x, "0")))

df2 <- df |> filter(index %% 2 == 0) |>
  mutate(index = index - 1) |>
  separate(raw, sep = "\\|", into = var_2, extra = "drop", fill = "right") |>
  mutate(across(everything(), str_trim)) |>
  mutate(across(everything(), ~ na_if(.x, "")))

player_df <- df1 |> left_join(df2, by ='index')|> 
  select(all_of(var_3)) |>
  readr::type_convert()


── Column specification ────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  pair = col_double(),
  pre_rating = col_double(),
  post_rating = col_double(),
  total_pts = col_double(),
  opp_7 = col_double()
)
ℹ Use `spec()` for the full column specifications.

rating_map <- setNames(player_df$pre_rating, player_df$pair)

player_df <- player_df |>
  mutate(across(starts_with("opp"), ~ unname(rating_map[.x])))

score_map <- c(W = 1, D = 0.5, L = 0, H = 0.5, U = 0)

df3 <- player_df |>
  pivot_longer(cols = matches("^(dec|opp)_"), names_to = c(".value", "round"), names_pattern = "(dec|opp)_(\\d+)") |>
  filter(!is.na(dec), !is.na(opp))

results <- df3 |>
  mutate(actual = score_map[dec], expected = 1 / (1 + 10 ^ ((opp - pre_rating) / 400))) |>
  group_by(name) |>
  summarise(actual_score = sum(actual), expected_score = sum(expected), diff = actual_score - expected_score, .groups = "drop")

top_overperformers <- results |> arrange(desc(diff)) |> slice_head(n = 5)
top_underperformers <- results |> arrange(diff) |> slice_head(n = 5)

top_overperformers

# A tibble: 5 × 4
  name                     actual_score expected_score  diff
  <chr>                           <dbl>          <dbl> <dbl>
1 ADITYA BAJAJ                      6           1.95    4.05
2 ZACHARY JAMES HOUGHTON            4.5         1.37    3.13
3 ANVIT RAO                         5           1.94    3.06
4 JACOB ALEXANDER LAVALLEY          3           0.0432  2.96
5 STEFANO LEE                       5           2.29    2.71

top_underperformers

# A tibble: 5 × 4
  name               actual_score expected_score  diff
  <chr>                     <dbl>          <dbl> <dbl>
1 LOREN SCHWIEBERT            3.5           6.28 -2.78
2 GEORGE AVERY JONES          3.5           6.02 -2.52
3 JARED GE                    3             5.01 -2.01
4 RISHI SHETTY                3.5           5.09 -1.59
5 JOSHUA DAVID LEE            3.5           4.96 -1.46

cat("Top Overperformers:\n\n")

Top Overperformers:

top_overperformers |>
  mutate(report = glue::glue("{name} | {expected_score} | {actual_score} | {diff}")) |>
  pull(report) |>
  walk(~ cat(.x, "\n"))

ADITYA BAJAJ | 1.94508790746935 | 6 | 4.05491209253065 
ZACHARY JAMES HOUGHTON | 1.37330886994562 | 4.5 | 3.12669113005438 
ANVIT RAO | 1.94485405313239 | 5 | 3.05514594686761 
JACOB ALEXANDER LAVALLEY | 0.0432498102299642 | 3 | 2.95675018977004 
STEFANO LEE | 2.28654888164957 | 5 | 2.71345111835043

cat("Top Underperformers:\n\n")

Top Underperformers:

top_underperformers |>
  mutate(report = glue::glue("{name} | {expected_score} | {actual_score} | {diff}")) |>
  pull(report) |>
  walk(~ cat(.x, "\n"))

LOREN SCHWIEBERT | 6.27565044706423 | 3.5 | -2.77565044706423 
GEORGE AVERY JONES | 6.01822002311794 | 3.5 | -2.51822002311794 
JARED GE | 5.01041607668383 | 3 | -2.01041607668383 
RISHI SHETTY | 5.09246477319498 | 3.5 | -1.59246477319498 
JOSHUA DAVID LEE | 4.95789036389805 | 3.5 | -1.45789036389805