Assignment 5 B Codebase

Author

Sam Barbaro

Assignment 5B Approach

I will pick up where I left off in project one and incorporate the results of of the matches into my final data frame.

I will calculate the player’s total expected score (probability of winning, as calculated below, across all games) and the difference from their actual score (calculated as 1 pt for winning, .5 for a draw, 0 for a loss). Then, I’ll list the five players who most overperformed relative to their expected score (highest difference between their expected and actual), and the five players that most underperformed relative to their expected score (lowest difference between expected and actual).

Formulas are as follows:

new rating after a game = rating + 32(score - expected score)

players score 1 pt for winning, .5 for a draw, 0 for a loss (actual score)

expected score = probability they will win

Probability of player a winning = 1/1+10 ^((Rating of player a - Rating of player b)/400) or 1/1+10 ^((Rating of player a - Rating of player b)/400)

library(tidyverse)

Warning: package 'tidyverse' was built under R version 4.5.2

Warning: package 'ggplot2' was built under R version 4.5.2

Warning: package 'tibble' was built under R version 4.5.2

Warning: package 'readr' was built under R version 4.5.2

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.1     ✔ tibble    3.3.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#Here's the original txt file. I kept/changed some of the cleaning from project 1, since the resulting data frame will be somewhat different. 

new_data <- read.table("https://raw.githubusercontent.com/samanthabarbaro/data607/refs/heads/main/chess.txt", sep = "|", row.names = NULL, fill = TRUE)

filtered_new_data <- subset(new_data, V2 != "")

filtered_new_data <- filtered_new_data |>
    mutate(new_col = rep(c(1, 2), length.out = n())) 

#pivot wider to get players on the same line as their stats

final_chess <- filtered_new_data |>
  mutate(person_id = rep(1:(n()/2), each = 2)) |>
  pivot_wider(
    id_cols = person_id,
    names_from = c(new_col),
    values_from = c(V1, V2, V3, V4, V5, V6, V7, V8, V9, V10)
  )

Cleaning the data

#delete the first row here, because that is going to be a mess after a pivot

#name the columns by number so it's easy to convert them to numeric later if needed, and there's no messy first_opp, second_opp, etc. 
more_final_chess <- final_chess |> select(player_id = V1_1, hometown = V1_2, player_name = V2_1, ranks = V2_2, total_pts = V3_1, "1" = V4_1, "2" = V5_1, "3" = V6_1, "4" = V7_1, "5" = V8_1, "6" = V9_1, "7" = V10_1) |>
  slice(-1)

#pivot longer with the result and opponent number still together

chess_long_scores <- more_final_chess |>
    pivot_longer(
        cols = !player_id:total_pts,
        names_to = "round",
        values_to = "result_and_opp"
    )

#cleaning and breaking up the columns

separated_chess_w_scores <- chess_long_scores |>
    separate(col = ranks, into = c("USCF_ID", "scores"), sep = "/") |> 
  separate(col = scores, into = c("pre_rank", "post_rank"), sep = "->") |>
  separate(col = result_and_opp, into = c ("result", "opp_num"), sep = "  ")

Warning: Expected 2 pieces. Additional pieces discarded in 40 rows [82, 108, 112, 152,
189, 203, 250, 253, 257, 264, 285, 286, 287, 302, 332, 334, 342, 343, 348, 365,
...].

#getting rid of the p + numbers in the ranks and other unecessary lettre

cleaned_chess_scores <- separated_chess_w_scores|>
    mutate(pre_rank = str_sub(pre_rank, start = 4), across(everything(), ~  str_trim(.x) |> na_if(""))) |> 
    mutate(pre_rank = str_remove(pre_rank, "P.*")) |> 
    mutate(post_rank = str_remove(post_rank, "P.*")) |>
    mutate(player_name = str_to_title(player_name))

Joining and pivoting

#reference table

ref_tab <- cleaned_chess_scores |> 
  select(player_id, opp_rank = pre_rank) |> 
  distinct(player_id, .keep_all = TRUE)

#join 
#I did not drop NAs here because I wanted to include scores for byes

chess_scores_joined <- cleaned_chess_scores |>
    left_join(ref_tab, by = c("opp_num" = "player_id"))


#format as tibble
chess_scores_joined <- as_tibble(chess_scores_joined)

#make scores numeric

chess_scores_joined <- chess_scores_joined |> 
  mutate(across(c(pre_rank, post_rank, total_pts, opp_rank), as.numeric))

The math

#calculating the expected score
# 1/1+10 ^((Rating of player a - Rating of player b)/400)
chess_expected_score <- chess_scores_joined |>
  mutate(exp_score = round(1 / (1 + 10^((opp_rank - pre_rank) / 400)), 4))


#case when to determine actual score
#there are two new variables here -- actual_pts includes a score for byes and score_difference does not
#I wanted to verify the total score number already in the sheet, which includes byes, and create a column that only includes scores that go toward the player's rank.
chess_actuals <- chess_expected_score |>
  mutate(actual_pts = case_when(
  result == "W" ~ 1,
  result == "B" ~ 1,
  result == "D" ~ .5,
  result == "H" ~ .5,
  result == "L" ~ 0,
)) |> mutate(score_difference = actual_pts - exp_score) |>
   mutate(total_wo_bye = case_when(
  result == "W" ~ 1,
  result == "D" ~ .5,
  result == "L" ~ 0,
))

#This wasn't required, but I wanted to calculate points gained
#new rating after a game = rating + 32(score - expected score) 

new_rating <- chess_actuals |> 
  mutate(new_pts = (32 * score_difference))


#sum everything

sum_chess <- new_rating |> 
  group_by(across(player_id:total_pts)) |> 
  summarise(
    across(c(exp_score, actual_pts, score_difference, new_pts, total_wo_bye), 
           \(x) sum(x, na.rm = TRUE)),
    .groups = "drop"
  ) |>
  mutate(new_rank_calculated = (round(new_pts + pre_rank)))

#at this point I realized that some people had an H and received .5 points for an H (half-point bye), and people with a B received 1 pt, but that does not factor into their ELO ranking. I went back and left NAs in so point calculations reflected the total already in the table. I added the total_wo_by column to check (this is the number that factored into the actual calculations -- byes don't result in points added to a rating). 
#I also calculated the new rankings and, after extensive checking, concluded that this rating system must be slightly different from the standard. They might have used a different K factor instead of 32. 


#arrange by player number:
sum_chess_2 <- sum_chess |> mutate(player_id = as.numeric(player_id)) |> arrange(player_id) |> select(player_id:exp_score, total_wo_bye, score_difference)


#five players that most underperformed relative to their expected score (lowest difference between expected and actual).
bottom_five <- sum_chess_2 |> arrange(score_difference) |> slice(1:5)

glimpse(bottom_five)

Rows: 5
Columns: 10
$ player_id        <dbl> 25, 30, 54, 42, 31
$ hometown         <chr> "MI", "ON", "MI", "MI", "MI"
$ player_name      <chr> "Loren Schwiebert", "George Avery Jones", "Larry Hodg…
$ USCF_ID          <chr> "12486656", "12577178", "12836773", "14462326", "1513…
$ pre_rank         <dbl> 1745, 1522, 1270, 1332, 1494
$ post_rank        <dbl> 1681, 1444, 1200, 1256, 1444
$ total_pts        <dbl> 3.5, 3.5, 2.0, 3.0, 3.5
$ exp_score        <dbl> 6.2756, 6.0182, 3.3991, 5.0103, 5.0925
$ total_wo_bye     <dbl> 3.5, 3.5, 1.0, 3.0, 3.5
$ score_difference <dbl> -2.7756, -2.5182, -2.3991, -2.0103, -1.5925

print(bottom_five)

# A tibble: 5 × 10
  player_id hometown player_name  USCF_ID pre_rank post_rank total_pts exp_score
      <dbl> <chr>    <chr>        <chr>      <dbl>     <dbl>     <dbl>     <dbl>
1        25 MI       Loren Schwi… 124866…     1745      1681       3.5      6.28
2        30 ON       George Aver… 125771…     1522      1444       3.5      6.02
3        54 MI       Larry Hodge  128367…     1270      1200       2        3.40
4        42 MI       Jared Ge     144623…     1332      1256       3        5.01
5        31 MI       Rishi Shetty 151316…     1494      1444       3.5      5.09
# ℹ 2 more variables: total_wo_bye <dbl>, score_difference <dbl>

#five players who most overperformed relative to their expected score (highest difference between their expected and actual)
top_five <- sum_chess_2 |> arrange(desc(score_difference)) |> slice(1:5)

glimpse(top_five)

Rows: 5
Columns: 10
$ player_id        <dbl> 3, 15, 10, 46, 9
$ hometown         <chr> "MI", "MI", "MI", "MI", "ON"
$ player_name      <chr> "Aditya Bajaj", "Zachary James Houghton", "Anvit Rao"…
$ USCF_ID          <chr> "14959604", "15619130", "14150362", "15490981", "1495…
$ pre_rank         <dbl> 1384, 1220, 1365, 377, 1411
$ post_rank        <dbl> 1640, 1416, 1544, 1076, 1564
$ total_pts        <dbl> 6.0, 4.5, 5.0, 3.0, 5.0
$ exp_score        <dbl> 1.9450, 1.3732, 1.9448, 0.0433, 2.2867
$ total_wo_bye     <dbl> 6.0, 4.5, 5.0, 3.0, 5.0
$ score_difference <dbl> 4.0550, 3.1268, 3.0552, 2.9567, 2.7133

print(top_five)

# A tibble: 5 × 10
  player_id hometown player_name  USCF_ID pre_rank post_rank total_pts exp_score
      <dbl> <chr>    <chr>        <chr>      <dbl>     <dbl>     <dbl>     <dbl>
1         3 MI       Aditya Bajaj 149596…     1384      1640       6      1.94  
2        15 MI       Zachary Jam… 156191…     1220      1416       4.5    1.37  
3        10 MI       Anvit Rao    141503…     1365      1544       5      1.94  
4        46 MI       Jacob Alexa… 154909…      377      1076       3      0.0433
5         9 ON       Stefano Lee  149545…     1411      1564       5      2.29  
# ℹ 2 more variables: total_wo_bye <dbl>, score_difference <dbl>

Google Gemini. (2026). Gemini 3 Flash [Large language model].https://gemini.google.com. Accessed Feb 13, 2026.