Data 607 Assignment 5B

Overview

This report intends to use the ELO rating system to predict each player’s expected points based on previous tournaments performance and compare those predictions with the actual performance.

Load libraries

library(readr)
library(stringr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(ggplot2)

Load the data from my Github

raw_url <- "https://raw.githubusercontent.com/JDO-MSDS/DATA-607/refs/heads/main/Project%201/chess.txt"

chess <- readr::read_lines(raw_url)
head(chess, 15)

##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------" 
## [11] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|" 
## [12] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
## [13] "-----------------------------------------------------------------------------------------" 
## [14] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|" 
## [15] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"

Extract the information

names_v <- c()
states_v <- c()
points_v <- c()
pre_ratings_v <- c()
avg_opponent_rantings_v <- c()

# Collect all players data
player_data <- list()

for (i in seq(5, length(chess), 3)) {
  if (i + 1 <= length(chess)) {
    # player data line
    player_line <- chess[i]
    state_line <- chess[i + 1]
  
    # split line |
    player_fields_split <- strsplit(player_line, "\\|")[[1]] %>% trimws()
    state_fields_split <- strsplit(state_line, "\\|")[[1]] %>% trimws()

  
    # fields extraction for player
    player_num <- as.numeric(player_fields_split[1])
    name <- player_fields_split[2]  # player name position
    state <- state_fields_split[1] # state position
    total_points <- as.numeric(player_fields_split[3])
 
  
    # points and ratings numeric cleaning - remove everything else
    rating_match <- stringr::str_match(state_fields_split[2], "R:\\s*(\\d+)")
    pre_rating <- as.numeric(rating_match[,2])
  
    # opponents
    opponents <- c()
    for (round in 5:11) {
      if (round <= length(player_fields_split)) {
        round_result <- player_fields_split[round]
        opponent_match <- stringr::str_extract(round_result, "[WLD]\\s*(\\d+)")
        if (!is.na(opponent_match)) {
          opponent_num <- as.numeric(stringr::str_extract(opponent_match, "\\d+"))
          opponents <- c(opponents, opponent_num)
        }
      }
    }
    
    # Save player data 
    player_data[[as.character(player_num)]] <- list(
      name = name,
      state = state,
      points = total_points,
      pre_rating = pre_rating,
      opponents = opponents
    )
  }
}

# Calculate avg opponent ratings
for (player_num in names(player_data)) {
  player <- player_data[[player_num]]
  
  # avg pre rating
  opponent_ratings <- c()
  for (opponent_num in player$opponents) {
    if (as.character(opponent_num) %in% names(player_data)) {
      opponent_ratings <- c(opponent_ratings, player_data[[as.character(opponent_num)]]$pre_rating)
    }
  }
  
  avg_opponent_ranting <- if(length(opponent_ratings) > 0) {
    round(mean(opponent_ratings), 0)
  } else {
    NA
  }
  
  # update vectors
  names_v <- c(names_v, player$name)
  states_v <- c(states_v, player$state)
  points_v <- c(points_v, player$points)
  pre_ratings_v <- c(pre_ratings_v, player$pre_rating)
  avg_opponent_rantings_v <- c(avg_opponent_rantings_v, avg_opponent_ranting)
}

head(player_data, 8)

## $`1`
## $`1`$name
## [1] "GARY HUA"
## 
## $`1`$state
## [1] "ON"
## 
## $`1`$points
## [1] 6
## 
## $`1`$pre_rating
## [1] 1794
## 
## $`1`$opponents
## [1] 21 18 14  7 12  4
## 
## 
## $`2`
## $`2`$name
## [1] "DAKSHESH DARURI"
## 
## $`2`$state
## [1] "MI"
## 
## $`2`$points
## [1] 6
## 
## $`2`$pre_rating
## [1] 1553
## 
## $`2`$opponents
## [1] 58  4 17 16 20  7
## 
## 
## $`3`
## $`3`$name
## [1] "ADITYA BAJAJ"
## 
## $`3`$state
## [1] "MI"
## 
## $`3`$points
## [1] 6
## 
## $`3`$pre_rating
## [1] 1384
## 
## $`3`$opponents
## [1] 61 25 21 11 13 12
## 
## 
## $`4`
## $`4`$name
## [1] "PATRICK H SCHILLING"
## 
## $`4`$state
## [1] "MI"
## 
## $`4`$points
## [1] 5.5
## 
## $`4`$pre_rating
## [1] 1716
## 
## $`4`$opponents
## [1] 28  2 26  5 19  1
## 
## 
## $`5`
## $`5`$name
## [1] "HANSHI ZUO"
## 
## $`5`$state
## [1] "MI"
## 
## $`5`$points
## [1] 5.5
## 
## $`5`$pre_rating
## [1] 1655
## 
## $`5`$opponents
## [1] 37 12 13  4 14 17
## 
## 
## $`6`
## $`6`$name
## [1] "HANSEN SONG"
## 
## $`6`$state
## [1] "OH"
## 
## $`6`$points
## [1] 5
## 
## $`6`$pre_rating
## [1] 1686
## 
## $`6`$opponents
## [1] 29 11 35 10 27 21
## 
## 
## $`7`
## $`7`$name
## [1] "GARY DEE SWATHELL"
## 
## $`7`$state
## [1] "MI"
## 
## $`7`$points
## [1] 5
## 
## $`7`$pre_rating
## [1] 1649
## 
## $`7`$opponents
## [1] 46 13 11  1  9  2
## 
## 
## $`8`
## $`8`$name
## [1] "EZEKIEL HOUGHTON"
## 
## $`8`$state
## [1] "MI"
## 
## $`8`$points
## [1] 5
## 
## $`8`$pre_rating
## [1] 1641
## 
## $`8`$opponents
## [1] 32 14  9 47 28 19

ELO calculations

expected_elo <- function(player_a, player_b) 1/(1 + 10^((player_b - player_a) / 400))

expected_row <- lapply(names(player_data), function(pnumber) {
  pl <- player_data[[pnumber]]
  player_a <- pl$pre_rating
  
  values <- sapply(pl$opponents, function(opp_numb) {
    key <- as.character(opp_numb)
    if (key %in% names(player_data)) {
      player_b <- player_data[[key]]$pre_rating
      expected_elo(player_a, player_b)
    } else {
      NA_real_
    }
    }, simplify = TRUE, USE.NAMES = FALSE)
  
  values <- as.numeric(values)
  expected_total <- sum(values, na.rm = TRUE)
  
  tibble::tibble(
    Name = pl$name,
    State = pl$state,
    Pre_Rating = player_a,
    Actual = pl$points,
    Expected = round(expected_total, 2),
    Difference = round(pl$points - expected_total, 2)
  )
})

df_elo <- dplyr::bind_rows(expected_row)

over <- df_elo %>% arrange(desc(Difference)) %>% slice_head(n = 5)
under <- df_elo %>% arrange(Difference) %>% slice_head(n = 5)

over

## # A tibble: 5 × 6
##   Name                     State Pre_Rating Actual Expected Difference
##   <chr>                    <chr>      <dbl>  <dbl>    <dbl>      <dbl>
## 1 ADITYA BAJAJ             MI          1384    6       1.76       4.24
## 2 ANVIT RAO                MI          1365    5       1.74       3.26
## 3 ZACHARY JAMES HOUGHTON   MI          1220    4.5     1.25       3.25
## 4 DAKSHESH DARURI          MI          1553    6       2.88       3.12
## 5 JACOB ALEXANDER LAVALLEY MI           377    3       0.04       2.96

under

## # A tibble: 5 × 6
##   Name               State Pre_Rating Actual Expected Difference
##   <chr>              <chr>      <dbl>  <dbl>    <dbl>      <dbl>
## 1 LOREN SCHWIEBERT   MI          1745    3.5     5.4       -1.9 
## 2 JARED GE           MI          1332    3       4.88      -1.88
## 3 GEORGE AVERY JONES ON          1522    3.5     5.05      -1.55
## 4 DEREK YAN          MI          1242    3       4.29      -1.29
## 5 LARRY HODGE        MI          1270    2       3.28      -1.28

Summary top 5 over and underperformers

results_table <- dplyr::bind_rows(
  dplyr::mutate(over, Category = "Overperform"),
  dplyr::mutate(under, Category = "Underperform")
) %>%
  dplyr::select(Category, Name, State, Pre_Rating, Actual, Expected, Difference)

print(results_table, row.names = FALSE)

## # A tibble: 10 × 7
##    Category     Name                 State Pre_Rating Actual Expected Difference
##    <chr>        <chr>                <chr>      <dbl>  <dbl>    <dbl>      <dbl>
##  1 Overperform  ADITYA BAJAJ         MI          1384    6       1.76       4.24
##  2 Overperform  ANVIT RAO            MI          1365    5       1.74       3.26
##  3 Overperform  ZACHARY JAMES HOUGH… MI          1220    4.5     1.25       3.25
##  4 Overperform  DAKSHESH DARURI      MI          1553    6       2.88       3.12
##  5 Overperform  JACOB ALEXANDER LAV… MI           377    3       0.04       2.96
##  6 Underperform LOREN SCHWIEBERT     MI          1745    3.5     5.4       -1.9 
##  7 Underperform JARED GE             MI          1332    3       4.88      -1.88
##  8 Underperform GEORGE AVERY JONES   ON          1522    3.5     5.05      -1.55
##  9 Underperform DEREK YAN            MI          1242    3       4.29      -1.29
## 10 Underperform LARRY HODGE          MI          1270    2       3.28      -1.28

Conclusion

This report used to ELO formula to predict the peformance of chess players based on their previous performance and compare with the real performnance in the tournament. We can see that ADITYA BAJAJ clearly overperfomed since that player’s expected result was 1.76 and they ended up with a score of 6, way above the expected value. At the same time, players such as LOREN SCHWIEBERT or JARED GE had a much worse performance than expected.