This report intends to use the ELO rating system to predict each player’s expected points based on previous tournaments performance and compare those predictions with the actual performance.
library(readr)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
raw_url <- "https://raw.githubusercontent.com/JDO-MSDS/DATA-607/refs/heads/main/Project%201/chess.txt"
chess <- readr::read_lines(raw_url)
head(chess, 15)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
## [11] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [12] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [13] "-----------------------------------------------------------------------------------------"
## [14] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [15] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
names_v <- c()
states_v <- c()
points_v <- c()
pre_ratings_v <- c()
avg_opponent_rantings_v <- c()
# Collect all players data
player_data <- list()
for (i in seq(5, length(chess), 3)) {
if (i + 1 <= length(chess)) {
# player data line
player_line <- chess[i]
state_line <- chess[i + 1]
# split line |
player_fields_split <- strsplit(player_line, "\\|")[[1]] %>% trimws()
state_fields_split <- strsplit(state_line, "\\|")[[1]] %>% trimws()
# fields extraction for player
player_num <- as.numeric(player_fields_split[1])
name <- player_fields_split[2] # player name position
state <- state_fields_split[1] # state position
total_points <- as.numeric(player_fields_split[3])
# points and ratings numeric cleaning - remove everything else
rating_match <- stringr::str_match(state_fields_split[2], "R:\\s*(\\d+)")
pre_rating <- as.numeric(rating_match[,2])
# opponents
opponents <- c()
for (round in 5:11) {
if (round <= length(player_fields_split)) {
round_result <- player_fields_split[round]
opponent_match <- stringr::str_extract(round_result, "[WLD]\\s*(\\d+)")
if (!is.na(opponent_match)) {
opponent_num <- as.numeric(stringr::str_extract(opponent_match, "\\d+"))
opponents <- c(opponents, opponent_num)
}
}
}
# Save player data
player_data[[as.character(player_num)]] <- list(
name = name,
state = state,
points = total_points,
pre_rating = pre_rating,
opponents = opponents
)
}
}
# Calculate avg opponent ratings
for (player_num in names(player_data)) {
player <- player_data[[player_num]]
# avg pre rating
opponent_ratings <- c()
for (opponent_num in player$opponents) {
if (as.character(opponent_num) %in% names(player_data)) {
opponent_ratings <- c(opponent_ratings, player_data[[as.character(opponent_num)]]$pre_rating)
}
}
avg_opponent_ranting <- if(length(opponent_ratings) > 0) {
round(mean(opponent_ratings), 0)
} else {
NA
}
# update vectors
names_v <- c(names_v, player$name)
states_v <- c(states_v, player$state)
points_v <- c(points_v, player$points)
pre_ratings_v <- c(pre_ratings_v, player$pre_rating)
avg_opponent_rantings_v <- c(avg_opponent_rantings_v, avg_opponent_ranting)
}
head(player_data, 8)
## $`1`
## $`1`$name
## [1] "GARY HUA"
##
## $`1`$state
## [1] "ON"
##
## $`1`$points
## [1] 6
##
## $`1`$pre_rating
## [1] 1794
##
## $`1`$opponents
## [1] 21 18 14 7 12 4
##
##
## $`2`
## $`2`$name
## [1] "DAKSHESH DARURI"
##
## $`2`$state
## [1] "MI"
##
## $`2`$points
## [1] 6
##
## $`2`$pre_rating
## [1] 1553
##
## $`2`$opponents
## [1] 58 4 17 16 20 7
##
##
## $`3`
## $`3`$name
## [1] "ADITYA BAJAJ"
##
## $`3`$state
## [1] "MI"
##
## $`3`$points
## [1] 6
##
## $`3`$pre_rating
## [1] 1384
##
## $`3`$opponents
## [1] 61 25 21 11 13 12
##
##
## $`4`
## $`4`$name
## [1] "PATRICK H SCHILLING"
##
## $`4`$state
## [1] "MI"
##
## $`4`$points
## [1] 5.5
##
## $`4`$pre_rating
## [1] 1716
##
## $`4`$opponents
## [1] 28 2 26 5 19 1
##
##
## $`5`
## $`5`$name
## [1] "HANSHI ZUO"
##
## $`5`$state
## [1] "MI"
##
## $`5`$points
## [1] 5.5
##
## $`5`$pre_rating
## [1] 1655
##
## $`5`$opponents
## [1] 37 12 13 4 14 17
##
##
## $`6`
## $`6`$name
## [1] "HANSEN SONG"
##
## $`6`$state
## [1] "OH"
##
## $`6`$points
## [1] 5
##
## $`6`$pre_rating
## [1] 1686
##
## $`6`$opponents
## [1] 29 11 35 10 27 21
##
##
## $`7`
## $`7`$name
## [1] "GARY DEE SWATHELL"
##
## $`7`$state
## [1] "MI"
##
## $`7`$points
## [1] 5
##
## $`7`$pre_rating
## [1] 1649
##
## $`7`$opponents
## [1] 46 13 11 1 9 2
##
##
## $`8`
## $`8`$name
## [1] "EZEKIEL HOUGHTON"
##
## $`8`$state
## [1] "MI"
##
## $`8`$points
## [1] 5
##
## $`8`$pre_rating
## [1] 1641
##
## $`8`$opponents
## [1] 32 14 9 47 28 19
expected_elo <- function(player_a, player_b) 1/(1 + 10^((player_b - player_a) / 400))
expected_row <- lapply(names(player_data), function(pnumber) {
pl <- player_data[[pnumber]]
player_a <- pl$pre_rating
values <- sapply(pl$opponents, function(opp_numb) {
key <- as.character(opp_numb)
if (key %in% names(player_data)) {
player_b <- player_data[[key]]$pre_rating
expected_elo(player_a, player_b)
} else {
NA_real_
}
}, simplify = TRUE, USE.NAMES = FALSE)
values <- as.numeric(values)
expected_total <- sum(values, na.rm = TRUE)
tibble::tibble(
Name = pl$name,
State = pl$state,
Pre_Rating = player_a,
Actual = pl$points,
Expected = round(expected_total, 2),
Difference = round(pl$points - expected_total, 2)
)
})
df_elo <- dplyr::bind_rows(expected_row)
over <- df_elo %>% arrange(desc(Difference)) %>% slice_head(n = 5)
under <- df_elo %>% arrange(Difference) %>% slice_head(n = 5)
over
## # A tibble: 5 × 6
## Name State Pre_Rating Actual Expected Difference
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ADITYA BAJAJ MI 1384 6 1.76 4.24
## 2 ANVIT RAO MI 1365 5 1.74 3.26
## 3 ZACHARY JAMES HOUGHTON MI 1220 4.5 1.25 3.25
## 4 DAKSHESH DARURI MI 1553 6 2.88 3.12
## 5 JACOB ALEXANDER LAVALLEY MI 377 3 0.04 2.96
under
## # A tibble: 5 × 6
## Name State Pre_Rating Actual Expected Difference
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 LOREN SCHWIEBERT MI 1745 3.5 5.4 -1.9
## 2 JARED GE MI 1332 3 4.88 -1.88
## 3 GEORGE AVERY JONES ON 1522 3.5 5.05 -1.55
## 4 DEREK YAN MI 1242 3 4.29 -1.29
## 5 LARRY HODGE MI 1270 2 3.28 -1.28
results_table <- dplyr::bind_rows(
dplyr::mutate(over, Category = "Overperform"),
dplyr::mutate(under, Category = "Underperform")
) %>%
dplyr::select(Category, Name, State, Pre_Rating, Actual, Expected, Difference)
print(results_table, row.names = FALSE)
## # A tibble: 10 × 7
## Category Name State Pre_Rating Actual Expected Difference
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Overperform ADITYA BAJAJ MI 1384 6 1.76 4.24
## 2 Overperform ANVIT RAO MI 1365 5 1.74 3.26
## 3 Overperform ZACHARY JAMES HOUGH… MI 1220 4.5 1.25 3.25
## 4 Overperform DAKSHESH DARURI MI 1553 6 2.88 3.12
## 5 Overperform JACOB ALEXANDER LAV… MI 377 3 0.04 2.96
## 6 Underperform LOREN SCHWIEBERT MI 1745 3.5 5.4 -1.9
## 7 Underperform JARED GE MI 1332 3 4.88 -1.88
## 8 Underperform GEORGE AVERY JONES ON 1522 3.5 5.05 -1.55
## 9 Underperform DEREK YAN MI 1242 3 4.29 -1.29
## 10 Underperform LARRY HODGE MI 1270 2 3.28 -1.28
This report used to ELO formula to predict the peformance of chess players based on their previous performance and compare with the real performnance in the tournament. We can see that ADITYA BAJAJ clearly overperfomed since that player’s expected result was 1.76 and they ended up with a score of 6, way above the expected value. At the same time, players such as LOREN SCHWIEBERT or JARED GE had a much worse performance than expected.