library(readr)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
In this project, we analyzed the results of a chess tournament using ELO ratings. We calculated each player’s expected score based on their initial ratings and the ratings of their opponents. Then, we compared the expected score with the actual points each player scored. This helps us see who performed better or worse than expected.
We read the raw tournament text file from GitHub and parsed each player line to extract:Player name, Pre-rating, Total points, Round tokens.
url <- "https://raw.githubusercontent.com/arutam-antunish/DATA607/main/tournamentinfo.txt"
lines <- read_lines(url)
lines <- lines[str_trim(lines) != ""]
player_lines <- grep("^\\s*[0-9]+\\s*\\|", lines)
We will extract: Player Name, Rating, Total Points, and Results per round. We want each player in one row.
player_lines <- grep("^\\s*[0-9]+\\s*\\|", lines)
player_lines[1:10]
## [1] 5 8 11 14 17 20 23 26 29 32
idx <- player_lines[1]
line <- lines[player_lines[1]]
parts <- str_split(line, "\\|")[[1]]
parts <- str_trim(parts)
num <- as.numeric(parts[1])
name <- parts[2]
total <- as.numeric(parts[3])
r1_text <- if (length(parts) >= 4) parts[4] else NA
round_texts <- sapply(4:10, function(j) if (length(parts) >= j) parts[j] else NA)
round_results <- sapply(round_texts, function(x) if (!is.na(x) && nchar(x)>0) str_sub(x,1,1) else NA)
round_opps <- sapply(round_texts, function(x) as.numeric(str_extract(x, "\\d+")))
round_results
## W 39 W 21 W 18 W 14 W 7 D 12 D 4
## "W" "W" "W" "W" "W" "D" "D"
round_opps
## W 39 W 21 W 18 W 14 W 7 D 12 D 4
## 39 21 18 14 7 12 4
row <- data.frame(num = num,
name = as.character(name), total = total, R1_result = round_results[1], R1_opp = round_opps[1], R2_result = round_results[2], R2_opp = round_opps[2], R3_result = round_results[3], R3_opp = round_opps[3], R4_result = round_results[4], R4_opp = round_opps[4], R5_result = round_results[5], R5_opp = round_opps[5], R6_result = round_results[6], R6_opp = round_opps[6], R7_result = round_results[7], R7_opp = round_opps[7], stringsAsFactors = FALSE)
View(row)
rows <- list()
for (k in seq_along(player_lines)) {
line <- lines[player_lines[k]]
parts <- str_trim(str_split(line, "\\|")[[1]])
num <- as.numeric(parts[1])
name <- parts[2]
total <- as.numeric(parts[3])
round_texts <- sapply(4:10, function(j) if (length(parts) >= j) parts[j] else NA)
round_results <- sapply(round_texts, function(x) if (!is.na(x) && nchar(x)>0) str_sub(x,1,1) else NA)
round_opps <- sapply(round_texts, function(x) as.numeric(str_extract(x, "\\d+")))
rows[[k]] <- data.frame(
num = num, name = as.character(name), total = total,
R1_result = round_results[1], R1_opp = round_opps[1],
R2_result = round_results[2], R2_opp = round_opps[2],
R3_result = round_results[3], R3_opp = round_opps[3],
R4_result = round_results[4], R4_opp = round_opps[4],
R5_result = round_results[5], R5_opp = round_opps[5],
R6_result = round_results[6], R6_opp = round_opps[6],
R7_result = round_results[7], R7_opp = round_opps[7],
stringsAsFactors = FALSE)}
df_players <- bind_rows(rows)
nrow(df_players)
## [1] 64
View(df_players)
For each parsed player row we read the next line in the file. to extract the pre-tournament rating. Then create a lookup so we can find any opponent’s rating by their player number.
n <- nrow(df_players)
rating_pre <- rep(NA_real_, n)
for (k in 1:n) {idx_next <- player_lines[k] + 1
if (idx_next <= length(lines)) {text_next <- lines[idx_next]
val <- str_extract(text_next, "(?<=R:\\s)\\d+")
if (!is.na(val)) rating_pre[k] <- as.numeric(val)}}
df_players$rating_pre <- rating_pre
df_players[, c("num","name","rating_pre")][1:12, ]
## num name rating_pre
## W 39 1 GARY HUA 1794
## W 63 2 DAKSHESH DARURI 1553
## L 8 3 ADITYA BAJAJ 1384
## W 23 4 PATRICK H SCHILLING 1716
## W 45 5 HANSHI ZUO 1655
## W 34 6 HANSEN SONG 1686
## W 57 7 GARY DEE SWATHELL 1649
## W 3 8 EZEKIEL HOUGHTON 1641
## W 25 9 STEFANO LEE 1411
## D 16 10 ANVIT RAO 1365
## D 38 11 CAMERON WILLIAM MC LEMAN 1712
## W 42 12 KENNETH J TACK 1663
We used the ELO expected-score formula E = 1 / (1 + 10^((Rb - Ra)/400)). For every player and each round: (a) compute expected points vs opponent rating if available, (b) compute actual points from W/D/L/H. Sum across rounds to get expected_total and actual_total_calc.
expected_one <- function(ra, rb) {1 / (1 + 10 ^ ((rb - ra) / 400))}
df_players$expected_total <- 0
df_players$actual_total_calc <- 0
rating_by_num <- setNames(df_players$rating_pre, as.character(df_players$num))
for (i in 1:nrow(df_players)) {
my_rating <- df_players$rating_pre[i]
exp_sum <- 0
act_sum <- 0
for (r in 1:7) {
opp_col <- paste0("R", r, "_opp")
res_col <- paste0("R", r, "_result")
opp_num <- df_players[i, opp_col]
res <- df_players[i, res_col]
if (is.na(res)) {actual_point <- NA}
else if (res == "W") {actual_point <- 1}
else if (res == "D") {actual_point <- 0.5}
else if (res == "L") {actual_point <- 0}
else if (res == "H") {actual_point <- 0.5}
else {actual_point <- NA}
exp_point <- NA
if (!is.na(opp_num) && as.character(opp_num) %in% names(rating_by_num) && !is.na(my_rating)) {
opp_rating <- as.numeric(rating_by_num[as.character(opp_num)])
if (!is.na(opp_rating)) exp_point <- expected_one(my_rating, opp_rating)}
else {if (!is.na(res) && res == "H") exp_point <- 0.5}
if (!is.na(exp_point)) exp_sum <- exp_sum + exp_point
if (!is.na(actual_point)) act_sum <- act_sum + actual_point}
df_players$expected_total[i] <- exp_sum
df_players$actual_total_calc[i] <- act_sum}
View(df_players)
Check that the computed actual totals match the total column from the file (small differences expected). Then compute diff_expected = actual_total(file) - expected_total and list the top 5 overperformers and top 5 underperformers.
inconsistent <- df_players[!is.na(df_players$total) & !is.na(df_players$actual_total_calc) & abs(df_players$total - df_players$actual_total_calc) > 0.01, ]
nrow(inconsistent)
## [1] 8
View(inconsistent)
if (nrow(inconsistent) > 0) print(inconsistent[, c("num","name","total","actual_total_calc","expected_total")])
## num name total actual_total_calc expected_total
## B...37 37 AMIYATOSH PWNANANDAM 3.5 2.5 0.5000000
## W 59 41 KYLE WILLIAM MURPHY 3.0 2.0 0.4627626
## B...44 44 JUSTIN D SCHILLING 3.0 2.0 1.1918215
## L 14 54 LARRY HODGE 2.0 1.0 1.6222942
## L 62 55 ALEX KONG 2.0 1.0 1.4397840
## L 7 57 MICHAEL LU 2.0 1.0 1.3048595
## W 31 58 VIRAJ MOHILE 2.0 1.0 0.0000000
## L 41 59 SEAN M MC CORMICK 2.0 1.0 0.0000000
# Difference and top lists
df_players$diff_expected <- df_players$total - df_players$expected_total
# order descending for over performers
ord_desc <- order(-df_players$diff_expected)
top5_over <- df_players[ord_desc[1:5], c("num","name","total","expected_total","diff_expected")]
# order ascending for under performers
ord_asc <- order(df_players$diff_expected)
top5_under <- df_players[ord_asc[1:5], c("num","name","total","expected_total","diff_expected")]
View(top5_over)
View(top5_under)
# Results table
results_table <- df_players[order(-df_players$diff_expected), c("num","name","rating_pre","total","expected_total","diff_expected")]
View(results_table)
The ELO-based expected score helps identify players who significantly outperformed or underperformed relative to their ratings. This shows which players improved unexpectedly or struggled in the tournament.
ELO Formula Source: We used the standard ELO expected score formula which can be found in the next link:
By comparing actual scores to expected scores, we identified the players who overperformed and underperformed in the tournament. This analysis shows which players exceeded expectations and which players struggled. Using the ELO formula gives a fair way to measure performance relative to skill levels.