install.packages("stringr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
url <- "https://raw.githubusercontent.com/bb2955/Data-607/main/tournamentinfo.txt"
lines <- readLines(url)
## Warning in readLines(url): incomplete final line found on
## 'https://raw.githubusercontent.com/bb2955/Data-607/main/tournamentinfo.txt'
head(lines)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
player_lines <- lines[grep("^\\s*[0-9]+\\s+\\|", lines)]
rating_lines <- lines[grep("R:", lines)]
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
players <- data.frame(
PairNum = as.numeric(str_trim(sapply(str_split(player_lines, "\\|"), `[`, 1))),
Name = str_trim(sapply(str_split(player_lines, "\\|"), `[`, 2)),
TotalPts = as.numeric(str_extract(player_lines, "\\d+\\.\\d")),
stringsAsFactors = FALSE
)
players$State <- str_trim(substr(rating_lines, 1, 2))
players$PreRating <- as.numeric(
gsub(".*R:\\s*([0-9]+).*", "\\1", rating_lines)
)
get_opponents <- function(line) {
matches <- str_extract_all(line, "[WLD]\\s+[0-9]+")[[1]]
as.numeric(str_remove(matches, "[WLD]\\s+"))
}
opponent_list <- lapply(player_lines, get_opponents)
avg_opp_rating <- sapply(1:length(opponent_list), function(i) {
opp_nums <- opponent_list[[i]]
opp_ratings <- players$PreRating[match(opp_nums, players$PairNum)]
mean(opp_ratings, na.rm = TRUE)
})
players$AvgOppPreRating <- round(avg_opp_rating)
final_df <- players %>%
select(Name, State, TotalPts, PreRating, AvgOppPreRating)
head(final_df)
## Name State TotalPts PreRating AvgOppPreRating
## 1 GARY HUA 6.0 1794 1605
## 2 DAKSHESH DARURI 6.0 1553 1469
## 3 ADITYA BAJAJ 6.0 1384 1564
## 4 PATRICK H SCHILLING 5.5 1716 1574
## 5 HANSHI ZUO 5.5 1655 1501
## 6 HANSEN SONG 5.0 1686 1519
Week 5B ELO Calculations
To tackle this problem, I will first clean and structure the tournament data so that each player’s pre-tournament rating, total points, and list of opponents are correctly extracted. Using the standard Elo expected score formula, I will calculate the expected score for each game based on the rating difference between a player and their opponent. I will then sum these expected values across all rounds to compute each player’s total expected score. Finally, I will subtract the expected score from the player’s actual tournament score to determine whether they overperformed or underperformed, and rank players accordingly.
One anticipated data challenge is ensuring that opponent pairing numbers correctly match the corresponding player ratings, since mismatches would distort expected score calculations. Another challenge is accurately extracting game results (win, loss, draw) and opponent IDs from semi-structured text using regular expressions. Additionally, small inconsistencies in formatting or missing values could cause alignment issues between player lines and rating lines, so careful validation and checks will be necessary before performing the Elo calculations.
expected_score <- function(player_rating, opp_rating) {
1 / (1 + 10^((opp_rating - player_rating)/400))
}
expected_totals <- sapply(1:length(opponent_list), function(i) {
player_rating <- players$PreRating[i]
opp_nums <- opponent_list[[i]]
opp_ratings <- players$PreRating[match(opp_nums, players$PairNum)]
exp_scores <- expected_score(player_rating, opp_ratings)
sum(exp_scores, na.rm = TRUE)
})
players$ExpectedScore <- round(expected_totals, 2)
players$Difference <- round(players$TotalPts - players$ExpectedScore, 2)
results <- players %>%
select(Name, PreRating, TotalPts, ExpectedScore, Difference) %>%
arrange(desc(Difference))
head(results)
## Name PreRating TotalPts ExpectedScore Difference
## 1 ADITYA BAJAJ 1384 6.0 1.95 4.05
## 2 ZACHARY JAMES HOUGHTON 1220 4.5 1.37 3.13
## 3 ANVIT RAO 1365 5.0 1.94 3.06
## 4 JACOB ALEXANDER LAVALLEY 377 3.0 0.04 2.96
## 5 AMIYATOSH PWNANANDAM 980 3.5 0.77 2.73
## 6 STEFANO LEE 1411 5.0 2.29 2.71
top_over <- results %>%
arrange(desc(Difference)) %>%
head(5)
top_over
## Name PreRating TotalPts ExpectedScore Difference
## 1 ADITYA BAJAJ 1384 6.0 1.95 4.05
## 2 ZACHARY JAMES HOUGHTON 1220 4.5 1.37 3.13
## 3 ANVIT RAO 1365 5.0 1.94 3.06
## 4 JACOB ALEXANDER LAVALLEY 377 3.0 0.04 2.96
## 5 AMIYATOSH PWNANANDAM 980 3.5 0.77 2.73
top_under <- results %>%
arrange(Difference) %>%
head(5)
top_under
## Name PreRating TotalPts ExpectedScore Difference
## 1 LOREN SCHWIEBERT 1745 3.5 6.28 -2.78
## 2 GEORGE AVERY JONES 1522 3.5 6.02 -2.52
## 3 JARED GE 1332 3.0 5.01 -2.01
## 4 RISHI SHETTY 1494 3.5 5.09 -1.59
## 5 JOSHUA DAVID LEE 1438 3.5 4.96 -1.46
Using the standard International Chess Federation (FIDE) expected score formula, I calculated each player’s expected score for every round and summed these values across all seven rounds. I then compared the expected totals to each player’s actual tournament score.
The five largest overperformers were Aditya Bajaj (+4.05), Zachary James Houghton (+3.13), Anvit Rao (+3.06), Jacob Alexander Lavalley (+2.96), and Amiyatosh Pwnanandam (+2.73). These players significantly exceeded rating-based expectations.
The five largest underperformers were Loren Schwiebert (-2.78), George Avery Jones (-2.52), Jared Ge (-2.01), Rishi Shetty (-1.59), and Joshua David Lee (-1.46). These players scored well below what their ratings predicted.
Link to FIDE: https://handbook.fide.com/chapter/B022017