library(knitr)
library(ggplot2)
library(forcats)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
To compute each player’s expected score and the difference vs actual, we use the Elo expected score formula for each game:
The formula I am using comes directly from Arpad Elo’s original rating system, which was introduced in the 1960s to improve the older chess rating methods used by the US Chess Federation (USCF). Later, FIDE (the international chess federation) formally adopted it in 1970 as the official system for international chess ratings.
In this chess tournament dataset, the Elo calculation is based on two key columns:
The Elo expected score formula is:
\[ E = \frac{1}{1 + 10^{\frac{(R_{opp} - R_{player})}{400}}} \]
Where:
This gives the expected score for each player in the tournament.Returns a value between 0 and 1, representing the expected fraction of points in one game.
Then the new rating is updated after comparing the actual score (from Points) vs the expected score \(E\):
\[ R_{new} = R_{old} + K \times (S - E) \]
Where:
So in this dataset:
“Gary Hua Elo Calculation Step by Step”
Compute Expected Score
The Elo expected score formula:
\[ E = \frac{1}{1 + 10^{\frac{R_{\text{opp}} - R_{\text{player}}}{400}}} \]
Substitute Gary’s values:
\[ E = \frac{1}{1 + 10^{\frac{1605 - 1794}{400}}} \]
Compute the exponent first:
\[ 1605 - 1794 = -189 \]
\[ \frac{-189}{400} = -0.4725 \]
Then:
\[ 10^{-0.4725} \approx 0.336 \]
\[ 1 + 0.336 = 1.336 \]
\[ E = \frac{1}{1.336} \approx 0.748 \]
This is the expected score per game against a 1605 rated opponent. Returns a value between 0 and 1, representing the expected fraction of points in one game.
Scale to Tournament Points
Assuming 7 rounds in the tournament:
\[ \text{Expected Points} \approx 0.748 \times 7 \approx 5.24 \]
Elo expects Gary to score about 5.24 points.
Compute Difference vs Actual
\[ \text{Diff} = \text{Actual Points} - \text{Expected Points} \]
\[ \text{Diff} = 6.0 - 5.2 = 0.8 \]
Gary slightly outperformed
Load Tournament.txt from Github
https://raw.githubusercontent.com/prnakyazze94/Data_607/refs/heads/main/Class%20Tournament.txt
# Read the file
lines <- readLines("https://raw.githubusercontent.com/prnakyazze94/Data_607/refs/heads/main/Class%20Tournament.txt")
Filter only player lines information.
# Remove separator lines and blank lines
lines <- lines[!grepl("^-+|^\\s*$", lines)]
# Remove the two header lines
lines <- lines[-c(1, 2)]
# Separate into player + info lines
player_lines <- lines[seq(1, length(lines), by = 2)]
info_lines <- lines[seq(2, length(lines), by = 2)]
create df
results <- data.frame(
PairNum = integer(),
Name = character(),
State = character(),
USCF_ID = character(),
PreRating = numeric(),
PostRating = numeric(),
TotalScore = numeric(),
stringsAsFactors = FALSE
)
Extract all Player information, PreRating, Points, State, OpponentNums, OpponentPreRatings calculate Average(mean) and Prepare output data frame
# Extract all pre-ratings once (used for AvgOpponentRating)
pre_ratings <- sapply(info_lines, function(info) {
rating_match <- regmatches(info, regexpr("R:\\s*\\d+", info))
as.integer(gsub("R:\\s*", "", rating_match))
})
# Prepare output data frame
output <- data.frame(
Player = character(),
State = character(),
Points = numeric(),
PreRating = integer(),
AvgOpponentRating = numeric(),
PlayerNum = integer(),
OpponentNums = character(),
OpponentPreRatings = character(),
stringsAsFactors = FALSE
)
# Loop through players
for (i in seq_along(player_lines)) {
pl <- player_lines[i]
info <- info_lines[i]
# Extract clean player name (remove leading number and pipe)
name <- trimws(gsub("^\\d+\\s*\\|\\s*", "", substr(pl, 5, 36)))
# Extract points
points_match <- regmatches(pl, regexpr("\\|\\s*[0-9]+\\.?[0-9]*\\s*\\|", pl))
points <- as.numeric(gsub("[^0-9.]", "", points_match))
# Extract state (e.g., ON, MI)
state_match <- regmatches(info, regexpr("\\b[A-Z]{2}\\b", info))
state <- if (length(state_match) > 0) state_match else NA
# Extract pre-rating
rating_match <- regmatches(info, regexpr("R:\\s*\\d+", info))
pre_rating <- as.integer(gsub("R:\\s*", "", rating_match))
# Extract opponent indices from line 1 (rounds)
rounds <- unlist(strsplit(pl, "\\|"))
rounds <- rounds[4:length(rounds)] # skip first 3 parts
opp_indices <- as.integer(gsub("[^0-9]", "", rounds))
opp_indices <- opp_indices[!is.na(opp_indices)]
# Calculate average opponent rating
if (length(opp_indices) > 0) {
opp_ratings <- pre_ratings[opp_indices]
avg_opp_rating <- round(mean(opp_ratings, na.rm = TRUE), 0)
} else {
avg_opp_rating <- NA
}
# Add row to output
output <- rbind(output, data.frame(
Player = name,
State = state,
Points = points,
PreRating = pre_rating,
AvgOpponentRating = avg_opp_rating,
stringsAsFactors = FALSE
))
}
p <- 10
# Print only the first p rows
print(head(output, n = p))
## Player State Points PreRating AvgOpponentRating
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519
## 7 GARY DEE SWATHELL MI 5.0 1649 1372
## 8 EZEKIEL HOUGHTON MI 5.0 1641 1468
## 9 STEFANO LEE ON 5.0 1411 1523
## 10 ANVIT RAO MI 5.0 1365 1554
# View output
#print(output)
calculate each player’s expected score (e.g. 4.3) and the difference from their actual score (e.g 4.0)
# --- Function to compute expected score for one game ---
elo_expect <- function(r_player, r_opp) {
1 / (1 + 10 ^ ((r_opp - r_player) / 400))
}
# --- Prepare an output frame with consistent columns ---
output <- data.frame(
Player = character(),
Points = numeric(),
PreRating = integer(),
AvgOpponentRating = numeric(),
ExpectedPoints = numeric(),
Diff = numeric(),
NewRating = numeric(),
stringsAsFactors = FALSE
)
n_players <- length(pre_ratings) # assumes pre_ratings exists
K <- 32 # You can adjust this value depending on federation rules
# --- Loop through each player ---
for (i in seq_along(player_lines)) {
pl <- player_lines[i]
info <- info_lines[i]
# Extract player name
name <- trimws(gsub("^\\d+\\s*\\|\\s*", "", substr(pl, 5, 36)))
# Extract points
points_match <- regmatches(pl, regexpr("\\|\\s*[0-9]+\\.?[0-9]*\\s*\\|", pl))
points <- as.numeric(gsub("[^0-9.]", "", points_match))
# Extract pre-rating
rating_match <- regmatches(info, regexpr("R:\\s*\\d+", info))
pre_rating <- if (length(rating_match) > 0) as.integer(gsub("R:\\s*", "", rating_match)) else NA
# Extract opponent indices from round columns
rounds <- unlist(strsplit(pl, "\\|"))
if (length(rounds) >= 4) {
rounds <- rounds[4:length(rounds)]
opp_indices <- as.integer(str_extract(rounds, "\\d+"))
opp_indices <- opp_indices[!is.na(opp_indices)]
opp_indices <- opp_indices[opp_indices >= 1 & opp_indices <= n_players]
} else {
opp_indices <- integer(0)
}
# Compute average opponent rating and expected score
if (length(opp_indices) > 0) {
opp_ratings <- pre_ratings[opp_indices]
avg_opp_rating <- round(mean(opp_ratings, na.rm = TRUE), 0)
exp_per_game <- elo_expect(pre_rating, opp_ratings)
expected_points <- sum(exp_per_game, na.rm = TRUE)
} else {
avg_opp_rating <- NA
expected_points <- NA
}
#Difference vs Actual Score
diff_points <- if (!is.na(expected_points)) points - expected_points else NA
# --- New Elo Rating calculation
new_rating <- if (!is.na(expected_points)) {
pre_rating + K * (points - expected_points)
} else {
NA
}
# Append to output dataframe
output <- rbind(output, data.frame(
Player = name,
Points = points,
PreRating = pre_rating,
AvgOpponentRating = avg_opp_rating,
ExpectedPoints = expected_points,
Diff = diff_points,
NewRating = new_rating,
stringsAsFactors = FALSE
))
}
# --- Rounding & formatting ---
output <- output %>%
mutate(
ExpectedPoints = round(ExpectedPoints, 1),
Diff = round(Diff, 2),
NewRating = round(NewRating, 0)
)
final_output <- output %>%
select(Player, Points, ExpectedPoints, Diff, PreRating, AvgOpponentRating, NewRating)
#print(final_output)
kable(
final_output,
caption = "Tournament Results with Elo Expected Points and Updated Ratings",
align = "lcccccc"
)
Player | Points | ExpectedPoints | Diff | PreRating | AvgOpponentRating | NewRating |
---|---|---|---|---|---|---|
GARY HUA | 6.0 | 5.2 | 0.84 | 1794 | 1605 | 1821 |
DAKSHESH DARURI | 6.0 | 3.8 | 2.22 | 1553 | 1469 | 1624 |
ADITYA BAJAJ | 6.0 | 1.9 | 4.05 | 1384 | 1564 | 1514 |
PATRICK H SCHILLING | 5.5 | 4.7 | 0.76 | 1716 | 1574 | 1740 |
HANSHI ZUO | 5.5 | 4.4 | 1.12 | 1655 | 1501 | 1691 |
HANSEN SONG | 5.0 | 4.9 | 0.06 | 1686 | 1519 | 1688 |
GARY DEE SWATHELL | 5.0 | 4.6 | 0.42 | 1649 | 1372 | 1662 |
EZEKIEL HOUGHTON | 5.0 | 5.0 | -0.03 | 1641 | 1468 | 1640 |
STEFANO LEE | 5.0 | 2.3 | 2.71 | 1411 | 1523 | 1498 |
ANVIT RAO | 5.0 | 1.9 | 3.06 | 1365 | 1554 | 1463 |
CAMERON WILLIAM MC LEMAN | 4.5 | 5.3 | -0.84 | 1712 | 1468 | 1685 |
KENNETH J TACK | 4.5 | 4.1 | 0.39 | 1663 | 1506 | 1676 |
TORRANCE HENRY JR | 4.5 | 5.0 | -0.45 | 1666 | 1498 | 1652 |
BRADLEY SHAW | 4.5 | 4.2 | 0.32 | 1610 | 1515 | 1620 |
ZACHARY JAMES HOUGHTON | 4.5 | 1.4 | 3.13 | 1220 | 1484 | 1320 |
MIKE NIKITIN | 4.0 | 3.8 | 0.20 | 1604 | 1386 | 1610 |
RONALD GRZEGORCZYK | 4.0 | 4.7 | -0.66 | 1629 | 1499 | 1608 |
DAVID SUNDEEN | 4.0 | 4.6 | -0.59 | 1600 | 1480 | 1581 |
DIPANKAR ROY | 4.0 | 4.3 | -0.33 | 1564 | 1426 | 1554 |
JASON ZHENG | 4.0 | 5.1 | -1.13 | 1595 | 1411 | 1559 |
DINH DANG BUI | 4.0 | 4.3 | -0.32 | 1563 | 1470 | 1553 |
EUGENE L MCCLURE | 4.0 | 4.5 | -0.48 | 1555 | 1300 | 1540 |
ALAN BUI | 4.0 | 3.9 | 0.06 | 1363 | 1214 | 1365 |
MICHAEL R ALDRICH | 4.0 | 2.6 | 1.45 | 1229 | 1357 | 1275 |
LOREN SCHWIEBERT | 3.5 | 6.3 | -2.78 | 1745 | 1363 | 1656 |
MAX ZHU | 3.5 | 4.1 | -0.60 | 1579 | 1507 | 1560 |
GAURAV GIDWANI | 3.5 | 4.0 | -0.50 | 1552 | 1222 | 1536 |
SOFIA ADINA STANESCU-BELLU | 3.5 | 3.3 | 0.19 | 1507 | 1522 | 1513 |
CHIEDOZIE OKORIE | 3.5 | 4.6 | -1.10 | 1602 | 1314 | 1567 |
GEORGE AVERY JONES | 3.5 | 6.0 | -2.52 | 1522 | 1144 | 1441 |
RISHI SHETTY | 3.5 | 5.1 | -1.59 | 1494 | 1260 | 1443 |
JOSHUA PHILIP MATHEWS | 3.5 | 3.7 | -0.22 | 1441 | 1379 | 1434 |
JADE GE | 3.5 | 4.6 | -1.14 | 1449 | 1277 | 1412 |
MICHAEL JEFFERY THOMAS | 3.5 | 3.4 | 0.06 | 1399 | 1375 | 1401 |
JOSHUA DAVID LEE | 3.5 | 5.0 | -1.46 | 1438 | 1150 | 1391 |
SIDDHARTH JHA | 3.5 | 2.7 | 0.80 | 1355 | 1388 | 1381 |
AMIYATOSH PWNANANDAM | 3.5 | 0.8 | 2.73 | 980 | 1385 | 1067 |
BRIAN LIU | 3.0 | 2.1 | 0.87 | 1423 | 1539 | 1451 |
JOEL R HENDON | 3.0 | 3.6 | -0.62 | 1436 | 1430 | 1416 |
FOREST ZHANG | 3.0 | 2.9 | 0.06 | 1348 | 1391 | 1350 |
KYLE WILLIAM MURPHY | 3.0 | 2.4 | 0.64 | 1403 | 1248 | 1423 |
JARED GE | 3.0 | 5.0 | -2.01 | 1332 | 1150 | 1268 |
ROBERT GLEN VASEY | 3.0 | 4.3 | -1.33 | 1283 | 1107 | 1240 |
JUSTIN D SCHILLING | 3.0 | 2.1 | 0.93 | 1199 | 1327 | 1229 |
DEREK YAN | 3.0 | 4.4 | -1.37 | 1242 | 1152 | 1198 |
JACOB ALEXANDER LAVALLEY | 3.0 | 0.0 | 2.96 | 377 | 1358 | 472 |
ERIC WRIGHT | 2.5 | 3.2 | -0.69 | 1362 | 1392 | 1340 |
DANIEL KHAIN | 2.5 | 2.5 | -0.03 | 1382 | 1356 | 1381 |
MICHAEL J MARTIN | 2.5 | 2.5 | -0.04 | 1291 | 1286 | 1290 |
SHIVAM JHA | 2.5 | 1.4 | 1.08 | 1056 | 1296 | 1090 |
TEJAS AYYAGARI | 2.5 | 1.0 | 1.47 | 1011 | 1356 | 1058 |
ETHAN GUO | 2.5 | 0.3 | 2.20 | 935 | 1495 | 1006 |
JOSE C YBARRA | 2.0 | 1.7 | 0.28 | 1393 | 1345 | 1402 |
LARRY HODGE | 2.0 | 3.4 | -1.40 | 1270 | 1206 | 1225 |
ALEX KONG | 2.0 | 1.4 | 0.56 | 1186 | 1406 | 1204 |
MARISA RICCI | 2.0 | 1.1 | 0.92 | 1153 | 1414 | 1182 |
MICHAEL LU | 2.0 | 1.3 | 0.70 | 1092 | 1363 | 1114 |
VIRAJ MOHILE | 2.0 | 0.4 | 1.57 | 917 | 1391 | 967 |
SEAN M MC CORMICK | 2.0 | 0.4 | 1.59 | 853 | 1319 | 904 |
JULIA SHEN | 1.5 | 0.6 | 0.90 | 967 | 1330 | 996 |
JEZZEL FARKAS | 1.5 | 1.0 | 0.53 | 955 | 1327 | 972 |
ASHWIN BALAJI | 1.0 | 0.9 | 0.12 | 1530 | 1186 | 1534 |
THOMAS JOSEPH HOSMER | 1.0 | 1.4 | -0.43 | 1175 | 1350 | 1161 |
BEN LI | 1.0 | 2.3 | -1.27 | 1163 | 1263 | 1122 |
Top 5 overperformers
# Top 5 overperformers
kable((output %>%
arrange(desc(Diff)) %>%
slice_head(n = 5) %>%
select(Player, Points, ExpectedPoints, Diff, PreRating, NewRating, AvgOpponentRating,)))
Player | Points | ExpectedPoints | Diff | PreRating | NewRating | AvgOpponentRating |
---|---|---|---|---|---|---|
ADITYA BAJAJ | 6.0 | 1.9 | 4.05 | 1384 | 1514 | 1564 |
ZACHARY JAMES HOUGHTON | 4.5 | 1.4 | 3.13 | 1220 | 1320 | 1484 |
ANVIT RAO | 5.0 | 1.9 | 3.06 | 1365 | 1463 | 1554 |
JACOB ALEXANDER LAVALLEY | 3.0 | 0.0 | 2.96 | 377 | 472 | 1358 |
AMIYATOSH PWNANANDAM | 3.5 | 0.8 | 2.73 | 980 | 1067 | 1385 |
Aditya’s PreRating is 1384.
His Opponents averaged 1564, which is 180 points higher.
Elo predicts Aditya should lose most games, his expected score is 1.9 points. But he actually scored 6 points beating much stronger players.
As a result, Elo rewarded him with +130 rating points, bringing his new prerating to 1514.
PLOT TO VISUALLY SHOW ELO TOP PERFORMERS
top5 <- output %>%
arrange(desc(Diff)) %>%
slice_head(n = 5) %>%
select(Player, Points, ExpectedPoints, Diff, PreRating, NewRating, AvgOpponentRating)
# Plot: Actual vs Expected Points with Diff labels
ggplot(top5, aes(x = reorder(Player, -Diff))) +
geom_col(aes(y = Points, fill = "Actual Points"), width = 0.6) +
geom_line(aes(y = ExpectedPoints, group = 1, color = "Expected Points"), size = 1) +
geom_point(aes(y = ExpectedPoints, color = "Expected Points"), size = 3) +
geom_text(aes(y = Points + 0.2, label = paste0("Diff=", round(Diff, 2))),
vjust = 0, size = 3.5) +
scale_fill_manual("", values = c("Actual Points" = "orange")) +
scale_color_manual("", values = c("Expected Points" = "red")) +
labs(title = "Top 5 Elo Overperformers",
x = "Player", y = "Points") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
You can see that all these players scored well above Elo expectations,
which explains why each of them gained rating points.
All five players in this plot overperformed relative to Elo.
The red “expected” line sits much lower than the orange bars, this visual gap highlights how much each exceeded expectations.
These players see big Elo rating gains after the tournament.
The biggest standout is Aditya Bajaj, with over 4 points more than expected.
5 TOP UNDER PERFORMERS
# Top 5 underperformers
kable((output %>%
arrange(Diff) %>%
slice_head(n = 5) %>%
select(Player, Points, ExpectedPoints, Diff, PreRating, NewRating, AvgOpponentRating,)))
Player | Points | ExpectedPoints | Diff | PreRating | NewRating | AvgOpponentRating |
---|---|---|---|---|---|---|
LOREN SCHWIEBERT | 3.5 | 6.3 | -2.78 | 1745 | 1656 | 1363 |
GEORGE AVERY JONES | 3.5 | 6.0 | -2.52 | 1522 | 1441 | 1144 |
JARED GE | 3.0 | 5.0 | -2.01 | 1332 | 1268 | 1150 |
RISHI SHETTY | 3.5 | 5.1 | -1.59 | 1494 | 1443 | 1260 |
JOSHUA DAVID LEE | 3.5 | 5.0 | -1.46 | 1438 | 1391 | 1150 |
LOREN SCHWIEBERT’s PreRating is 1745 which suggests Loren was much stronger than his opponents with a 1363 averageopponents preRating.
Elo expected him to dominate 6.3 points.
But he only scored 3.5.
Therefore, Elo deducted about 89 rating points, giving him a NewRating of 1656 which better reflects his actual tournament strength.
PLOT TO VISUALLY SHOW ELO TOP UNDER PERFORMERS
lower5 <- output %>%
arrange(Diff) %>%
slice_head(n = 5) %>%
select(Player, Points, ExpectedPoints, Diff, PreRating, NewRating, AvgOpponentRating)
# Plot: Actual vs Expected Points with Diff labels
ggplot(lower5, aes(x = reorder(Player, -Diff))) +
geom_col(aes(y = Points, fill = "Actual Points"), width = 0.6) +
geom_line(aes(y = ExpectedPoints, group = 1, color = "Expected Points"), size = 1) +
geom_point(aes(y = ExpectedPoints, color = "Expected Points"), size = 3) +
geom_text(aes(y = Points + 0.2, label = paste0("Diff=", round(Diff, 2))),
vjust = 0, size = 3.5) +
scale_fill_manual("", values = c("Actual Points" = "orange")) +
scale_color_manual("", values = c("Expected Points" = "red")) +
labs(title = "Top 5 Elo Underperformers",
x = "Player", y = "Points") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
The red “expected” line sits much higher above the orange bars, this
visual gap highlights how much each player did not meet
expectations.
These players see big Elo rating gains after the tournament.
The biggest standout is LOREN SCHWIEBERT, with 2.78 points less than expected.