This project analyzes chess tournament results from a text file. The goal is to extract relevant player information, including names, states, total points, and pre-tournament ratings, and to calculate the average pre-tournament rating of each player’s opponents. The data is later compiled into a clean CSV file for future analysis or database import.
# ---- Load and parse the tournament report ----
# Produces, one element per player and all in the same order:
#   names, states, total_points (character), pre_rating (numeric),
#   pair_nums (numeric) and pre_rating_lookup (pre_rating named by pair number).
lines <- readLines(
  "https://raw.githubusercontent.com/vincent-usny/project1/refs/heads/main/project1.txt",
  warn = FALSE
)

# Drop the first 4 lines. Everything after them is read in groups of three
# lines per player; only the first two lines of each group are used below.
data <- lines[-(1:4)]
if (length(data) < 2L) {
  stop("Tournament file contains no player records.", call. = FALSE)
}

# Line 1 of each group, split on "|":
#   field 1 = pair number, field 2 = player name, field 3 = total points,
#   fields 4-10 = round results (used further down the script).
players_lines <- data[seq(1, length(data), by = 3)]
parts_line1 <- strsplit(players_lines, "\\|")

# Line 2 of each group, split on "|":
#   field 1 = state, field 2 = text containing "R: <digits>" (the pre-rating).
state_lines <- data[seq(2, length(data), by = 3)]
parts_line2 <- strsplit(state_lines, "\\|")
if (length(players_lines) != length(state_lines)) {
  stop("Unexpected file layout: player and state line counts differ.",
       call. = FALSE)
}

# get player names
# NOTE: `names` shadows base::names as a variable; function calls such as
# names(x) still resolve to the base function.
names <- vapply(parts_line1, function(x) trimws(x[2]), character(1))

# get states
states <- vapply(parts_line2, function(x) trimws(x[1]), character(1))

# get total points (kept as text here; converted with as.numeric() where used)
total_points <- vapply(parts_line1, function(x) trimws(x[3]), character(1))

# get pre-rating: the digits that follow "R:" in field 2 of the state line
ratings <- vapply(parts_line2, function(x) x[2], character(1))
if (!all(grepl("R: *\\d+", ratings))) {
  # Without this check sub() would return the unmatched text unchanged and
  # as.numeric() would quietly turn it into NA.
  stop("Could not find a pre-rating ('R: <digits>') for every player.",
       call. = FALSE)
}
pre_rating <- as.numeric(sub(".*R: *(\\d+).*", "\\1", ratings))

# Pair numbers come from field 1 of the already-split player lines.
pair_nums <- as.numeric(vapply(parts_line1, function(x) x[1], character(1)))
if (anyNA(pair_nums) || anyDuplicated(pair_nums) > 0) {
  stop("Pair numbers must be present and unique for every player.",
       call. = FALSE)
}

# Named vector: names are pair numbers (as text), values are pre-ratings.
pre_rating_lookup <- setNames(pre_rating, pair_nums)
# Average pre-rating of the opponents each player faced, rounded to a whole
# number. One value per element of parts_line1.
#
# Fields 4-10 of a split player line hold one cell per round. Only cells whose
# first character is W, L or D are counted; the second whitespace-separated
# token of such a cell is the opponent's pair number.
avg_opp_rating <- vapply(parts_line1, function(cells) {
  rounds <- trimws(cells[4:10])
  played <- rounds[substr(rounds, 1, 1) %in% c("W", "L", "D")]
  if (length(played) == 0) {
    # No counted games: report NA rather than averaging an empty vector.
    return(NA_real_)
  }
  opp_tokens <- vapply(
    strsplit(played, "\\s+"),
    function(tok) tok[2],
    character(1)
  )
  # Look opponents up by NAME (pair number), not by position, so the result
  # does not depend on players being listed in pair-number order.
  # as.numeric() first normalises the token (e.g. strips leading zeros).
  opp_ids <- as.character(as.numeric(opp_tokens))
  mean(pre_rating_lookup[opp_ids])
}, numeric(1))
avg_opp_rating <- round(avg_opp_rating)
# Per-player summary table: one row per player, columns in export order.
# Total_points is converted to numeric so the column can be summed, sorted and
# compared as a number (total_points itself is parsed as text).
# NOTE(review): the accompanying write-up mentions a CSV export, but no
# write.csv() call is visible in this part of the script — confirm it exists.
chess_df <- data.frame(
  Player = names,
  State = states,
  Total_points = as.numeric(total_points),
  Pre_rating = pre_rating,
  Avg_opp_rating = avg_opp_rating
)
head(chess_df)
## Player State Total_points Pre_rating Avg_opp_rating
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519
# Calculate expected scores
# For every player, sum the per-game expected score against each opponent
# faced (cells in fields 4-10 that start with W, L or D), using
#   E = 1 / (1 + 10^((opponent_rating - player_rating) / 400))
# formula source: https://handbook.fide.com/chapter/B022014
exp_score <- vapply(seq_along(parts_line1), function(i) {
  rounds <- trimws(parts_line1[[i]][4:10])
  played <- rounds[substr(rounds, 1, 1) %in% c("W", "L", "D")]
  opp_tokens <- vapply(
    strsplit(played, "\\s+"),
    function(tok) tok[2],
    character(1)
  )
  # Look opponents up by NAME (pair number), not by position, so the result
  # does not depend on players being listed in pair-number order.
  opp_ids <- as.character(as.numeric(opp_tokens))
  opp_ratings <- unname(pre_rating_lookup[opp_ids])
  # Vectorised over all opponents; a player with no counted games gets 0.
  sum(1 / (1 + 10^((opp_ratings - pre_rating[i]) / 400)))
}, numeric(1))
exp_score <- round(exp_score, 2)

# Positive difference = scored more than expected; negative = scored less.
diff_score <- as.numeric(total_points) - exp_score

# create table for players and the diff btw actual
# and expected score
result <- data.frame(
  Player = names,
  Pre_rating = pre_rating,
  Actual_score = as.numeric(total_points),
  Expected_score = exp_score,
  Difference = diff_score
)
head(result)
## Player Pre_rating Actual_score Expected_score Difference
## 1 GARY HUA 1794 6.0 5.16 0.84
## 2 DAKSHESH DARURI 1553 6.0 3.78 2.22
## 3 ADITYA BAJAJ 1384 6.0 1.95 4.05
## 4 PATRICK H SCHILLING 1716 5.5 4.74 0.76
## 5 HANSHI ZUO 1655 5.5 4.38 1.12
## 6 HANSEN SONG 1686 5.0 4.94 0.06
# sort the overperformers and underperformers
# Keep the five ROWS with the largest / smallest Difference. head(df, 5) is
# used because a single index such as df[1:5] selects columns of a data
# frame, not rows; head() also copes with fewer than five players.
overperformers <- head(result[order(-result$Difference), ], 5)
underperformers <- head(result[order(result$Difference), ], 5)
head(overperformers,5)
## Player Pre_rating Actual_score Expected_score Difference
## 3 ADITYA BAJAJ 1384 6.0 1.95 4.05
## 15 ZACHARY JAMES HOUGHTON 1220 4.5 1.37 3.13
## 10 ANVIT RAO 1365 5.0 1.94 3.06
## 46 JACOB ALEXANDER LAVALLEY 377 3.0 0.04 2.96
## 37 AMIYATOSH PWNANANDAM 980 3.5 0.77 2.73
head(underperformers,5)
## Player Pre_rating Actual_score Expected_score Difference
## 25 LOREN SCHWIEBERT 1745 3.5 6.28 -2.78
## 30 GEORGE AVERY JONES 1522 3.5 6.02 -2.52
## 42 JARED GE 1332 3.0 5.01 -2.01
## 31 RISHI SHETTY 1494 3.5 5.09 -1.59
## 35 JOSHUA DAVID LEE 1438 3.5 4.96 -1.46
The tournament data was successfully extracted and cleaned, producing a new dataset that contains each player’s name, state, total points, pre-tournament rating, and the average pre-tournament rating of their opponents. The CSV file is now available to be imported into MySQL or other data tools.