# Read the raw tournament cross-table: one character string per line of text.
# (The file is read once; an identical second read/preview was redundant.)
raw_lines <- readLines("tournamentinfo.txt")
# Look at the first 10 lines to see the layout: dashed separator rows, two
# header rows, then two rows of text per player.
head(raw_lines, 10)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|"
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |"
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA | 6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI | 6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
# Keep only the player data: drop the dashed separator rows and any blank
# lines (a stray blank line would shift the odd/even pairing below).
player_lines <- raw_lines[!grepl("^-", raw_lines) & nzchar(trimws(raw_lines))]
# With the separators gone, the first 2 remaining lines are the column
# headers ("Pair | Player Name ..." and "Num | USCF ID ..."). Negative
# indexing is used instead of 3:length(x), which counts backwards (3, 2, ...)
# when fewer than 3 lines are left.
player_lines <- player_lines[-seq_len(2)]
head(player_lines, 6)
## [1] " 1 | GARY HUA | 6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] " 2 | DAKSHESH DARURI | 6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [4] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [5] " 3 | ADITYA BAJAJ | 6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [6] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
# Each player occupies exactly two consecutive lines; stop early if the
# line count is odd, because row1 and row2 would no longer line up.
stopifnot(
  "expected two lines of text per player" = length(player_lines) %% 2 == 0
)
# Separate into odd lines (pair number, name, points, round results) and even
# lines (state, USCF ID, ratings). A recycled logical mask also works on
# empty input, where seq(1, 0, by = 2) would raise an error.
row1 <- player_lines[c(TRUE, FALSE)] # odd lines
row2 <- player_lines[c(FALSE, TRUE)] # even lines
head(row1, 3)
## [1] " 1 | GARY HUA | 6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI | 6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ | 6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
head(row2, 3)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
library(stringr)
# Player name: the first run of non-pipe characters enclosed by two pipes on
# the first row, with surrounding whitespace removed.
player_name <- row1 |>
  str_extract("(?<=\\|)[^|]+(?=\\|)") |>
  str_trim()
head(player_name, 3)
## [1] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ"
# Total points: the first pipe-delimited field on the first row that looks
# like a decimal number (e.g. "6.0").
total_pts <- row1 |>
  str_extract("(?<=\\|)\\s*\\d+\\.\\d+\\s*(?=\\|)") |>
  str_trim() |>
  as.numeric()
# State: the two capital letters that open the second row.
player_state <- row2 |>
  str_extract("^\\s*([A-Z]{2})") |>
  str_trim()
# Pre-rating: the digits directly after "R:" on the second row. Matching
# stops at the first non-digit, so a provisional suffix such as "P17" is
# not included.
pre_rating <- row2 |>
  str_extract("R:\\s*\\d+") |>
  str_extract("\\d+") |>
  as.numeric()
head(total_pts, 3)
## [1] 6 6 6
head(player_state, 3)
## [1] "ON" "MI" "MI"
head(pre_rating, 3)
## [1] 1794 1553 1384
# Extract the opponent pair numbers from one player's results line.
#
# Only rounds that record a win (W), loss (L) or draw (D) followed by an
# opponent number are counted. Entries without an opponent (byes and
# unplayed rounds such as H, U, B) and X entries are ignored, even if an X
# happens to be followed by a number.
#
# line:    a single character string, e.g. " 1 | GARY HUA | 6.0 |W 39|D 4|".
# returns: a numeric vector of opponent pair numbers in round order;
#          numeric(0) when the line holds no played rounds.
get_opponents <- function(line) {
  # Each token is a result letter, optional spaces, then the opponent number
  round_tokens <- regmatches(
    line,
    gregexpr("[WLD]\\s*\\d+", line, perl = TRUE)
  )[[1]]
  # Strip the result letter and padding, leaving only the digits
  as.numeric(gsub("[^0-9]", "", round_tokens))
}
# One numeric vector of opponent pair numbers per player, in row1 order
opponents <- lapply(row1, get_opponents)
opponents[[1]] # should show Gary Hua's opponents: 39 21 18 14 7 12 4
## [1] 39 21 18 14 7 12 4
# Build a lookup table: pair number -> pre-rating. The pair number is the
# leading integer on each player's first row; it becomes the element name.
pair_nums <- as.numeric(str_trim(str_extract(row1, "^\\s*\\d+")))
rating_lookup <- setNames(pre_rating, pair_nums)
# Calculate average opponent pre-rating for each player, rounded to the
# nearest whole number. vapply() guarantees a numeric vector even when there
# are no players, where sapply() would return an empty list. A player with no
# played rounds gets NaN (the mean of an empty vector).
avg_opp_rating <- vapply(
  opponents,
  function(opp_ids) {
    # Look ratings up by name; opponents missing from the table become NA
    # and are dropped by na.rm = TRUE.
    round(mean(rating_lookup[as.character(opp_ids)], na.rm = TRUE))
  },
  numeric(1)
)
avg_opp_rating[1] # should be 1605 for Gary Hua
## [1] 1605
# Assemble the report table: one row per player, five columns
chess_df <- data.frame(
  Player_Name    = player_name,
  Player_State   = player_state,
  Total_Points   = total_pts,
  Pre_Rating     = pre_rating,
  Avg_Opp_Rating = avg_opp_rating,
  stringsAsFactors = FALSE
)
# Show the first six rows as a sanity check
head(chess_df, n = 6L)
## Player_Name Player_State Total_Points Pre_Rating Avg_Opp_Rating
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519
# Save the table as a CSV file, without the row-number column
write.csv(x = chess_df, file = "chess_results.csv", row.names = FALSE)