# Read the raw tournament cross-table, one vector element per line of text.
# The file is read exactly once; every later step works from `raw_lines`.
raw_lines <- readLines("tournamentinfo.txt")

# Look at the first 10 lines: a dashed separator, two header lines, another
# separator, then two lines per player with a separator after each player.
head(raw_lines, 10)
##  [1] "-----------------------------------------------------------------------------------------"
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round|"
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  |"
##  [4] "-----------------------------------------------------------------------------------------"
##  [5] "    1 | GARY HUA                        | 6.0 |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
##  [6] "   ON | 15445895 / R: 1794 ->1817       |N:2  |W    |B    |W    |B    |W    |B    |W    |"
##  [7] "-----------------------------------------------------------------------------------------"
##  [8] "    2 | DAKSHESH DARURI                 | 6.0 |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
##  [9] "   MI | 14598900 / R: 1553 ->1663       |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [10] "-----------------------------------------------------------------------------------------"
# Keep only player data: drop the dashed separator lines, then the header.
player_lines <- raw_lines[!grepl("^-", raw_lines)]  # remove dash lines
# With the separators gone the header is exactly 2 lines (" Pair | Player Name"
# and " Num  | USCF ID"). tail(x, -2) drops them and, unlike 3:length(x),
# cannot count backwards and produce NA entries when fewer than 3 lines remain.
player_lines <- tail(player_lines, -2)
head(player_lines, 6)
## [1] "    1 | GARY HUA                        | 6.0 |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794 ->1817       |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [3] "    2 | DAKSHESH DARURI                 | 6.0 |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [4] "   MI | 14598900 / R: 1553 ->1663       |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [5] "    3 | ADITYA BAJAJ                    | 6.0 |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [6] "   MI | 14959604 / R: 1384 ->1640       |N:2  |W    |B    |W    |B    |W    |B    |W    |"
# Every player spans two consecutive lines; split them into parallel vectors.
# row1: pair number, name, total points and per-round results (lines 1, 3, 5, ...)
row1 <- player_lines[seq.int(from = 1, to = length(player_lines), by = 2)]
# row2: state, USCF ID and ratings (lines 2, 4, 6, ...)
row2 <- player_lines[seq.int(from = 2, to = length(player_lines), by = 2)]

head(row1, 3)
## [1] "    1 | GARY HUA                        | 6.0 |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "    2 | DAKSHESH DARURI                 | 6.0 |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [3] "    3 | ADITYA BAJAJ                    | 6.0 |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
head(row2, 3)
## [1] "   ON | 15445895 / R: 1794 ->1817       |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [2] "   MI | 14598900 / R: 1553 ->1663       |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [3] "   MI | 14959604 / R: 1384 ->1640       |N:2  |W    |B    |W    |B    |W    |B    |W    |"
library(stringr)

# Player name: the first stretch of text enclosed by two pipes on the results
# line, with the column padding trimmed off.
player_name <- row1 |>
  str_extract("(?<=\\|)[^|]+(?=\\|)") |>
  str_trim()
head(player_name, 3)
## [1] "GARY HUA"        "DAKSHESH DARURI" "ADITYA BAJAJ"
# Total points: the first decimal number that fills a whole pipe-delimited
# field on the results line
total_pts <- row1 |>
  str_extract("(?<=\\|)\\s*\\d+\\.\\d+\\s*(?=\\|)") |>
  str_trim() |>
  as.numeric()

# State: the two capital letters that open the second line
player_state <- row2 |>
  str_extract("^\\s*([A-Z]{2})") |>
  str_trim()

# Pre-rating: the digits right after "R:". Matching stops at the first
# non-digit, so any P and the numbers after it, as well as the post-rating
# following "->", are left out.
pre_rating <- row2 |>
  str_extract("R:\\s*\\d+") |>
  str_extract("\\d+") |>
  as.numeric()

head(total_pts, 3)
## [1] 6 6 6
head(player_state, 3)
## [1] "ON" "MI" "MI"
head(pre_rating, 3)
## [1] 1794 1553 1384
# Extract the opponent pair numbers from one player's results line.
#
# Only rounds that were actually played (W, L or D followed by a pair number)
# contribute; byes and unplayed rounds (H, U, B, X) carry no number and are
# skipped.
#
# line: a single results line, e.g.
#       "    1 | GARY HUA   | 6.0 |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
# Returns a numeric vector of opponent pair numbers in round order; it is
# empty (numeric(0)) when no round was played.
get_opponents <- function(line) {
  # Fields 1-3 are pair number, name and total points; the rest are rounds.
  # Looking only at the round fields keeps text in the name column (such as
  # "D 3RD") from being mistaken for a result.
  fields <- trimws(strsplit(line, "|", fixed = TRUE)[[1]])
  rounds <- fields[-(1:3)]
  # A played round is exactly one result letter, optional padding, digits.
  played <- grepl("^[WLD]\\s*\\d+$", rounds, perl = TRUE)
  as.numeric(sub("^[WLD]\\s*", "", rounds[played], perl = TRUE))
}

# Opponent pair numbers for every player, in the same order as row1
opponents <- lapply(row1, get_opponents)
opponents[[1]]  # should show Gary Hua's opponents: 39 21 18 14 7 12 4
## [1] 39 21 18 14  7 12  4
# Build a lookup table: pair number -> pre-rating
pair_nums <- as.numeric(str_trim(str_extract(row1, "^\\s*\\d+")))
rating_lookup <- setNames(pre_rating, pair_nums)

# Calculate average opponent pre-rating for each player.
# vapply() guarantees exactly one number per player and a numeric vector even
# when `opponents` is empty, where sapply() would silently return a list.
avg_opp_rating <- vapply(opponents, function(opp_ids) {
  # Index by name (the pair number as text) rather than by position, so the
  # lookup does not depend on the order of the players in the file.
  # A player with no played rounds averages nothing and yields NaN.
  round(mean(rating_lookup[as.character(opp_ids)], na.rm = TRUE))
}, numeric(1))

avg_opp_rating[1]  # should be 1605 for Gary Hua
## [1] 1605
# Assemble the final table: one row per player, one column per parsed field.
# stringsAsFactors is spelled out to make explicit that names and states stay
# plain character columns.
chess_df <- data.frame(
  Player_Name      = player_name,
  Player_State     = player_state,
  Total_Points     = total_pts,
  Pre_Rating       = pre_rating,
  Avg_Opp_Rating   = avg_opp_rating,
  stringsAsFactors = FALSE
)

# Preview the first rows
head(chess_df)
##           Player_Name Player_State Total_Points Pre_Rating Avg_Opp_Rating
## 1            GARY HUA           ON          6.0       1794           1605
## 2     DAKSHESH DARURI           MI          6.0       1553           1469
## 3        ADITYA BAJAJ           MI          6.0       1384           1564
## 4 PATRICK H SCHILLING           MI          5.5       1716           1574
## 5          HANSHI ZUO           MI          5.5       1655           1501
## 6         HANSEN SONG           OH          5.0       1686           1519
# Save the table as CSV, without the automatic row-number column
write.csv(x = chess_df, file = "chess_results.csv", row.names = FALSE)