1 1. Load raw data
2 2. Inspect & segment into player blocks
3 3. Extract player fields
4 4. Expand opponents and compute average opponent pre-rating
5 5. Sanity check: Gary Hua example
6 6. Export CSV
7 7. Notes / Edge cases

1 1. Load raw data

Place tournamentinfo.txt in the project root or data/. Adjust the path below if needed.

# Try common locations
paths <- c("tournamentinfo.txt", "data/tournamentinfo.txt")
path <- paths[file.exists(paths)][1]

# Fail fast, naming the locations that were searched, if the file is absent.
if (is.na(path)) {
  stop(
    "tournamentinfo.txt not found; looked in: ",
    paste(paths, collapse = ", "),
    call. = FALSE
  )
}

# warn = FALSE silences the warning about a missing final end-of-line.
raw <- readLines(path, warn = FALSE)
length(raw)

## [1] 196

# Show the first 40 lines of the raw file.
head(raw, 40)

##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------" 
## [11] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|" 
## [12] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
## [13] "-----------------------------------------------------------------------------------------" 
## [14] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|" 
## [15] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |" 
## [16] "-----------------------------------------------------------------------------------------" 
## [17] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|" 
## [18] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [19] "-----------------------------------------------------------------------------------------" 
## [20] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|" 
## [21] "   OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |" 
## [22] "-----------------------------------------------------------------------------------------" 
## [23] "    7 | GARY DEE SWATHELL               |5.0  |W  57|W  46|W  13|W  11|L   1|W   9|L   2|" 
## [24] "   MI | 11146376 / R: 1649   ->1673     |N:3  |W    |B    |W    |B    |B    |W    |W    |" 
## [25] "-----------------------------------------------------------------------------------------" 
## [26] "    8 | EZEKIEL HOUGHTON                |5.0  |W   3|W  32|L  14|L   9|W  47|W  28|W  19|" 
## [27] "   MI | 15142253 / R: 1641P17->1657P24  |N:3  |B    |W    |B    |W    |B    |W    |W    |" 
## [28] "-----------------------------------------------------------------------------------------" 
## [29] "    9 | STEFANO LEE                     |5.0  |W  25|L  18|W  59|W   8|W  26|L   7|W  20|" 
## [30] "   ON | 14954524 / R: 1411   ->1564     |N:2  |W    |B    |W    |B    |W    |B    |B    |" 
## [31] "-----------------------------------------------------------------------------------------" 
## [32] "   10 | ANVIT RAO                       |5.0  |D  16|L  19|W  55|W  31|D   6|W  25|W  18|" 
## [33] "   MI | 14150362 / R: 1365   ->1544     |N:3  |W    |W    |B    |B    |W    |B    |W    |" 
## [34] "-----------------------------------------------------------------------------------------" 
## [35] "   11 | CAMERON WILLIAM MC LEMAN        |4.5  |D  38|W  56|W   6|L   7|L   3|W  34|W  26|" 
## [36] "   MI | 12581589 / R: 1712   ->1696     |N:3  |B    |W    |B    |W    |B    |W    |B    |" 
## [37] "-----------------------------------------------------------------------------------------" 
## [38] "   12 | KENNETH J TACK                  |4.5  |W  42|W  33|D   5|W  38|H    |D   1|L   3|" 
## [39] "   MI | 12681257 / R: 1663   ->1670     |N:3  |W    |B    |W    |B    |     |W    |B    |" 
## [40] "-----------------------------------------------------------------------------------------"

2 2. Inspect & segment into player blocks

The file typically has player records spanning 2 lines per player: a header line (pair number, name, total points, and one result per round) and a detail line (state, USCF ID, and pre/post ratings).

# 2) Segment into player blocks (robust)
# Strip carriage returns and trailing whitespace before any matching.
clean <- stringr::str_trim(
  stringr::str_replace_all(raw, "\\r", ""),
  side = "right"
)

# Keep only the lines that contain a pipe.
tab <- stringr::str_subset(clean, "\\|")

# Positions of rows that open with a numeric pair id like " 1 | ..."
idx <- stringr::str_which(tab, "^\\s*\\d+\\s*\\|")

# Each header row is paired with the row directly after it (its detail row).
blocks <- tibble::tibble(header = tab[idx], detail = tab[idx + 1])

# sanity checks
nrow(blocks)

## [1] 64

# First header row: pair number, name, total points, per-round results.
blocks$header[1]

## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"

# Detail row paired with that header: state, USCF ID, pre/post ratings.
blocks$detail[1]

## [1] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

3 3. Extract player fields

We want: Name, State, Total Points, Pre-Rating, and the list of Opponent Pair Numbers.

# ---- helper ----
# Split one pipe-delimited row into its cells, trimming whitespace from each.
# Only the first element of `x` is used.
split_cols <- function(x) {
  cells <- stringr::str_split(x, "\\|")
  stringr::str_trim(cells[[1]])
}

# ---- header parser: name, points, opponents (from HEADER line) ----
# Parse the first line of a player block.
#
# x: one header row, e.g. "    1 | GARY HUA   |6.0  |W  39|...|D   4|".
# Returns a one-row tibble with `pair` (integer), `name` (title-cased),
# `total_pts` (numeric) and `opponents` (a list column holding an integer
# vector of opponent pair numbers).
parse_header <- function(x) {
  cols <- split_cols(x)

  # Pair & Name
  pair_val <- as.integer(stringr::str_extract(cols[1], "\\d+"))
  name_val <- stringr::str_to_title(stringr::str_squish(cols[2]))

  # Total Points = first numeric-looking field after name
  after_name <- cols[-(1:2)]
  tp_first <- stringr::str_extract(after_name, "\\d+(?:\\.\\d+)?")
  total_pts_val <- as.numeric(tp_first[which(!is.na(tp_first))[1]])

  # Opponent pair numbers appear as W/D/L/B/H/U tokens, e.g. "W 39", "D12",
  # "L-7", "B 0". Only the cells after the name are searched, so text in the
  # pair or name columns can never be mistaken for a round result.
  tokens <- stringr::str_extract_all(
    after_name,
    "(?i)\\b[WDLHBU][\\s-]*\\d+"
  )
  tokens <- as.character(unlist(tokens, use.names = FALSE))
  opp_nums <- as.integer(stringr::str_extract(tokens, "\\d+"))
  opp_nums <- opp_nums[!is.na(opp_nums) & opp_nums > 0]  # drop byes/zeros

  tibble::tibble(
    pair = pair_val,
    name = name_val,
    total_pts = total_pts_val,
    opponents = list(opp_nums)
  )
}

# ---- detail parser: state + pre-rating (from DETAIL line) ----
# Parse the second line of a player block.
#
# x: one detail row, e.g. "   ON | 15445895 / R: 1794   ->1817     |N:2  |...".
# Returns a one-row tibble with `state` (character, NA when no recognised
# region code is present) and `pre_rating` (integer, NA when no "R: <digits>"
# is found).
parse_detail <- function(x) {
  # Tabs become spaces, then whitespace runs collapse to single spaces.
  flat <- stringr::str_squish(stringr::str_replace_all(x, "\\t", " "))

  # US states + Canadian provinces/territories
  canada <- c("ON","QC","BC","AB","SK","MB","NB","NS","PE","NL","YT","NT","NU")
  allowed_regions <- c(state.abb, canada)

  # State = first standalone two-capital-letter token that is a known region.
  # Indexing an empty vector with [1] yields NA_character_.
  candidates <- stringr::str_extract_all(flat, "\\b[A-Z]{2}\\b")[[1]]
  state_val <- candidates[candidates %in% allowed_regions][1]

  # Pre-rating like "R: 1794"; the capture stops at the first non-digit, so a
  # provisional suffix such as "P17" is not included.
  rating_match <- stringr::str_match(flat, "R:\\s*(\\d+)")
  pre <- as.integer(rating_match[, 2])

  tibble::tibble(state = state_val, pre_rating = pre)
}

# ---- build players ----
# Parse every header row and every detail row into one tibble each.
hdr <- blocks$header |> purrr::map_dfr(parse_header)
det <- blocks$detail |> purrr::map_dfr(parse_detail)

# Rows line up one-to-one, so the two tibbles are combined side by side.
players <- dplyr::bind_cols(hdr, det)

# Drop any stray header row (safety)
players <- dplyr::filter(
  players,
  !grepl("^Player\\s*Name$", name, ignore.case = TRUE)
)

# Quick diagnostics
head(players, 5)

## # A tibble: 5 × 6
##    pair name                total_pts opponents state pre_rating
##   <int> <chr>                   <dbl> <list>    <chr>      <int>
## 1     1 Gary Hua                  6   <int [7]> ON          1794
## 2     2 Dakshesh Daruri           6   <int [7]> MI          1553
## 3     3 Aditya Bajaj              6   <int [7]> MI          1384
## 4     4 Patrick H Schilling       5.5 <int [7]> MI          1716
## 5     5 Hanshi Zuo                5.5 <int [7]> MI          1655

# Name, pair, pre-rating and number of parsed opponents for the first 12 rows.
players |>
  dplyr::transmute(
    name,
    pair,
    pre_rating,
    opp_len = lengths(opponents)
  ) |>
  head(12)

## # A tibble: 12 × 4
##    name                      pair pre_rating opp_len
##    <chr>                    <int>      <int>   <int>
##  1 Gary Hua                     1       1794       7
##  2 Dakshesh Daruri              2       1553       7
##  3 Aditya Bajaj                 3       1384       7
##  4 Patrick H Schilling          4       1716       7
##  5 Hanshi Zuo                   5       1655       7
##  6 Hansen Song                  6       1686       7
##  7 Gary Dee Swathell            7       1649       7
##  8 Ezekiel Houghton             8       1641       7
##  9 Stefano Lee                  9       1411       7
## 10 Anvit Rao                   10       1365       7
## 11 Cameron William Mc Leman    11       1712       7
## 12 Kenneth J Tack              12       1663       6

4 4. Expand opponents and compute average opponent pre-rating

# Build lookup: pair -> pre_rating
lookup <- players |> dplyr::select(pair, pre_rating)

# Unnest opponent list into rows (one row per player/opponent pairing) and
# keep only ids that refer to another player in the table.
opp_tbl <- players |>
  dplyr::mutate(opp_num = opponents) |>
  dplyr::select(pair, name, state, total_pts, pre_rating, opp_num) |>
  tidyr::unnest_longer(opp_num, values_to = "opp_pair", keep_empty = TRUE) |>
  dplyr::mutate(opp_pair = as.integer(opp_pair)) |>
  dplyr::filter(
    !is.na(opp_pair),
    opp_pair > 0,
    opp_pair != pair,
    opp_pair %in% players$pair
  )

# Join opponents to their pre-ratings (opponent rating lands in
# `pre_rating_opp`).
opp_joined <- opp_tbl |>
  dplyr::left_join(lookup, by = c("opp_pair" = "pair"), suffix = c("", "_opp"))

# Mean opponent pre-rating per player, for players with at least one opponent.
avg_by_pair <- opp_joined |>
  dplyr::group_by(pair) |>
  dplyr::summarize(
    avg_opp_pre = mean(pre_rating_opp, na.rm = TRUE),
    .groups = "drop"
  )

# Attach the averages to every player. Starting from `players` (rather than
# from `opp_joined`) keeps players who have no valid opponents; they get
# avg_opp_pre = NA instead of being dropped from the result.
final <- players |>
  dplyr::select(pair, name, state, total_pts, pre_rating) |>
  dplyr::left_join(avg_by_pair, by = "pair") |>
  dplyr::mutate(
    # NaN arises when every opponent rating in a group is NA.
    avg_opp_pre = dplyr::if_else(
      is.nan(avg_opp_pre),
      NA_real_,
      round(avg_opp_pre)
    )
  ) |>
  dplyr::arrange(pair)

5 5. Sanity check: Gary Hua example

The assignment’s example states that Gary Hua should have an average opponent pre-rating of 1605. Confirm below.

# Case-insensitive exact match on the name; calls are namespace-qualified so
# the check runs without attaching dplyr or stringr.
final |>
  dplyr::filter(
    stringr::str_detect(
      name,
      stringr::regex("^Gary Hua$", ignore_case = TRUE)
    )
  )

## # A tibble: 1 × 6
##    pair name     state total_pts pre_rating avg_opp_pre
##   <int> <chr>    <chr>     <dbl>      <int>       <dbl>
## 1     1 Gary Hua ON            6       1794        1605

6 6. Export CSV

# Rename columns to the report headings and write the CSV to the working
# directory. Calls are namespace-qualified so the export runs without
# attaching dplyr or readr.
out <- final |>
  dplyr::transmute(
    Name = name,
    State = state,
    `Total Points` = total_pts,
    `Pre-Rating` = pre_rating,
    `Avg Opponent Pre-Rating` = avg_opp_pre
  )
readr::write_csv(out, "project1_results.csv")

7 7. Notes / Edge cases

Byes/forfeits: Rounds with no opponent number (e.g. byes marked H, U, or B) produce no opponent id, so they are excluded from the average.
Rounding: Averages are rounded to nearest integer.
Reproducibility: Re-run the entire document from a fresh session to regenerate results.

DATA 607 – Project 1: Chess Tournament Analysis

Kevin Martin