# ---- setup ----
# Chunk defaults for the knitted report: echo the code, but keep package
# start-up messages and warnings out of the rendered document.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Directory that receives the generated CSV. Define the path once and create
# the directory from that single variable; showWarnings = FALSE keeps re-runs
# quiet when the directory already exists.
output_dir <- "output"
dir.create(output_dir, showWarnings = FALSE)
# ---- step1: target output ----
# Column names that assert_output_schema() requires in the final table.
TARGET_COLS <- c(
  "Name",
  "State",
  "TotalPoints",
  "PreRating",
  "AvgOppPreRating"
)
# Check that `df` carries every column listed in TARGET_COLS and coerce those
# columns to their output types.
#
# df: a data frame containing at least the TARGET_COLS columns.
# Returns (invisibly) `df` with Name/State as character, TotalPoints as
# numeric, and PreRating/AvgOppPreRating as integer.
# Stops if `df` is not a data frame or if any target column is absent.
assert_output_schema <- function(df) {
  stopifnot(is.data.frame(df))

  absent <- TARGET_COLS[!TARGET_COLS %in% names(df)]
  if (length(absent) > 0) {
    stop("Missing columns: ", paste(absent, collapse = ", "))
  }

  coerced <- mutate(
    df,
    Name = as.character(Name),
    State = as.character(State),
    TotalPoints = as.numeric(TotalPoints),
    PreRating = as.integer(PreRating),
    AvgOppPreRating = as.integer(AvgOppPreRating)
  )
  invisible(coerced)
}
# Expected values for the first player (Gary Hua); the acceptance checks at the
# end of the script compare against the same values.
expected_first_user <- list(
  Name = "Gary Hua",
  State = "ON",
  TotalPoints = 6.0,
  PreRating = 1794,
  AvgOppPreRating = 1605
)

# ---- step2: fixed path + simpler loaders ----
# Location of the raw tournament text file.
url <- "https://raw.githubusercontent.com/lher96/MSDS-Assignments/main/data.tournament.info.txt"
# Read a text file (local path or URL) and normalise its whitespace so the
# downstream regexes see one consistent layout.
#
# path_or_url: anything readr::read_lines() accepts.
# Returns a character vector with one element per input line, where
#   - non-breaking spaces and tabs have become ordinary spaces,
#   - every "|" is padded with exactly one space on each side,
#   - runs of whitespace are collapsed and the ends are trimmed.
read_and_normalize <- function(path_or_url) {
  readr::read_lines(path_or_url, progress = FALSE) %>%
    stringr::str_replace_all(stringr::fixed("\u00A0"), " ") %>%
    stringr::str_replace_all("\t", " ") %>%
    stringr::str_replace_all("\\s*\\|\\s*", " | ") %>%
    stringr::str_squish()
}
# Load the tournament file once; everything below works on these cleaned lines.
lines_raw <- read_and_normalize(url)

# A "meta" line is any line containing "R:" followed by digits (the rating
# field). Each one is paired with the closest non-empty line above it.
is_meta_line <- function(s) stringr::str_detect(s, "R:\\s*\\d+")
idx_meta <- which(is_meta_line(lines_raw))
stopifnot(length(idx_meta) > 0)

blocks_tbl <- tibble::tibble(
  Line1 = purrr::map_chr(idx_meta, function(meta_idx) {
    # Indices above the meta line, nearest first; keep the non-empty ones.
    above <- rev(seq_len(meta_idx - 1L))
    filled <- above[nzchar(lines_raw[above])]
    if (length(filled) == 0L) {
      stop("Meta line at top without a preceding player line.")
    }
    lines_raw[filled[1L]]
  }),
  Line2 = lines_raw[idx_meta]
)
utils::head(blocks_tbl, 2)
## # A tibble: 2 × 2
## Line1 Line2
## <chr> <chr>
## 1 1 | GARY HUA | 6.0 | W 39 | W 21 | W 18 | W 14 | W 7 | D 12 | D 4 | ON |…
## 2 2 | DAKSHESH DARURI | 6.0 | W 63 | W 58 | L 4 | W 17 | W 16 | W 20 | W … MI |…
# ---- step3_parsers ----
# Line 1 example:
# "1 | GARY HUA |6.0 |W 39|W 21|... "
# Capture groups: (1) pair number, (2) player name, (3) total points.
LINE1_RE <- "^\\s*(\\d+)\\s*\\|\\s*([^|]+?)\\s*\\|\\s*(\\d+(?:\\.\\d)?)\\s*\\|"

# Extract pair number, name and total points from the first line of a block.
#
# line1: a single normalised "Line 1" string.
# Returns a one-row tibble with PlayerNum (integer), Name (character,
# title-cased so it reads like "Gary Hua") and TotalPoints (numeric).
# If the line does not match LINE1_RE a warning is raised and the row holds
# typed NAs instead.
parse_line1_meta <- function(line1) {
  hit <- str_match(line1, LINE1_RE)

  if (!all(is.na(hit))) {
    return(tibble(
      PlayerNum = as.integer(hit[, 2]),
      Name = str_to_title(str_squish(hit[, 3])),
      TotalPoints = as.numeric(hit[, 4])
    ))
  }

  warning("Could not parse Line1: ", line1)
  tibble(
    PlayerNum = NA_integer_,
    Name = NA_character_,
    TotalPoints = NA_real_
  )
}
# From Line 1, get opponent player numbers (skip the first three '|' fields).
#
# line1:     a single normalised "Line 1" string ("Pair | Name | Total | rounds...").
# owner_num: pair number of the player this line belongs to.
# Returns a tibble with one row per round field that contains an opponent
# number: PlayerNum (integer), Round (integer) and OppPlayerNum (integer).
# A zero-row tibble with the same columns is returned when the line has no
# round fields or none of them carries a number.
#
# Round is the position of the field within the round columns, so a field
# without an opponent number (e.g. a bye such as "H" or "B") leaves a gap in
# the numbering instead of shifting every later game one round earlier.
parse_opponents_from_line1 <- function(line1, owner_num) {
  no_games <- tibble(
    PlayerNum = integer(),
    Round = integer(),
    OppPlayerNum = integer()
  )

  fields <- str_split(line1, "\\|")[[1]]
  if (length(fields) <= 3) {
    return(no_games)
  }

  round_fields <- fields[-(1:3)]               # fields after "Pair|Name|Total"
  opp_ids <- str_extract(round_fields, "\\d+") # one number per field, if present
  played <- which(!is.na(opp_ids))             # true round positions with an opponent
  if (length(played) == 0) {
    return(no_games)
  }

  tibble(
    PlayerNum = as.integer(owner_num),
    Round = played,
    OppPlayerNum = as.integer(opp_ids[played])
  )
}
# Line 2 example:
# "ON | 15445895 / R: 1794 ->1817 |N:2 |W ..."
#
# Extract the state code and pre-tournament rating from the second line of a
# player block.
#
# line2: a single normalised "Line 2" string.
# Returns a one-row tibble with State (character) and PreRating (integer).
# Either value is NA when its pattern is not found.
#
# The state regex only matches exactly two capital letters before the first
# "|", so no extra length check is needed; using the capture directly keeps
# State a character column even when nothing matches.
parse_line2_meta <- function(line2) {
  state <- str_match(line2, "^\\s*([A-Z]{2})\\s*\\|")[, 2]
  # Only the leading digits after "R:" are captured, so "1641P17" yields 1641.
  pre <- str_match(line2, "R:\\s*(\\d+)")[, 2]

  tibble(
    State = state,
    PreRating = as.integer(pre)
  )
}
# Parse one two-line player block.
#
# Line1, Line2: the two normalised lines of the block (argument names match
# the columns of blocks_tbl so the function can be used with purrr::pmap()).
# Returns a list with
#   row: one-row tibble combining the Line 1 and Line 2 fields,
#   opp: the player's opponents in long form (one row per numbered opponent).
parse_block <- function(Line1, Line2) {
  line1_fields <- parse_line1_meta(Line1)
  line2_fields <- parse_line2_meta(Line2)

  player_row <- dplyr::bind_cols(line1_fields, line2_fields)
  opponent_rows <- parse_opponents_from_line1(
    Line1,
    owner_num = line1_fields$PlayerNum[1]
  )

  list(row = player_row, opp = opponent_rows)
}
# Parse every (Line1, Line2) pair; each element is list(row = ..., opp = ...).
parsed <- purrr::pmap(blocks_tbl, parse_block)

# Stack the per-player rows and keep the columns used downstream.
players_meta <- parsed %>%
  map_dfr(function(block) block[["row"]]) %>%
  select(PlayerNum, Name, State, TotalPoints, PreRating)

# Stack the per-player opponent tables into one long table.
opponents_long <- map_dfr(parsed, function(block) block[["opp"]])

# Quick sanity checks
stopifnot(nrow(players_meta) > 0, nrow(opponents_long) > 0)
slice(players_meta, 1)
## # A tibble: 1 × 5
## PlayerNum Name State TotalPoints PreRating
## <int> <chr> <chr> <dbl> <int>
## 1 1 Gary Hua ON 6 1794
# Preview the first six opponent rows.
slice(opponents_long, 1:6)
## # A tibble: 6 × 3
## PlayerNum Round OppPlayerNum
## <int> <int> <int>
## 1 1 1 39
## 2 1 2 21
## 3 1 3 18
## 4 1 4 14
## 5 1 5 7
## 6 1 6 12
# ---- step4_join_opp_preratings ----
stopifnot(exists("players_meta"), exists("opponents_long"))

# Keep one row per PlayerNum so each opponent number maps to a single rating.
players_meta <- players_meta %>%
  distinct(PlayerNum, .keep_all = TRUE)

# Map each (PlayerNum, Round) to the opponent's **pre** rating.
# - Rows with a missing or impossible OppPlayerNum (<= 0) and self-pairings
#   are dropped before the join.
# - Opponents that have no row in players_meta are KEPT with
#   OppPreRating = NA (this is a left join); the step-5 mean skips them
#   through na.rm = TRUE.
# - relationship = "many-to-one" makes the join fail loudly if an opponent
#   number ever matched more than one player row.
opp_with_ratings <- opponents_long %>%
  filter(
    !is.na(OppPlayerNum),
    OppPlayerNum > 0,
    OppPlayerNum != PlayerNum # safety: ignore accidental self matches
  ) %>%
  left_join(
    players_meta %>%
      transmute(OppPlayerNum = PlayerNum, OppPreRating = PreRating),
    by = "OppPlayerNum",
    relationship = "many-to-one"
  )

# Preview (the former row.names argument had no effect on tibble printing)
print(utils::head(opp_with_ratings))
## # A tibble: 6 × 4
## PlayerNum Round OppPlayerNum OppPreRating
## <int> <int> <int> <int>
## 1 1 1 39 1436
## 2 1 2 21 1563
## 3 1 3 18 1600
## 4 1 4 14 1610
## 5 1 5 7 1649
## 6 1 6 12 1663
# ---- step5_avg_opp ----
# Mean of `x` ignoring NAs, rounded to the nearest integer; NA when `x` holds
# no non-missing value (the mean is NaN in that case).
rounded_mean_or_na <- function(x) {
  avg <- mean(x, na.rm = TRUE)
  if (is.nan(avg)) {
    return(NA_integer_)
  }
  as.integer(round(avg))
}

# Average opponent pre-rating, one row per player.
avg_opp_pr <- opp_with_ratings %>%
  group_by(PlayerNum) %>%
  summarize(
    AvgOppPreRating = rounded_mean_or_na(OppPreRating),
    .groups = "drop"
  )
utils::head(avg_opp_pr)
## # A tibble: 6 × 2
## PlayerNum AvgOppPreRating
## <int> <int>
## 1 1 1605
## 2 2 1469
## 3 3 1564
## 4 4 1574
## 5 5 1501
## 6 6 1519
# ---- step6_assemble_output ----
# Attach each player's average opponent pre-rating and keep only the output
# columns. Players without a row in avg_opp_pr get AvgOppPreRating = NA from
# the left join itself, so no extra fallback is needed.
final_players <- players_meta %>%
  left_join(avg_opp_pr, by = "PlayerNum") %>%
  # Preserve original listing order by PlayerNum
  arrange(PlayerNum) %>%
  select(Name, State, TotalPoints, PreRating, AvgOppPreRating)

# Quick preview
print(utils::head(final_players))
## # A tibble: 6 × 5
## Name State TotalPoints PreRating AvgOppPreRating
## <chr> <chr> <dbl> <int> <int>
## 1 Gary Hua ON 6 1794 1605
## 2 Dakshesh Daruri MI 6 1553 1469
## 3 Aditya Bajaj MI 6 1384 1564
## 4 Patrick H Schilling MI 5.5 1716 1574
## 5 Hanshi Zuo MI 5.5 1655 1501
## 6 Hansen Song OH 5 1686 1519
# Coerce the output columns to their target types (helper from Step 1).
final_players <- assert_output_schema(final_players)

# ---- step7_write_validate ----
# Write the table to <output_dir>/tournament_players.csv and report the
# absolute path that was written.
output_file <- file.path(output_dir, "tournament_players.csv")
readr::write_csv(final_players, file = output_file)

written_path <- normalizePath(output_file, winslash = "/")
cat("Wrote CSV to:", written_path, "\n")
## Wrote CSV to: /Users/luishernandez/Desktop/MSDS/MS 607/output/tournament_players.csv
# --- Acceptance tests ---
# Expected values for Gary Hua (same values as expected_first_user in Step 1).
EXPECTED_FIRST <- list(Name = "Gary Hua", State = "ON",
                       TotalPoints = 6.0, PreRating = 1794, AvgOppPreRating = 1605)

# Compare one row of the output against a named list of expected values.
#
# row:      a one-row data frame.
# expected: named list; names are column names, values are the expected cells.
# Returns TRUE only if every listed field matches. Numeric fields are compared
# as numbers with all.equal() tolerance and text fields as exact strings, so
# nothing depends on how numbers happen to be formatted as text. A missing
# column or an NA cell counts as a mismatch.
row_matches_expected <- function(row, expected) {
  field_ok <- vapply(names(expected), function(field) {
    actual <- row[[field]]
    target <- expected[[field]]
    if (is.numeric(target)) {
      isTRUE(all.equal(as.numeric(actual), as.numeric(target)))
    } else {
      identical(as.character(actual), as.character(target))
    }
  }, logical(1))
  all(field_ok)
}

# A) First row should match the assignment's example IF the input lists Gary
#    first. Only a warning on mismatch, since some files reorder players.
first_row <- final_players[1, , drop = FALSE]
if (nrow(first_row) == 1) {
  same_first <- row_matches_expected(first_row, EXPECTED_FIRST)
  if (!same_first) {
    warning("First row does not match the Gary Hua example")
  } else {
    message("First-row check passed ")
  }
}

# B) Independent check: find Gary Hua by name (case sensitive to avoid
#    collisions) and require an exact match; this one is a hard failure.
gary_row <- final_players %>% filter(Name == "Gary Hua") %>% slice(1)
if (nrow(gary_row) == 1) {
  same_gary <- row_matches_expected(gary_row, EXPECTED_FIRST)
  if (!same_gary) {
    stop("Gary Hua acceptance test FAILED")
  } else {
    message("Gary Hua acceptance test pass")
  }
} else {
  warning("Could not find 'Gary Hua' row — name/spelling/state")
}