Project1.607.rmd

# ---- setup ----

# Chunk defaults for the knitted report: echo the code, but keep package
# messages and warnings out of the rendered output.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Directory that receives the generated CSV. It is created exactly once, via
# the variable, so the path lives in a single place; showWarnings = FALSE keeps
# reruns quiet when the directory already exists.
output_dir <- "output"
dir.create(output_dir, showWarnings = FALSE)

# ---- step1: target output schema ----
# Columns the final table must contain.
TARGET_COLS <- c(
  "Name", "State", "TotalPoints", "PreRating", "AvgOppPreRating"
)

# Validate and coerce a result table.
#
# df: a data frame that should contain every column named in TARGET_COLS.
#
# Stops if `df` is not a data frame or if any target column is missing (the
# error message lists the missing names). Otherwise coerces Name/State to
# character, TotalPoints to numeric, and PreRating/AvgOppPreRating to integer,
# and returns the coerced data frame invisibly.
assert_output_schema <- function(df) {
  stopifnot(is.data.frame(df))

  absent <- TARGET_COLS[!TARGET_COLS %in% names(df)]
  if (length(absent) > 0) {
    stop("Missing columns: ", paste(absent, collapse = ", "))
  }

  coerced <- dplyr::mutate(
    df,
    Name            = as.character(Name),
    State           = as.character(State),
    TotalPoints     = as.numeric(TotalPoints),
    PreRating       = as.integer(PreRating),
    AvgOppPreRating = as.integer(AvgOppPreRating)
  )
  invisible(coerced)
}

# Reference values for the "Gary Hua" row. The acceptance tests at the end of
# the script compare against a list with these same values (EXPECTED_FIRST).
expected_first_user <- list(Name="Gary Hua", State="ON", TotalPoints=6.0, PreRating=1794, AvgOppPreRating=1605)

# ---- step2: fixed input path + line loader ----
# Location of the raw tournament text file that is read below.
url <- "https://raw.githubusercontent.com/lher96/MSDS-Assignments/main/data.tournament.info.txt"

# Read a text file (local path or URL) and normalise its whitespace.
#
# path_or_url: anything readr::read_lines() accepts as a file argument.
#
# Returns a character vector with one element per input line, in which
#   * non-breaking spaces and tabs have become ordinary spaces,
#   * every "|" is surrounded by exactly one space on each side, and
#   * runs of whitespace are collapsed and leading/trailing whitespace removed.
read_and_normalize <- function(path_or_url) {
  raw <- readr::read_lines(path_or_url, progress = FALSE)

  no_nbsp <- stringr::str_replace_all(raw, stringr::fixed("\u00A0"), " ")
  no_tabs <- stringr::str_replace_all(no_nbsp, "\t", " ")
  padded  <- stringr::str_replace_all(no_tabs, "\\s*\\|\\s*", " | ")

  # The pipe padding above can leave doubled or trailing spaces; squish last.
  stringr::str_squish(padded)
}

lines_raw <- read_and_normalize(url)

# A "meta" line is recognised by its rating field ("R: <digits>"). Each one is
# paired with the nearest non-empty line above it, which is taken to be the
# player's first line.
is_meta_line <- function(s) stringr::str_detect(s, "R:\\s*\\d+")
idx_meta <- which(is_meta_line(lines_raw))
stopifnot(length(idx_meta) > 0)

blocks_tbl <- tibble::tibble(
  Line1 = purrr::map_chr(idx_meta, function(meta_pos) {
    # All lines strictly above the meta line, and which of them have content.
    above  <- lines_raw[seq_len(meta_pos - 1)]
    filled <- which(nzchar(above))
    if (length(filled) == 0) {
      stop("Meta line at top without a preceding player line.")
    }
    above[max(filled)]
  }),
  Line2 = lines_raw[idx_meta]
)

utils::head(blocks_tbl, 2)

## # A tibble: 2 × 2
##   Line1                                                                    Line2
##   <chr>                                                                    <chr>
## 1 1 | GARY HUA | 6.0 | W 39 | W 21 | W 18 | W 14 | W 7 | D 12 | D 4 |      ON |…
## 2 2 | DAKSHESH DARURI | 6.0 | W 63 | W 58 | L 4 | W 17 | W 16 | W 20 | W … MI |…

# ---- step3_parsers ----
# Pattern for the start of a player's first line, e.g.
#   "1 | GARY HUA |6.0 |W 39|W 21|... "
# Capture groups: (1) pair number, (2) player name, (3) total points.
LINE1_RE <- "^\\s*(\\d+)\\s*\\|\\s*([^|]+?)\\s*\\|\\s*(\\d+(?:\\.\\d)?)\\s*\\|"

# Extract pair number, name and total points from a player's first line.
#
# line1: the first line of a player record (character).
#
# Returns a tibble with columns PlayerNum (integer), Name (character,
# title-cased so that "GARY HUA" becomes "Gary Hua") and TotalPoints (numeric).
# If the line does not match LINE1_RE a warning is raised and a single all-NA
# row is returned instead.
parse_line1_meta <- function(line1) {
  hit <- stringr::str_match(line1, LINE1_RE)

  if (all(is.na(hit))) {
    warning("Could not parse Line1: ", line1)
    return(tibble::tibble(
      PlayerNum   = NA_integer_,
      Name        = NA_character_,
      TotalPoints = NA_real_
    ))
  }

  # Column 1 of `hit` is the full match; the capture groups start at column 2.
  pair_num   <- as.integer(hit[, 2])
  clean_name <- stringr::str_to_title(stringr::str_squish(hit[, 3]))
  points     <- as.numeric(hit[, 4])

  tibble::tibble(PlayerNum = pair_num, Name = clean_name, TotalPoints = points)
}

# Extract the opponents a player faced from the player's first line.
#
# line1:     first line of a player record, "|"-delimited; the first three
#            fields (pair number, name, total points) are skipped.
# owner_num: pair number of the player the line belongs to.
#
# Returns a tibble with one row per round field that contains an opponent
# number: PlayerNum (integer), Round (integer) and OppPlayerNum (integer).
# Round is the position of the field among the round fields, so a field with
# no opponent (e.g. a bye) leaves a gap in Round instead of shifting the
# numbers of the later rounds. A zero-row tibble with the same columns is
# returned when there are no round fields or none of them names an opponent.
parse_opponents_from_line1 <- function(line1, owner_num) {
  no_games <- tibble::tibble(
    PlayerNum    = integer(),
    Round        = integer(),
    OppPlayerNum = integer()
  )

  parts <- stringr::str_split(line1, "\\|")[[1]]
  if (length(parts) <= 3) {
    return(no_games)
  }

  rounds  <- parts[-(1:3)]  # fields after "Pair|Name|Total"
  opp_ids <- as.integer(stringr::str_extract(rounds, "\\d+"))

  # Positions of the fields that actually name an opponent; the position is
  # the round number, so it must be taken before the NA fields are dropped.
  played <- which(!is.na(opp_ids))
  if (length(played) == 0) {
    return(no_games)
  }

  tibble::tibble(
    PlayerNum    = as.integer(owner_num),
    Round        = played,
    OppPlayerNum = opp_ids[played]
  )
}

# Extract state and pre-tournament rating from a player's second line, e.g.
#   "ON | 15445895 / R: 1794 ->1817 |N:2 |W ..."
#
# line2: the second line of a player record (character).
#
# Returns a tibble with State (character) and PreRating (integer). The state
# pattern only matches exactly two capital letters followed by "|", so no
# further length check is needed; when it does not match, State is
# NA_character_ (the column stays character rather than turning logical).
# PreRating takes the first run of digits after "R:", so a provisional rating
# such as "1641P17" yields 1641.
parse_line2_meta <- function(line2) {
  state <- stringr::str_match(line2, "^\\s*([A-Z]{2})\\s*\\|")[, 2]
  pre   <- stringr::str_match(line2, "R:\\s*(\\d+)")[, 2]
  tibble::tibble(
    State     = state,
    PreRating = as.integer(pre)
  )
}

# Parse one two-line player record.
#
# Line1, Line2: the first and second line of the record (character).
#
# Returns a list with two tibbles: `row`, the columns from parse_line1_meta()
# and parse_line2_meta() side by side, and `opp`, the result of
# parse_opponents_from_line1() for this player's pair number.
parse_block <- function(Line1, Line2) {
  player <- parse_line1_meta(Line1)
  list(
    row = dplyr::bind_cols(player, parse_line2_meta(Line2)),
    opp = parse_opponents_from_line1(Line1, owner_num = player$PlayerNum[1])
  )
}
# Parse every (Line1, Line2) pair; each element is list(row = ..., opp = ...).
parsed <- purrr::pmap(blocks_tbl, parse_block)

# Stack the per-player rows and fix the column order.
players_meta <- parsed %>%
  purrr::map("row") %>%
  dplyr::bind_rows() %>%
  select(PlayerNum, Name, State, TotalPoints, PreRating)

# Stack the per-player opponent tables into one long table.
opponents_long <- dplyr::bind_rows(purrr::map(parsed, "opp"))

# Quick sanity checks: both tables must be non-empty; then preview them.
stopifnot(nrow(players_meta) > 0, nrow(opponents_long) > 0)
slice(players_meta, 1)

## # A tibble: 1 × 5
##   PlayerNum Name     State TotalPoints PreRating
##       <int> <chr>    <chr>       <dbl>     <int>
## 1         1 Gary Hua ON              6      1794

slice(opponents_long, 1:6)

## # A tibble: 6 × 3
##   PlayerNum Round OppPlayerNum
##       <int> <int>        <int>
## 1         1     1           39
## 2         1     2           21
## 3         1     3           18
## 4         1     4           14
## 5         1     5            7
## 6         1     6           12

# ---- step4_join_opp_preratings ----
stopifnot(exists("players_meta"), exists("opponents_long"))

# Guard against duplicated pair numbers: keep the first row per PlayerNum.
players_meta <- distinct(players_meta, PlayerNum, .keep_all = TRUE)

# Look up the **pre** rating of every opponent.
# - Rows with a missing or non-positive OppPlayerNum, and rows where a player
#   is paired with themselves, are removed.
# - The join is a left join, so an opponent number with no matching player is
#   kept with OppPreRating = NA.
opp_with_ratings <- opponents_long %>%
  filter(
    !is.na(OppPlayerNum),
    OppPlayerNum > 0,
    OppPlayerNum != PlayerNum
  ) %>%
  left_join(
    select(players_meta, OppPlayerNum = PlayerNum, OppPreRating = PreRating),
    by = "OppPlayerNum"
  )

# Preview
print(utils::head(opp_with_ratings), row.names = FALSE)

## # A tibble: 6 × 4
##   PlayerNum Round OppPlayerNum OppPreRating
##       <int> <int>        <int>        <int>
## 1         1     1           39         1436
## 2         1     2           21         1563
## 3         1     3           18         1600
## 4         1     4           14         1610
## 5         1     5            7         1649
## 6         1     6           12         1663

# ---- step5_avg_opp ----
# Per player: mean of the opponents' pre-ratings, ignoring NAs, rounded to the
# nearest integer. A player whose opponent ratings are all NA gets NA.
avg_opp_pr <- opp_with_ratings %>%
  group_by(PlayerNum) %>%
  summarize(
    AvgOppPreRating = if (all(is.na(OppPreRating))) {
      NA_integer_
    } else {
      as.integer(round(mean(OppPreRating, na.rm = TRUE)))
    },
    .groups = "drop"
  )

utils::head(avg_opp_pr)

## # A tibble: 6 × 2
##   PlayerNum AvgOppPreRating
##       <int>           <int>
## 1         1            1605
## 2         2            1469
## 3         3            1564
## 4         4            1574
## 5         5            1501
## 6         6            1519

# ---- step6_assemble_output ----
# Attach each player's average opponent pre-rating and keep only the output
# columns. A player with no row in avg_opp_pr (no valid opponents) receives
# AvgOppPreRating = NA from the left join itself. `%||%` is deliberately not
# used: it substitutes only for NULL, never for NA, so it would do nothing.
final_players <- players_meta %>%
  left_join(avg_opp_pr, by = "PlayerNum") %>%
  # Preserve original listing order by PlayerNum
  arrange(PlayerNum) %>%
  select(Name, State, TotalPoints, PreRating, AvgOppPreRating)

# Quick preview
print(utils::head(final_players), row.names = FALSE)

## # A tibble: 6 × 5
##   Name                State TotalPoints PreRating AvgOppPreRating
##   <chr>               <chr>       <dbl>     <int>           <int>
## 1 Gary Hua            ON            6        1794            1605
## 2 Dakshesh Daruri     MI            6        1553            1469
## 3 Aditya Bajaj        MI            6        1384            1564
## 4 Patrick H Schilling MI            5.5      1716            1574
## 5 Hanshi Zuo          MI            5.5      1655            1501
## 6 Hansen Song         OH            5        1686            1519

# Enforce the Step 1 schema (column presence and types) on the final table.
final_players <- assert_output_schema(final_players)

# ---- step7_write_validate ----
# Write the table as CSV inside output_dir and report the resolved path.
output_file <- file.path(output_dir, "tournament_players.csv")
readr::write_csv(x = final_players, file = output_file)

cat("Wrote CSV to:", normalizePath(output_file, winslash = "/"), "\n")

## Wrote CSV to: /Users/luishernandez/Desktop/MSDS/MS 607/output/tournament_players.csv

# --- Acceptance tests ---
# Expected output values for Gary Hua (same values as in Step 1).
EXPECTED_FIRST <- list(
  Name = "Gary Hua", State = "ON",
  TotalPoints = 6.0, PreRating = 1794, AvgOppPreRating = 1605
)

# TRUE when the five output fields of a one-row table equal EXPECTED_FIRST.
# c() and unlist() both flatten their mixed inputs to character vectors, so
# the comparison is made on the text form of each value.
row_matches_expected <- function(row) {
  actual <- unname(c(
    row$Name, row$State, row$TotalPoints, row$PreRating, row$AvgOppPreRating
  ))
  wanted <- unname(unlist(EXPECTED_FIRST))
  isTRUE(all.equal(actual, wanted))
}

# A) The first output row; matches the example only if the input lists Gary
#    first.
first_row <- final_players[1, , drop = FALSE]

# B) Independent lookup of Gary Hua by exact (case-sensitive) name.
gary_row <- final_players %>% filter(Name == "Gary Hua") %>% slice(1)

# Validate A: a mismatch is only a warning, since some files reorder players.
if (nrow(first_row) == 1) {
  same_first <- row_matches_expected(first_row)
  if (same_first) {
    message("First-row check passed ")
  } else {
    warning("First row does not match the Gary Hua example")
  }
}

# Validate B: a mismatch on the named row is a hard failure; a missing row is
# a warning.
if (nrow(gary_row) != 1) {
  warning("Could not find 'Gary Hua' row — name/spelling/state")
} else {
  same_gary <- row_matches_expected(gary_row)
  if (same_gary) {
    message("Gary Hua acceptance test pass")
  } else {
    stop("Gary Hua acceptance test FAILED")
  }
}

Project1.607.rmd

2025-09-15