Proj1

Author

ZIHAO YU

1.How will I tackle the problem?

I will first split the variables, extracting name, region, total score, and pre-tournament rating. I will process B/H/U matches separately and calculate the average afterward. The output will consist of 5 columns and be exported as a CSV file.

2.What data challenges do I anticipate?

The main challenges are the presence of mixed B/H/U records within the match data.

source: “https://raw.githubusercontent.com/XxY-coder/data607-Proj.Y/refs/heads/main/tournamentinfo.txt”

3.Data cleanup

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(stringr)

chess.raw <- readLines("https://raw.githubusercontent.com/XxY-coder/data607-Proj.Y/refs/heads/main/tournamentinfo.txt")

Warning in
readLines("https://raw.githubusercontent.com/XxY-coder/data607-Proj.Y/refs/heads/main/tournamentinfo.txt"):
incomplete final line found on
'https://raw.githubusercontent.com/XxY-coder/data607-Proj.Y/refs/heads/main/tournamentinfo.txt'

head(chess.raw, 20)

 [1] "-----------------------------------------------------------------------------------------" 
 [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
 [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
 [4] "-----------------------------------------------------------------------------------------" 
 [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
 [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
 [7] "-----------------------------------------------------------------------------------------" 
 [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
 [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
[10] "-----------------------------------------------------------------------------------------" 
[11] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|" 
[12] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
[13] "-----------------------------------------------------------------------------------------" 
[14] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|" 
[15] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |" 
[16] "-----------------------------------------------------------------------------------------" 
[17] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|" 
[18] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
[19] "-----------------------------------------------------------------------------------------" 
[20] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"

chess_df <- chess.raw[-(1:4)]
chess_df <- chess_df[chess_df != ""]

chess_df <- chess_df[!str_detect(chess_df, "^\\s*-+\\s*$")]

head(chess_df, 20)

 [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
 [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
 [3] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
 [4] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
 [5] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
 [6] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
 [7] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
 [8] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
 [9] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"
[10] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
[11] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"
[12] "   OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |"
[13] "    7 | GARY DEE SWATHELL               |5.0  |W  57|W  46|W  13|W  11|L   1|W   9|L   2|"
[14] "   MI | 11146376 / R: 1649   ->1673     |N:3  |W    |B    |W    |B    |B    |W    |W    |"
[15] "    8 | EZEKIEL HOUGHTON                |5.0  |W   3|W  32|L  14|L   9|W  47|W  28|W  19|"
[16] "   MI | 15142253 / R: 1641P17->1657P24  |N:3  |B    |W    |B    |W    |B    |W    |W    |"
[17] "    9 | STEFANO LEE                     |5.0  |W  25|L  18|W  59|W   8|W  26|L   7|W  20|"
[18] "   ON | 14954524 / R: 1411   ->1564     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
[19] "   10 | ANVIT RAO                       |5.0  |D  16|L  19|W  55|W  31|D   6|W  25|W  18|"
[20] "   MI | 14150362 / R: 1365   ->1544     |N:3  |W    |W    |B    |B    |W    |B    |W    |"

line_n <- which(str_detect(chess_df, "^\\s*\\d+\\s*\\|"))
line_n

 [1]   1   3   5   7   9  11  13  15  17  19  21  23  25  27  29  31  33  35  37
[20]  39  41  43  45  47  49  51  53  55  57  59  61  63  65  67  69  71  73  75
[39]  77  79  81  83  85  87  89  91  93  95  97  99 101 103 105 107 109 111 113
[58] 115 117 119 121 123 125 127

line_odd <- chess_df[line_n]      
line_even <- chess_df[line_n + 1]   

head(line_odd,5)

[1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
[2] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
[3] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
[4] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
[5] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"

head(line_even,5)

[1] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
[2] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
[3] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
[4] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
[5] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"

4.Break it down into tables

parse_line1 <-
  function(x) {
    parts <-
      str_split(x, "\\|", simplify = TRUE) |> 
      as.character()
    parts <- str_trim(parts)
    parts <- parts[parts != ""]
    
    tibble(
      player_num   = as.integer(parts[1]),
      name         = parts[2],
      total_points = as.numeric(parts[3]),
      
      round1       = parts[4],
      round2       = parts[5],
      round3       = parts[6],
      round4       = parts[7],
      round5       = parts[8],
      round6       = parts[9],
      round7       = parts[10]
  )
}

line1_df <- 
  map_dfr(line_odd, parse_line1)

line1_df |> 
  select(player_num, name, total_points, round1:round7) |>
  head()

# A tibble: 6 × 10
  player_num name  total_points round1 round2 round3 round4 round5 round6 round7
       <int> <chr>        <dbl> <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr> 
1          1 GARY…          6   W  39  W  21  W  18  W  14  W   7  D  12  D   4 
2          2 DAKS…          6   W  63  W  58  L   4  W  17  W  16  W  20  W   7 
3          3 ADIT…          6   L   8  W  61  W  25  W  21  W  11  W  13  W  12 
4          4 PATR…          5.5 W  23  D  28  W   2  W  26  D   5  W  19  D   1 
5          5 HANS…          5.5 W  45  W  37  D  12  D  13  D   4  W  14  W  17 
6          6 HANS…          5   W  34  D  29  L  11  W  35  D  10  W  27  W  21

Get the state and pre-rating.

parse_line2 <- 
  function(x) {
    parts <- 
      str_split(x, "\\|", simplify = TRUE) |>
      as.character()
    parts <- str_trim(parts)
    parts <- parts[parts != ""]
    state <- parts[1]
  
    pre <- str_match(x, "R:\\s*(\\d{3,4})")[, 2]
  
    if (is.na(pre)) pre <- 
      str_match(x, "(\\d{3,4})\\s*->")[, 2]

    tibble(
      state = state,
      pre_rating = as.integer(pre)
      )
}

line2_df <- 
  map_dfr(line_even, parse_line2)
line2_df |> 
  head()

# A tibble: 6 × 2
  state pre_rating
  <chr>      <int>
1 ON          1794
2 MI          1553
3 MI          1384
4 MI          1716
5 MI          1655
6 OH          1686

nrow(line1_df)

[1] 64

nrow(line2_df)

[1] 64

players <- bind_cols(line1_df, line2_df)

players |>
  select(player_num, name, state, total_points, pre_rating) |>
  head()

# A tibble: 6 × 5
  player_num name                state total_points pre_rating
       <int> <chr>               <chr>        <dbl>      <int>
1          1 GARY HUA            ON             6         1794
2          2 DAKSHESH DARURI     MI             6         1553
3          3 ADITYA BAJAJ        MI             6         1384
4          4 PATRICK H SCHILLING MI             5.5       1716
5          5 HANSHI ZUO          MI             5.5       1655
6          6 HANSEN SONG         OH             5         1686

5. Dealing with the pre-rating.

select the rounds and match the rating, then to find out the average rating.

opp_info <-
  players |>
  select(player_num, starts_with("round")) |>
  pivot_longer(
    cols = starts_with("round"),
    names_to = "round",
    values_to = "cell"
  ) |>
  mutate(
    cell = str_trim(cell),
    opp_num = as.integer(str_extract(cell, "\\d+"))
  )

opp_info |>
  filter(player_num == 1)

# A tibble: 7 × 4
  player_num round  cell  opp_num
       <int> <chr>  <chr>   <int>
1          1 round1 W  39      39
2          1 round2 W  21      21
3          1 round3 W  18      18
4          1 round4 W  14      14
5          1 round5 W   7       7
6          1 round6 D  12      12
7          1 round7 D   4       4

find_rating <- 
  players |>
  select(player_num, opp_pre = pre_rating)

avg_rating <- 
  opp_info |>
  left_join(find_rating, by = c("opp_num" = "player_num")) |>
  group_by(player_num) |>
  summarise(
    avg_opp_pre = mean(opp_pre, na.rm = TRUE),
    .groups = "drop"
  ) |>
  mutate(
    avg_opp_pre = ifelse(is.nan(avg_opp_pre), NA, avg_opp_pre)
  )

avg_rating |>
  head(10)

# A tibble: 10 × 2
   player_num avg_opp_pre
        <int>       <dbl>
 1          1       1605.
 2          2       1469.
 3          3       1564.
 4          4       1574.
 5          5       1501.
 6          6       1519.
 7          7       1372.
 8          8       1468.
 9          9       1523.
10         10       1554.

6.Complete the table and export as csv.

final_table <- 
  players |>
  left_join(avg_rating, by = "player_num") |>
  transmute(
    Name = name,
    State = state,
    TotalPoints = total_points,
    PreRating = pre_rating,
    AvgPreRating = as.integer(round(avg_opp_pre, 0))
  )

final_table |>
  head(10)

# A tibble: 10 × 5
   Name                State TotalPoints PreRating AvgPreRating
   <chr>               <chr>       <dbl>     <int>        <int>
 1 GARY HUA            ON            6        1794         1605
 2 DAKSHESH DARURI     MI            6        1553         1469
 3 ADITYA BAJAJ        MI            6        1384         1564
 4 PATRICK H SCHILLING MI            5.5      1716         1574
 5 HANSHI ZUO          MI            5.5      1655         1501
 6 HANSEN SONG         OH            5        1686         1519
 7 GARY DEE SWATHELL   MI            5        1649         1372
 8 EZEKIEL HOUGHTON    MI            5        1641         1468
 9 STEFANO LEE         ON            5        1411         1523
10 ANVIT RAO           MI            5        1365         1554

write.csv(final_table, "chess_players.csv", row.names = FALSE)