As before, I imported the TXT file from GitHub and skipped the first four lines so that reading begins where the data actually starts. This time, I’ve also replaced the arrows (`->`) in the data with `>>`.

knitr::opts_chunk$set(echo = TRUE)
#install.packages("readr")
library(readr)

# Pull the raw tournament report straight from GitHub. The first four lines
# are skipped so that reading starts at the first player record.
url <- "https://raw.githubusercontent.com/tiffhugh/Data-Acquisition-Mangement-/refs/heads/main/tournamentinfo.txt"
raw_data <- readr::read_lines(file = url, skip = 4)

# Preview the first few lines to confirm the skip landed on the data.
head(raw_data)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [5] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "-----------------------------------------------------------------------------------------"
# Swap every "->" for ">>". Neither string contains regex metacharacters, so
# the replacement is done as a literal (fixed) match.
raw_data <- gsub("->", ">>", raw_data, fixed = TRUE)

# Preview the cleaned lines.
head(raw_data)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   >>1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [5] "   MI | 14598900 / R: 1553   >>1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "-----------------------------------------------------------------------------------------"

When I first did the project, I split the strings into vectors containing all of the player information at once, but this time I used separate vectors. During class discussion, classmates showed how they extracted the players’ information in two batches and then merged them. The next chunk uses that extract-and-combine approach.

knitr::opts_chunk$set(echo = TRUE)
#install.packages("stringr")
#install.packages("dplyr")
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Format 1: records whose first non-blank character is a digit.
is_player_line <- str_detect(raw_data, "^\\s*\\d")
player_info <- raw_data[is_player_line]

# Format 2: records starting with two uppercase letters (e.g., state codes).
is_rating_line <- str_detect(raw_data, "^\\s*[A-Z]{2}")
player_info2 <- raw_data[is_rating_line]

head(player_info)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [3] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [4] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
## [5] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"
## [6] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"
head(player_info2)
## [1] "   ON | 15445895 / R: 1794   >>1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [2] "   MI | 14598900 / R: 1553   >>1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [3] "   MI | 14959604 / R: 1384   >>1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [4] "   MI | 12616049 / R: 1716   >>1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
## [5] "   MI | 14601533 / R: 1655   >>1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "   OH | 15055204 / R: 1686   >>1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |"
knitr::opts_chunk$set(echo = TRUE)

# Build the per-player table from the two fixed-width lines of each record.
# `player_info2` holds the state / USCF ID / rating lines and `player_info`
# holds the id / name / points / round lines. The two tables are combined by
# position, so both vectors must list the players in the same order.

# Characters 8-40 of the rating line: "<uscf id> / R: <pre>   >><post>".
rating_field <- substr(player_info2, 8, 40)

player_rating <- data.frame(
  state = str_trim(substr(player_info2, 1, 6), side = "both"),
  # The first run of digits in the field is the USCF ID.
  uscf_id = str_extract(rating_field, "\\d+"),
  # Accept any amount of padding after "R:" and after the arrow. A pattern
  # that insists on exactly one space after "R:" (or none after the arrow)
  # returns NA for any rating padded with extra spaces, and NA ratings are
  # later left out of the opponent averages, which skews them.
  pre_rating = as.numeric(str_match(rating_field, "R:\\s*(\\d+)")[, 2]),
  # Match either arrow form so this works with or without the "->" to ">>"
  # replacement having been applied.
  post_rating = as.numeric(
    str_match(rating_field, "(?:->|>>)\\s*(\\d+)")[, 2]
  ),
  stringsAsFactors = FALSE
)

# Fixed-width slices of the first line of each record; every slice is
# trimmed, and the id and total points are converted to numbers.
player_rounds <- data.frame(
  id = as.numeric(str_trim(substr(player_info, 1, 6))),
  name = str_trim(substr(player_info, 8, 40)),
  total_pts = as.numeric(str_trim(substr(player_info, 42, 46))),
  round1 = str_trim(substr(player_info, 48, 52)),
  round2 = str_trim(substr(player_info, 54, 58)),
  round3 = str_trim(substr(player_info, 60, 64)),
  round4 = str_trim(substr(player_info, 66, 70)),
  round5 = str_trim(substr(player_info, 72, 76)),
  round6 = str_trim(substr(player_info, 78, 82)),
  round7 = str_trim(substr(player_info, 84, 88)),
  stringsAsFactors = FALSE
)

# Side-by-side combine: row k of each table describes the same player.
chess_tournament <- cbind(player_rating, player_rounds)
head(chess_tournament)
##   state  uscf_id pre_rating post_rating id                name total_pts round1
## 1    ON 15445895       1794        1817  1            GARY HUA       6.0  W  39
## 2    MI 14598900       1553        1663  2     DAKSHESH DARURI       6.0  W  63
## 3    MI 14959604       1384        1640  3        ADITYA BAJAJ       6.0  L   8
## 4    MI 12616049       1716        1744  4 PATRICK H SCHILLING       5.5  W  23
## 5    MI 14601533       1655        1690  5          HANSHI ZUO       5.5  W  45
## 6    OH 15055204       1686        1687  6         HANSEN SONG       5.0  W  34
##   round2 round3 round4 round5 round6 round7
## 1  W  21  W  18  W  14  W   7  D  12  D   4
## 2  W  58  L   4  W  17  W  16  W  20  W   7
## 3  W  61  W  25  W  21  W  11  W  13  W  12
## 4  D  28  W   2  W  26  D   5  W  19  D   1
## 5  W  37  D  12  D  13  D   4  W  14  W  17
## 6  D  29  L  11  W  35  D  10  W  27  W  21

In my first attempt, I took a more manual and iterative approach, looping through each player’s data row by row. I used a helper function to extract the round results and concatenated new values for each attribute into separate vectors. In my second attempt, I streamlined the process by using vectorized functions like substr and str_trim to extract data based on specific character positions.

knitr::opts_chunk$set(echo = TRUE)

# Average pre-rating of the opponents each player faced, rounded to a whole
# number. Players with no usable opponent rating get NA.
round_cols <- paste0("round", 1:7)
AvgOppPreChessRating <- rep(NA_real_, nrow(chess_tournament))

for (i in seq_len(nrow(chess_tournament))) {
  # Each round cell looks like "W  39"; the digits are the opponent's id.
  # Cells without digits contribute no opponent.
  round_cells <- unlist(chess_tournament[i, round_cols], use.names = FALSE)
  opponent_ids <- as.numeric(unlist(str_extract_all(round_cells, "\\d+")))

  # Look opponents up by their id column instead of assuming that an id is
  # also the row number, so the result survives reordering or filtering.
  opponent_pre <- chess_tournament$pre_rating[
    match(opponent_ids, chess_tournament$id)
  ]

  # An opponent without a pre-rating is still left out of the average, but
  # say so: silently dropping it hides parsing problems upstream.
  missing_pre <- is.na(opponent_pre)
  if (any(missing_pre)) {
    warning(
      "Player ", chess_tournament$id[i],
      ": no pre-rating found for opponent id(s) ",
      paste(opponent_ids[missing_pre], collapse = ", "),
      "; excluded from the average.",
      call. = FALSE
    )
  }

  known_pre <- opponent_pre[!missing_pre]
  if (length(known_pre) > 0) {
    AvgOppPreChessRating[i] <- round(sum(known_pre) / length(known_pre), 0)
  }
}

chess_tournament$AvgOppPreChessRating <- AvgOppPreChessRating

# Player's Name, Player's State, Total Number of Points, Player's Pre-Rating,
# and Average Pre Chess Rating of Opponents
chess_rematch <- chess_tournament %>%
  select(name, state, total_pts, pre_rating, AvgOppPreChessRating)
head(chess_rematch)
##                  name state total_pts pre_rating AvgOppPreChessRating
## 1            GARY HUA    ON       6.0       1794                 1605
## 2     DAKSHESH DARURI    MI       6.0       1553                 1561
## 3        ADITYA BAJAJ    MI       6.0       1384                 1665
## 4 PATRICK H SCHILLING    MI       5.5       1716                 1574
## 5          HANSHI ZUO    MI       5.5       1655                 1588
## 6         HANSEN SONG    OH       5.0       1686                 1519
# Write the five report columns to CSV. Row names are turned off so the file
# does not gain an extra, unnamed index column in front of `name`.
write.csv(chess_rematch, file = "chess_rematch.csv", row.names = FALSE)

In my first attempt to calculate the average pre-rating of opponents, I manually looped through each round for every player, constructing round-by-round column names and extracting opponent IDs before matching them to their pre-ratings. This method required handling missing values with multiple checks. In the second attempt, I streamlined the process by using str_extract_all to automatically gather opponent IDs for all rounds in one step, eliminating the need for manual string construction and nested loops. However, I am still seeing some discrepancies. In my first attempt, I got the correct averages for Gary and Dakshesh, but Mike’s was incorrect. This time, Gary’s and Mike’s averages are correct and Dakshesh’s is wrong. I am still trying to figure out where I went astray, since this time I made sure to account for players who did not play against each other and for players who did not play in all rounds.