Initially, I attempted to use read.table and extract the necessary data from each column, but the resulting columns were far too messy to continue this way.

# First attempt: let read.table() split the report into fields. fill = TRUE
# adds blank fields to rows that have fewer fields than the others, so the
# ragged lines still load.
Chess_Tournament <- read.table(
  "/Users/Michele/Desktop/Chess_Tournament.txt",
  header = FALSE,
  fill = TRUE
)

# Keep V1 and V3 through V13, selecting the columns by their "V<n>" names.
columns <- c(1, 3:13)
myvars <- paste0("V", columns)
less_Chess_Tournament <- Chess_Tournament[myvars]
head(less_Chess_Tournament)
##                                                                                          V1
## 1 -----------------------------------------------------------------------------------------
## 2                                                                                      Pair
## 3                                                                                       Num
## 4 -----------------------------------------------------------------------------------------
## 5                                                                                         1
## 6                                                                                        ON
##         V3   V4                                                V5   V6
## 1                                                                     
## 2   Player Name |Total|Round|Round|Round|Round|Round|Round|Round|     
## 3     USCF   ID                                                 /  Rtg
## 4                                                                     
## 5     GARY  HUA                                              |6.0   |W
## 6 15445895    /                                                R: 1794
##            V7   V8   V9  V10 V11  V12 V13
## 1                                        
## 2                                        
## 3 (Pre->Post)    |  Pts    |   1    |   2
## 4                                        
## 5        39|W 21|W 18|W 14|W 7|D 12|D  4|
## 6      ->1817 |N:2   |W   |B  |W   |B  |W

I decided to look into alternative methods for importing the data and found readLines, which gave a much cleaner result.

# Read the report as raw text, one character string per line of the file, so
# each line keeps its original "|"-delimited layout.
Chess_Tournament_readlines <- readLines("/Users/Michele/Desktop/Chess_Tournament.txt")
## Warning in readLines("/Users/Michele/Desktop/Chess_Tournament.txt"):
## incomplete final line found on '/Users/Michele/Desktop/
## Chess_Tournament.txt'
# The warning above reports that the last line of the file is incomplete.
# Preview the first ten lines to see the repeating layout of the report.
head(Chess_Tournament_readlines, 10)
##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------"

From this, I noticed a pattern in the data: every third row starting with row 5 holds a player's name and round results (including the opponents faced), while every third row starting with row 6 holds the player's state and ratings.

# The report starts with a four-line header, followed by three lines per
# player: a name/results line, a state/rating line and a dashed separator.
# Name/results lines are therefore lines 5, 8, 11, ... The sequence stops at
# the last line actually read instead of a hard-coded 196, so it stays correct
# for a report with a different number of players.
rows_with_names_rounds <- seq(5, length(Chess_Tournament_readlines), by = 3)
names_rounds <- Chess_Tournament_readlines[rows_with_names_rounds]
head(names_rounds)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [3] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [4] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
## [5] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"
## [6] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"
# State/rating lines sit directly below each name/results line, i.e. lines
# 6, 9, 12, ... The sequence stops at the last line actually read instead of a
# hard-coded 196, so it stays correct for a report with a different number of
# players.
rows_with_state_ratings <- seq(6, length(Chess_Tournament_readlines), by = 3)
state_rating <- Chess_Tournament_readlines[rows_with_state_ratings]
head(state_rating)
## [1] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [2] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [3] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [4] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
## [5] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "   OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |"

I used the stringr package to extract names, states, pre-ratings, post-ratings, and each round's competitors. All of this data was placed into a data frame called Chess_Data.

library(stringr)
# The player name is the second "|"-delimited field of each name/results line.
# Taking the whole field and trimming its padding keeps names intact whatever
# their number of words or punctuation (e.g. a hyphenated surname), leaves no
# trailing space, and yields exactly one name per line so the vector stays
# aligned with the other columns.
names <- str_trim(str_split_fixed(names_rounds, fixed("|"), 3)[, 2])
head(names)
## [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
## [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"
# The state code is the first "|"-delimited field of each state/rating line.
# Trimming the padding stores "ON" rather than " ON ", so no stray spaces end
# up in the data frame or the exported CSV, and each line yields exactly one
# value.
states <- str_trim(str_split_fixed(state_rating, fixed("|"), 2)[, 1])
head(states)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
# Capture the digits that follow "R:" on each state/rating line. str_match()
# returns one row per input line (NA when the pattern is absent), so a line
# that fails to match cannot shift the ratings of the players after it.
# Column 2 of the result is the captured group, i.e. the digits only.
pre_rating <- str_match(state_rating, "R:\\s*([[:digit:]]+)")[, 2]
head(pre_rating)
## [1] "1794" "1553" "1384" "1716" "1655" "1686"
# Capture the digits that follow "->" on each state/rating line. str_match()
# returns one row per input line (NA when the pattern is absent), so a line
# that fails to match cannot shift the ratings of the players after it.
# Column 2 of the result is the captured group, i.e. the digits only.
post_rating <- str_match(state_rating, "->\\s*([[:digit:]]+)")[, 2]
head(post_rating)
## [1] "1817" "1663" "1640" "1744" "1690" "1687"
# Split every name/results line on "|". Fields 1-3 hold the pair number, the
# player name and the total points; fields 4-10 hold one entry per round, such
# as "W  39" (a result letter followed by the opponent's pair number).
# `rounds` is a character matrix with one row per player and one column per
# round.
rounds <- str_split_fixed(names_rounds, fixed("|"), 11)[, 4:10, drop = FALSE]

# Opponent pair number (as a string of digits) for one round, one value per
# player. A round entry without any digits yields NA, so every round vector
# has the same length as the list of players.
extract_opponent <- function(round_number) {
  str_extract(rounds[, round_number], "[[:digit:]]+")
}

round_1 <- extract_opponent(1)
round_2 <- extract_opponent(2)
round_3 <- extract_opponent(3)
round_4 <- extract_opponent(4)
round_5 <- extract_opponent(5)
round_6 <- extract_opponent(6)
round_7 <- extract_opponent(7)

head(round_1)
## [1] "39" "63" "8"  "23" "45" "34"
# One id per player in report order, sized from the parsed names instead of a
# hard-coded 64 so it follows the number of players actually read.
id <- seq_along(names)

# Assemble one row per player. data.frame() turns the quoted labels into
# syntactic column names, so "Pre Rating" becomes Pre.Rating, "Round 1"
# becomes Round.1, and so on. Ratings and opponent numbers are converted from
# text to numbers; blank or missing opponent entries become NA.
Chess_Data <- data.frame(
  id,
  names,
  states,
  "Pre Rating" = as.numeric(pre_rating),
  "Post Rating" = as.numeric(post_rating),
  "Round 1" = as.numeric(round_1),
  "Round 2" = as.numeric(round_2),
  "Round 3" = as.numeric(round_3),
  "Round 4" = as.numeric(round_4),
  "Round 5" = as.numeric(round_5),
  "Round 6" = as.numeric(round_6),
  "Round 7" = as.numeric(round_7)
)
head(Chess_Data)
##   id               names states Pre.Rating Post.Rating Round.1 Round.2
## 1  1            GARY HUA    ON        1794        1817      39      21
## 2  2     DAKSHESH DARURI    MI        1553        1663      63      58
## 3  3        ADITYA BAJAJ    MI        1384        1640       8      61
## 4  4 PATRICK H SCHILLING    MI        1716        1744      23      28
## 5  5          HANSHI ZUO    MI        1655        1690      45      37
## 6  6         HANSEN SONG    OH        1686        1687      34      29
##   Round.3 Round.4 Round.5 Round.6 Round.7
## 1      18      14       7      12       4
## 2       4      17      16      20       7
## 3      25      21      11      13      12
## 4       2      26       5      19       1
## 5      12      13       4      14      17
## 6      11      35      10      27      21

Finally, I looked up the Pre.Rating of each person’s competitors and computed their average. This was added to Chess_Data as a new column and carried over into the data frame Chess_Data_Final.

# Average pre-tournament rating of each player's opponents.
# Row i of the Round.* columns holds the ids of the players that player i
# faced. The ids are read from the player's own row and matched against
# Chess_Data$id to fetch each opponent's Pre.Rating, rather than searching
# every round column for rows that mention the player. This is done for all
# players at once, so no result vector is grown inside a loop.
round_columns <- paste0("Round.", 1:7)
opponent_ids <- as.matrix(Chess_Data[round_columns])

# match() returns NA for rounds without an opponent, which gives an NA rating.
# Indexing flattens the matrix column by column; matrix() restores the
# one-row-per-player shape.
opponent_pre_ratings <- matrix(
  Chess_Data$Pre.Rating[match(opponent_ids, Chess_Data$id)],
  nrow = nrow(opponent_ids)
)

# Rounds without an opponent (NA) are left out of the average.
Average_Opponent_Pre_Rating <- rowMeans(opponent_pre_ratings, na.rm = TRUE)

Chess_Data["Average Opponent Pre Rating"] <- Average_Opponent_Pre_Rating

# Keep only the columns needed for the exported report.
Chess_Data_Final <- Chess_Data[
  c("id", "names", "states", "Post.Rating", "Average Opponent Pre Rating")
]
head(Chess_Data_Final)
##   id               names states Post.Rating Average Opponent Pre Rating
## 1  1            GARY HUA    ON         1817                    1605.286
## 2  2     DAKSHESH DARURI    MI         1663                    1469.286
## 3  3        ADITYA BAJAJ    MI         1640                    1563.571
## 4  4 PATRICK H SCHILLING    MI         1744                    1573.571
## 5  5          HANSHI ZUO    MI         1690                    1500.857
## 6  6         HANSEN SONG    OH         1687                    1518.714

Lastly, the final data frame is exported to a CSV file.

# Save the final table as a CSV file; the relative path places it in the
# current working directory.
write.csv(x = Chess_Data_Final, file = "Chess Tournament.csv")