Loading packages and reading in the data

library('tidyverse')

link <- 'https://raw.githubusercontent.com/pmahdi/cuny-data-607/main/proj-1-chess.txt'
chess_og <- read_delim(
  file = link, 
  delim = '|', 
  col_names = FALSE, 
  na = c('-----------------------------------------------------------------------------------------', ''), 
  trim_ws = TRUE,
  skip = 1, 
  skip_empty_rows = TRUE)

glimpse(chess_og)
## Rows: 194
## Columns: 11
## $ X1  <chr> "Pair", "Num", NA, "1", "ON", NA, "2", "MI", NA, "3", "MI", NA, "4…
## $ X2  <chr> "Player Name", "USCF ID / Rtg (Pre->Post)", NA, "GARY HUA", "15445…
## $ X3  <chr> "Total", "Pts", NA, "6.0", "N:2", NA, "6.0", "N:2", NA, "6.0", "N:…
## $ X4  <chr> "Round", "1", NA, "W  39", "W", NA, "W  63", "B", NA, "L   8", "W"…
## $ X5  <chr> "Round", "2", NA, "W  21", "B", NA, "W  58", "W", NA, "W  61", "B"…
## $ X6  <chr> "Round", "3", NA, "W  18", "W", NA, "L   4", "B", NA, "W  25", "W"…
## $ X7  <chr> "Round", "4", NA, "W  14", "B", NA, "W  17", "W", NA, "W  21", "B"…
## $ X8  <chr> "Round", "5", NA, "W   7", "W", NA, "W  16", "B", NA, "W  11", "W"…
## $ X9  <chr> "Round", "6", NA, "D  12", "B", NA, "W  20", "W", NA, "W  13", "B"…
## $ X10 <chr> "Round", "7", NA, "D   4", "W", NA, "W   7", "B", NA, "W  12", "W"…
## $ X11 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

Transforming the data

It looks like column X11 consists of only NA values. This is probably because the input file has a ‘|’ at the end of each line, which also happens to be the delimiter character. So, read_delim() mistakenly adds on an extra column.

# Checking there are no values in X11
sum(!is.na(chess_og$X11))
## [1] 0
# Dropping the column X11
chess_mod <- chess_og[-11]
glimpse(chess_mod)
## Rows: 194
## Columns: 10
## $ X1  <chr> "Pair", "Num", NA, "1", "ON", NA, "2", "MI", NA, "3", "MI", NA, "4…
## $ X2  <chr> "Player Name", "USCF ID / Rtg (Pre->Post)", NA, "GARY HUA", "15445…
## $ X3  <chr> "Total", "Pts", NA, "6.0", "N:2", NA, "6.0", "N:2", NA, "6.0", "N:…
## $ X4  <chr> "Round", "1", NA, "W  39", "W", NA, "W  63", "B", NA, "L   8", "W"…
## $ X5  <chr> "Round", "2", NA, "W  21", "B", NA, "W  58", "W", NA, "W  61", "B"…
## $ X6  <chr> "Round", "3", NA, "W  18", "W", NA, "L   4", "B", NA, "W  25", "W"…
## $ X7  <chr> "Round", "4", NA, "W  14", "B", NA, "W  17", "W", NA, "W  21", "B"…
## $ X8  <chr> "Round", "5", NA, "W   7", "W", NA, "W  16", "B", NA, "W  11", "W"…
## $ X9  <chr> "Round", "6", NA, "D  12", "B", NA, "W  20", "W", NA, "W  13", "B"…
## $ X10 <chr> "Round", "7", NA, "D   4", "W", NA, "W   7", "B", NA, "W  12", "W"…

It also seems like every 3rd row is full of NA values. So, let’s drop all rows that only have NA values.

all_na_index <- logical(length = nrow(chess_mod)) 

for (i in 1:nrow(chess_mod)) {
  all_na_index[i] <- all(is.na(chess_mod[i, ]))
}

chess_mod <- chess_mod[!all_na_index, ]

Next, new column names are added and the first 2 rows are dropped.

colnames(chess_mod) <- c('id', 'name', 'points', 'round_1', 'round_2', 'round_3', 'round_4', 'round_5', 'round_6', 'round_7')

chess_mod <- chess_mod[c(-1, -2), ]

The next step should be to add 2 extra columns to the data frame for populating with useful information from each player’s 2nd row (state name and pre-tournament rating). After information from the second row of each player is added to the 2 columns, each player’s second row (so, all the even rows) will be dropped.

chess_mod <- cbind(chess_mod, tibble(state = rep('', nrow(chess_mod)), pre_rating = rep('', nrow(chess_mod))))

for (i in seq(2, nrow(chess_mod), 2)) {
  chess_mod[i - 1, 'state'] <- chess_mod[i, 'id']
  chess_mod[i - 1, 'pre_rating'] <- chess_mod[i, 'name']
}

chess_mod <- chess_mod[-seq(2, nrow(chess_mod), 2), ]

# turns out R automatically assigns rownames corresponding to the previous indices
head(rownames(chess_mod))
## [1] "1"  "3"  "5"  "7"  "9"  "11"
rownames(chess_mod) <- NULL
head(rownames(chess_mod))  # much better, less confusing row indices
## [1] "1" "2" "3" "4" "5" "6"

At this stage, the pre_rating column needs to be cleaned so that it contains only the pre-tournament rating of each player. For this, regular expressions will be used alongside the stringr package from tidyverse. The plan is to extract the correct information from each row using pattern matching and save that to a vector. Then, if the result looks good, that vector, after coercion to numeric, will update the information in the pre_rating column.

extracted <- str_extract(string = chess_mod$pre_rating, pattern = ' \\d{3,4}')

# Checking that the extraction went alright
length(extracted) == nrow(chess_mod); sum(is.na(extracted))
## [1] TRUE
## [1] 0
head(extracted); tail(extracted)
## [1] " 1794" " 1553" " 1384" " 1716" " 1655" " 1686"
## [1] " 853"  " 967"  " 955"  " 1530" " 1175" " 1163"
# Need to remove the whitespace in front of each rating
extracted <- str_trim(string = extracted, side = 'both')

# Converting to numeric
extracted <- as.numeric(extracted)

# Populating chess_mod$pre_rating with extracted
chess_mod$pre_rating <- extracted

Almost all of the required information has been isolated in individual columns. Only average pre-tournament opponent rating is left. To get that value, the ids of each player’s opponents have to be isolated.

opp_id <- apply(X = chess_mod[4:10], MARGIN = 2, FUN = function(x) {str_extract(string = x, pattern = '\\d{1,2}')})

colnames(opp_id) <- c('opp_id_r1', 'opp_id_r2', 'opp_id_r3', 'opp_id_r4', 'opp_id_r5', 'opp_id_r6', 'opp_id_r7')

chess_mod <- cbind(chess_mod, opp_id)

After isolating the ids, a new numeric column named avg_opp_pre_rating is added to chess_mod. It’s filled with 0s. The pre-tournament ratings of each player’s opponents are fetched and those values are put through mean() to get the average opponent rating. Then, this output updates the avg_opp_pre_rating column for the corresponding player.

# Creating the column for average opponent rating and filling it with 0
chess_mod$avg_opp_pre_rating <- rep(0, nrow(chess_mod))

# Using a loop to calculate the avg_opp_pre_rating value for each player
for (i in 1:nrow(chess_mod)) {
  chess_mod[i, 'avg_opp_pre_rating'] <- 
    round(
      mean(
        chess_mod[
          as.character(chess_mod[i, 13:19]), 
          'pre_rating'], 
        na.rm = TRUE
        )
      )
}

Finally, all the required information has been isolated in individual columns. Those columns will be subsetted and saved to a new data frame named chess_output.

chess_output <- chess_mod[, c('name', 'state', 'points', 'pre_rating', 'avg_opp_pre_rating')]

colnames(chess_output) <- c('name', 'state', 'total_points', 'player_rating', 'avg_opp_rating')

# Converting total_points column to a numeric
chess_output$total_points <- as.numeric(chess_output$total_points)
glimpse(chess_output)
## Rows: 64
## Columns: 5
## $ name           <chr> "GARY HUA", "DAKSHESH DARURI", "ADITYA BAJAJ", "PATRICK…
## $ state          <chr> "ON", "MI", "MI", "MI", "MI", "OH", "MI", "MI", "ON", "…
## $ total_points   <dbl> 6.0, 6.0, 6.0, 5.5, 5.5, 5.0, 5.0, 5.0, 5.0, 5.0, 4.5, …
## $ player_rating  <dbl> 1794, 1553, 1384, 1716, 1655, 1686, 1649, 1641, 1411, 1…
## $ avg_opp_rating <dbl> 1605, 1469, 1564, 1574, 1501, 1519, 1372, 1468, 1523, 1…

The final task is exporting chess_output to a csv file. The readr package’s write_csv function can be used.

write_csv(
  x = chess_output, 
  file = './chess_output.csv', 
  na = 'NA', 
  append = FALSE, 
  col_names = TRUE, 
  quote = 'needed', 
  escape = 'double', 
  eol = '\n'
)