library(tidyverse)
# Read the raw tournament cross table from the text file
cross_table <- read.delim(file = "tournamentinfo.txt")
# Inspect the storage type of the loaded object
typeof(x = cross_table)
## [1] "list"
# Show the first 12 rows of the raw table
head(x = cross_table, n = 12)
## X.........................................................................................
## 1 Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
## 2 Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
## 3 -----------------------------------------------------------------------------------------
## 4 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 5 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## 6 -----------------------------------------------------------------------------------------
## 7 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## 8 MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## 9 -----------------------------------------------------------------------------------------
## 10 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|
## 11 MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |
## 12 -----------------------------------------------------------------------------------------
This function removes leading and/or trailing whitespace from a string. It is used in the main conversion function, to_data_frame.
#' Trim leading and trailing whitespace from every field of a split line.
#'
#' @param line_split A character vector of fields, or a list of such vectors
#'   (for example the value returned by `str_split()`).
#' @return An unnamed character vector holding one trimmed entry per input
#'   field, in the original order.
remove_ws <- function(line_split) {
  # Flatten first so list input and plain character input behave the same
  fields <- unlist(line_split, use.names = FALSE)
  # trimws() is vectorised; working positionally keeps repeated fields
  # (e.g. several identical colour cells) instead of merging them by value
  trimws(fields)
}
This function extracts the prerating score of the player from a string via regex and is used in the main conversion function, to_data_frame. The regular expression looks for a 3-4 digit number following "R:" (e.g. "R: 1393"). The following lines remove "R:" from the string, trim the whitespace, and convert to an integer.
#' Extract a player's pre-tournament rating from a raw field.
#'
#' @param raw_string Character vector containing text such as
#'   "15445895 / R: 1794   ->1817".
#' @return Integer vector the same length as `raw_string`; `NA` where no
#'   "R:" followed by a 3 to 4 digit number is found.
extract_score <- function(raw_string) {
  raw_string <- as.character(raw_string)
  # "R:", any amount of whitespace (including none), then the 3 to 4 digit
  # rating. Only digits may follow the whitespace, so the match cannot
  # swallow unrelated characters.
  rating_pattern <- "R:[[:space:]]*[0-9]{3,4}"
  hit <- regexpr(rating_pattern, raw_string)
  found <- !is.na(hit) & hit > 0L
  # regmatches() returns matched elements only, so fill an NA template
  clean_str <- rep(NA_character_, length(raw_string))
  clean_str[found] <- regmatches(raw_string, hit)
  # Remove "R:"
  clean_str <- sub("R:", "", clean_str, fixed = TRUE)
  # Remove leading and/or trailing whitespace
  clean_str <- trimws(clean_str)
  # Convert from string to integer
  as.integer(clean_str)
}
This function extracts the player number from a string via regex and is used in the main conversion function, to_data_frame. The regular expression looks for a 1-2 digit number. Note, the regular expression will need to be updated if player numbers exceed two digits.
#' Extract the opponent's player number from a round-result field.
#'
#' @param raw_string Character vector of fields such as "W  39".
#' @param max_digits Largest number of digits a player number may have.
#'   Defaults to 2; raise it when player numbers exceed two digits.
#' @return Character vector the same length as `raw_string` holding the first
#'   run of 1 to `max_digits` digits, or `NA` where the field has no digits.
extract_opponent <- function(raw_string, max_digits = 2) {
  stopifnot(is.numeric(max_digits), length(max_digits) == 1L, max_digits >= 1)
  raw_string <- as.character(raw_string)
  # Match a 1 to max_digits digit number
  digit_pattern <- sprintf("[0-9]{1,%d}", as.integer(max_digits))
  hit <- regexpr(digit_pattern, raw_string)
  found <- !is.na(hit) & hit > 0L
  # regmatches() returns matched elements only, so fill an NA template
  clean_str <- rep(NA_character_, length(raw_string))
  clean_str[found] <- regmatches(raw_string, hit)
  clean_str
}
This is the main data cleaning and conversion function, which at a high level creates a dataframe with four of the five attributes needed for this project, plus each player's opponent numbers. First, the function initiates a dataframe with dummy data that will be removed later. Second, the function loops through the raw cross table in 3-row increments, starting at row 4 (each player occupies two rows followed by a separator row). While in the loop, the player's name, state, points, prerating, and opponent numbers are extracted and cleaned, then added to the dataframe.
#' Convert the raw cross table into one row per player.
#'
#' @param cross_table Data frame whose first column holds the raw text lines
#'   of the tournament cross table.
#' @return A data frame with columns `name`, `state`, `points`, `prerating`
#'   and `opponents` (opponent player numbers joined by ", "), one row per
#'   player and sequential row names. Zero rows when no player record is found.
to_data_frame <- function(cross_table) {
  # Seed row with dummy data: it fixes the column names and types and is
  # removed again before returning
  seed_row <- tibble("name" = "temp", "state" = "te", "points" = 1,
                     "prerating" = 1, "opponents" = "")
  # Fields 4 to 10 of a player's first line hold the seven round results
  round_fields <- 4:10

  # Work on the text column as a plain character vector
  raw_lines <- as.character(cross_table[[1]])
  n_lines <- length(raw_lines)
  # Player records start on row 4 and repeat every 3 rows. Stopping at
  # n_lines - 1 guarantees the second line of every record exists, and the
  # guard avoids seq() failing on tables that are too short to hold a record.
  record_starts <- if (n_lines >= 5) {
    seq(4, n_lines - 1, by = 3)
  } else {
    integer(0)
  }

  # Parse the two lines of the player record starting at row i
  parse_player <- function(i) {
    # Split the two lines for each player using pipes as the separator
    line_split_one <- str_split(raw_lines[i], "\\|")
    line_split_two <- str_split(raw_lines[i + 1], "\\|")
    # Remove whitespace
    line_clean_one <- remove_ws(line_split_one)
    line_clean_two <- remove_ws(line_split_two)
    # One opponent number per round; rounds without an opponent become "NA"
    opponent_nums <- extract_opponent(line_split_one[[1]][round_fields])
    tibble("name" = str_to_title(line_clean_one[[2]], locale = "en"),
           "state" = line_clean_two[[1]],
           "points" = as.numeric(line_clean_one[[3]]),
           "prerating" = extract_score(line_clean_two[[2]]),
           "opponents" = paste(opponent_nums, collapse = ", "))
  }

  # Loop through the table in 3 row increments, then bind everything once
  # instead of growing the data frame row by row
  player_rows <- lapply(record_starts, parse_player)
  df_players <- bind_rows(seed_row, player_rows)

  # Set as a dataframe and remove the seed row; negative indexing also works
  # when no player was parsed
  df_players <- as.data.frame(df_players)
  df_players <- df_players[-1, , drop = FALSE]
  # Row names are later used as player numbers, so restart them at 1
  rownames(df_players) <- NULL
  df_players
}
# Convert the raw cross table into one row per player and preview the result
df_chess <- cross_table %>% to_data_frame()
head(x = df_chess, n = 12)
## name state points prerating opponents
## 1 Gary Hua ON 6.0 1794 39, 21, 18, 14, 7, 12, 4
## 2 Dakshesh Daruri MI 6.0 1553 63, 58, 4, 17, 16, 20, 7
## 3 Aditya Bajaj MI 6.0 1384 8, 61, 25, 21, 11, 13, 12
## 4 Patrick H Schilling MI 5.5 1716 23, 28, 2, 26, 5, 19, 1
## 5 Hanshi Zuo MI 5.5 1655 45, 37, 12, 13, 4, 14, 17
## 6 Hansen Song OH 5.0 1686 34, 29, 11, 35, 10, 27, 21
## 7 Gary Dee Swathell MI 5.0 1649 57, 46, 13, 11, 1, 9, 2
## 8 Ezekiel Houghton MI 5.0 1641 3, 32, 14, 9, 47, 28, 19
## 9 Stefano Lee ON 5.0 1411 25, 18, 59, 8, 26, 7, 20
## 10 Anvit Rao MI 5.0 1365 16, 19, 55, 31, 6, 25, 18
## 11 Cameron William Mc Leman MI 4.5 1712 38, 56, 6, 7, 3, 34, 26
## 12 Kenneth J Tack MI 4.5 1663 42, 33, 5, 38, NA, 1, 3
Add player_num column and create opp_pre (average opponent prerating score) column with temp data.
# Turn index into player_num column
df_chess <- cbind(player_num = rownames(df_chess), df_chess)
# Keep plain sequential row names; seq_len() is also valid for zero rows
rownames(df_chess) <- seq_len(nrow(df_chess))
# Append opp_pre as the last column, filled with a placeholder value of 0
df_chess$opp_pre <- rep(0, nrow(df_chess))
This function calculates the average pre-tournament chess rating of each player's opponents. While looping through the dataframe one row at a time, each opponents cell is converted into a list, which is used to filter for only the rows containing the current player's opponents. The opponents' ratings are summed, then divided by the number of games played (i.e. byes and forfeits, recorded as NAs, are not included). Note, R's base round function follows the IEC 60559 "round half to even" rule, so a value ending in exactly .5 is rounded to the nearest even integer (e.g. round(0.5) is 0 and round(2.5) is 2, while round(1.5) is 2); anything above .5 is rounded up.
#' Compute each player's average opponent pre-tournament rating.
#'
#' @param df_chess Data frame with columns `player_num`, `prerating` and
#'   `opponents` (opponent player numbers separated by ", "; rounds without
#'   an opponent appear as "NA").
#' @return `df_chess` with column `opp_pre` set to the rounded mean `prerating`
#'   of the listed opponents (`NaN` when a player has no opponents). The
#'   column is created if it does not exist yet.
cal_avg_pre_opp_rating <- function(df_chess) {
  stopifnot(all(c("player_num", "prerating", "opponents") %in% names(df_chess)))
  player_ids <- as.character(df_chess$player_num)
  # Split every opponents string by comma
  opponent_cells <- strsplit(as.character(df_chess$opponents), ",\\s")

  # Average pre-rating for one player's vector of opponent ids
  average_for <- function(opponent_ids) {
    opponent_ids <- trimws(opponent_ids)
    # Keep numeric ids only, so "NA" entries are skipped without triggering
    # a coercion warning
    played <- opponent_ids[grepl("^[0-9]+$", opponent_ids)]
    # Filter for only the rows belonging to this player's opponents
    is_opponent <- player_ids %in% as.character(as.integer(played))
    ratings <- df_chess$prerating[is_opponent]
    # Average rounded to the nearest full point
    round(sum(ratings) / length(ratings), digits = 0)
  }

  # Address the target column by name rather than by position
  df_chess$opp_pre <- vapply(opponent_cells, average_for, numeric(1),
                             USE.NAMES = FALSE)
  df_chess
}
# Fill in each player's average opponent pre-rating
df_final <- df_chess %>% cal_avg_pre_opp_rating()
# Drop the helper columns (player number and opponent list)
df_final <- df_final[, !(names(df_final) %in% c("player_num", "opponents")),
                     drop = FALSE]
head(x = df_final, n = 12)
## name state points prerating opp_pre
## 1 Gary Hua ON 6.0 1794 1605
## 2 Dakshesh Daruri MI 6.0 1553 1469
## 3 Aditya Bajaj MI 6.0 1384 1564
## 4 Patrick H Schilling MI 5.5 1716 1574
## 5 Hanshi Zuo MI 5.5 1655 1501
## 6 Hansen Song OH 5.0 1686 1519
## 7 Gary Dee Swathell MI 5.0 1649 1372
## 8 Ezekiel Houghton MI 5.0 1641 1468
## 9 Stefano Lee ON 5.0 1411 1523
## 10 Anvit Rao MI 5.0 1365 1554
## 11 Cameron William Mc Leman MI 4.5 1712 1468
## 12 Kenneth J Tack MI 4.5 1663 1506
# Save the final table as a CSV file, omitting the row-name column
write.csv(x = df_final, file = "chess_players_final.csv", row.names = FALSE)
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.