In this project, a text file with chess tournament results was provided. The objective was to create an R Markdown file that generates a .CSV file with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
For example:
Gary Hua, ON, 6.0, 1794, 1605
Load required packages.
library(RCurl)
library(tidyverse)
Read in the text file from github and take a look at the start.
# Download the raw tournament results from GitHub as one character string,
# then print a compact preview of it.
chess_url <- "https://raw.githubusercontent.com/klgriffen96/spring23_data607_proj1/main/chess_scores.txt"
x <- getURL(chess_url)
glimpse(x)
## chr "-----------------------------------------------------------------------------------------\r\n Pair | Player Nam"| __truncated__
Lines in text files created on Windows end with a carriage return followed by a line feed (\r\n), which can be seen in this file. Start by splitting on \r\n so each line is separated.
# Split x into one element per line.
# The carriage return is optional in the pattern so that a copy of the file
# saved with Unix ("\n") line endings is split the same way as the original
# Windows ("\r\n") version instead of staying one long string.
s <- str_split_fixed(x, "\r?\n", n = Inf)
# View first ten entries
s[1:10]
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
From the file, extract the row on which the player records start, the ID of the last player, and the number of rounds:
# Get start row and last PID
# A player record begins on a line whose first field is the pair number:
# leading whitespace followed by digits. Every other line (separators,
# headers, the state/rating line) yields NA here.
pair_nums <- as.integer(str_extract(as.vector(s), "^\\s+[0-9]+"))
player_rows <- which(!is.na(pair_nums))

# Fail loudly instead of leaving start_row / last_id undefined (or stale from
# an earlier run) when the input does not contain any player rows.
if (length(player_rows) == 0) {
  stop("No player rows found in the input text", call. = FALSE)
}

# Get the row that the actual games start on
start_row <- player_rows[1]
cat("Start row is: ", start_row)

# Highest PID in the file. For a file sorted by pair number this is the PID of
# the last player; taking the maximum keeps it a safe row count for matrices
# indexed by PID even if the rows are not sorted.
last_id <- max(pair_nums[player_rows])
## Start row is: 5
# Print the PID found above as a sanity check
cat("Last PID is: ",last_id)
## Last PID is: 64
# Get the number of rounds
# The column-header row ends with "... | <last round number> | ", so the
# number in the final cell of the first line matching that shape is the
# number of rounds. Player rows end directly on "|" and therefore do not match.
round_label <- str_match(as.vector(s), "\\s+([0-9]+)\\s+\\|\\s+$")[, 2]
header_row <- which(!is.na(round_label))

# Stop with a clear message rather than continuing with n_rounds undefined.
if (length(header_row) == 0) {
  stop("Could not find the round header row in the input text", call. = FALSE)
}

n_rounds <- as.integer(round_label[header_row[1]])
cat("Number of rounds: ", n_rounds)
## Number of rounds: 7
Form one dataframe that has the following information for each player: player ID, name, total points, state, and pre-rating.
Form a matrix that has the PID followed by the opponent IDs, with NAs for no opponent for that round.
# Make the dataframes
#
# p_info : one row per player (p_id, p_name, p_points, p_state, p_prerating)
# o_ids  : matrix with one row per PID; column 1 is the PID, columns 2.. hold
#          the opponent PID for each round (NA when no game was played)
p_info <- data.frame(
  p_id = integer(),
  p_name = character(),
  p_points = double(),
  p_state = character(),
  p_prerating = integer()
)
o_ids <- matrix(NA_integer_, nrow = last_id, ncol = n_rounds + 1)

txt <- as.vector(s)

# Locate player rows by their shape (pair number in the first field) instead
# of assuming a fixed stride of three lines up to the end of the file; a
# trailing blank line would otherwise be parsed as an extra, all-NA player.
# Each player row must be followed by its state/rating row.
player_rows <- which(str_detect(txt, "^\\s*[0-9]+\\s*\\|"))
player_rows <- player_rows[player_rows >= start_row & player_rows < length(txt)]

# Fields of a player row:
# PID | P Name | Total Points | X Opponent PID | X Opponent PID | etc...
# The round results therefore sit in fields 4 .. (3 + n_rounds).
round_cols <- seq_len(n_rounds) + 3

records <- vector("list", length(player_rows))
for (k in seq_along(player_rows)) {
  i <- player_rows[k]
  ss_1 <- str_split_fixed(txt[i], fixed("|"), n = Inf)
  # State | USCF ID / R: Player's Pre-Rating -> Post-Rating
  ss_2 <- str_split_fixed(txt[i + 1], fixed("|"), n = Inf)

  id <- as.integer(ss_1[1])
  if (is.na(id) || id < 1L || id > last_id) {
    warning("Skipping line ", i, ": player ID is missing or out of range",
            call. = FALSE)
    next
  }

  # A round cell is "<result> <opponent PID>"; cells without a number
  # (byes, unplayed games) give NA.
  o_ids[id, ] <- c(id, as.integer(str_extract(ss_1[round_cols], "[0-9]+")))

  records[[k]] <- data.frame(
    p_id = id,
    p_name = str_trim(ss_1[2], side = "both"),
    p_points = as.double(ss_1[3]),
    p_state = str_trim(ss_2[1], side = "both"),
    # The pre-rating is the number that follows "R:"; anchoring on the label
    # avoids picking up the USCF ID or a provisional-games suffix.
    p_prerating = as.integer(str_match(ss_2[2], "R:\\s*([0-9]+)")[, 2])
  )
}

# Bind all rows once at the end instead of growing p_info inside the loop.
records <- records[!vapply(records, is.null, logical(1))]
p_info <- do.call(rbind, c(list(p_info), records))
Now that there is a dataframe with all the player information and a matrix with all of the player-opponent information, the calculation for the average pre-chess rating of opponents can be made for each player.
# Average pre-rating of each player's opponents.
# Result: pco_rating, a matrix with one row per row of o_ids;
# column 1 is the PID, column 2 the (unrounded) average, NA if no opponents.
opp_ids <- o_ids[, -1, drop = FALSE]

# Look opponents up by their PID rather than by row position in p_info, so the
# result stays correct even if p_info is missing a player or is not in PID order.
opp_rating <- matrix(
  p_info$p_prerating[match(opp_ids, p_info$p_id)],
  nrow = nrow(opp_ids)
)

# Rounds without an opponent add nothing to the sum and are not counted.
# An opponent whose rating is unknown keeps its NA, so the average becomes NA.
n_opponents <- rowSums(!is.na(opp_ids))
opp_rating[is.na(opp_ids)] <- 0
opp_average <- rowSums(opp_rating) / n_opponents
opp_average[n_opponents == 0] <- NA # handle case where no opponents

pco_rating <- matrix(c(o_ids[, 1], opp_average), ncol = 2)
Now, all relevant information is in the dataframe except the pre-chess rating of the opponents. Do a quick check that the number of rows in the dataframe and the matrix match.
# Rows and columns of the player table
dim(p_info)
## [1] 64 5
# Rows and columns of the opponent-average matrix
dim(pco_rating)
## [1] 64 2
Make a new dataframe that includes the rounded pre-chess rating of the opponents.
# Combine the player information with the rounded average opponent pre-rating.
# Players are matched to pco_rating by PID (column 1 of pco_rating). A player
# without a matching row is left out. match() replaces the nested loop and the
# `==` comparison, which raises an error when a PID is NA.
rating_row <- match(p_info$p_id, pco_rating[, 1])
has_rating_row <- !is.na(p_info$p_id) & !is.na(rating_row)

# Name the columns explicitly; unnamed data.frame() arguments would otherwise
# produce headers such as "p_info.p_name.i." in the output file.
df <- data.frame(
  p_name = p_info$p_name[has_rating_row],
  p_points = p_info$p_points[has_rating_row],
  p_state = p_info$p_state[has_rating_row],
  p_prerating = p_info$p_prerating[has_rating_row],
  o_prerating = as.integer(round(pco_rating[rating_row[has_rating_row], 2]))
)
Do a quick check to see if the results make sense.
# Show the first rows of the final table
head(df)
## p_info.p_name.i. p_info.p_points.i. p_info.p_state.i.
## 1 GARY HUA 6.0 ON
## 2 DAKSHESH DARURI 6.0 MI
## 3 ADITYA BAJAJ 6.0 MI
## 4 PATRICK H SCHILLING 5.5 MI
## 5 HANSHI ZUO 5.5 MI
## 6 HANSEN SONG 5.0 OH
## p_info.p_prerating.i. round.pco_rating.ii..2..
## 1 1794 1605
## 2 1553 1469
## 3 1384 1564
## 4 1716 1574
## 5 1655 1501
## 6 1686 1519
A brief check validates the data.
Write the dataframe to a csv file.
# Write only the five data columns; by default write.csv() also writes the
# row numbers as an extra, unnamed first column.
write.csv(df, "chess_ratings.csv", row.names = FALSE)
This code successfully takes the input text file and produces an output file with the Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents.
This code is flexible in a few ways:
1. For debugging purposes, rather than relying on the row index of the p_info dataframe, the o_ids matrix, and the pco_rating matrix to align and indicate the Player ID, I stored the Player ID in each. For p_info it is p_info$p_id, and for o_ids and pco_rating it is the first column. This way, if an entry or entries are missing, the Player ID is still preserved and can be referenced back to.
2. This code will include players in the output csv even if the player did not have any opponents. In this case, the average pre-chess rating of opponents column would be NA.
A shortcoming of this code is that it may not handle an incorrectly formatted row well. Depending on what the formatting error is, the code could get into a state that it would not recover from.
To extend this code, I would take the input file and introduce different formatting errors to it and work on the error handling aspects of the code.