The chess cross-table data in the text file was unstructured and not suitable for analysis. Using regular expressions, I parsed the necessary fields in order to transform the data into a structure suitable for analysis. I also performed the calculations needed to get the total and average pre-tournament rating of each player's opponents. Finally, I merged the total and average opponent ratings with the parsed fields from the unstructured cross-table file and wrote them to a CSV file. The data is now organized in a structured, easy-to-read format.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.9
## v tidyr 1.2.0 v stringr 1.4.1
## v readr 2.1.2 v forcats 0.5.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read the raw cross table into a character vector, one element per line.
tournament_url <- "https://raw.githubusercontent.com/GitHub-Vlad/Data-Science/main/tournamentinfo.txt"
chess_info <- readLines(con = tournament_url)
## Warning in readLines("https://raw.githubusercontent.com/GitHub-Vlad/Data-
## Science/main/tournamentinfo.txt"): incomplete final line found on 'https://
## raw.githubusercontent.com/GitHub-Vlad/Data-Science/main/tournamentinfo.txt'
# Parse the per-player fields out of the raw cross-table lines. Every pattern
# is applied to all lines of `chess_info`; unlist() flattens the per-line
# matches into one vector per field.

# 1 Player (pair) number:
# The lookbehind (?<=\\s{3,4}) requires 3-4 whitespace characters before a
# 1-2 digit number (\\d{1,2}); the lookahead (?=\\s) requires whitespace after
# it. Neither the leading nor the trailing whitespace is part of the match.
pair_num <- as.numeric(unlist(
  str_extract_all(chess_info, "(?<=\\s{3,4})\\d{1,2}(?=\\s)")
))

# 2 Player name:
# The lookbehind (?<=\\d\\s\\|\\s) anchors the match right after
# "<digit> | ". The lazy [^|]+? then takes everything up to the padding
# whitespace that precedes the next "|", so names with any number of words
# (and initials, hyphens, etc.) are captured in full. The previous pattern kept
# only the first two words and truncated longer names.
name <- str_trim(unlist(
  str_extract_all(chess_info, "(?<=\\d\\s\\|\\s)[^|]+?(?=\\s*\\|)")
))

# 3 Player state:
# The lookbehind (?<=\\s{3,4}) requires 3-4 whitespace characters before two
# upper-case letters ([[:upper:]]{2}); the lookahead (?=\\s\\|) requires a
# whitespace character and a "|" after them.
state <- unlist(
  str_extract_all(chess_info, "(?<=\\s{3,4})[[:upper:]]{2}(?=\\s\\|)")
)

# 4 Total points:
# The lookbehind (?<=\\|) requires a "|" immediately before a
# digit, ".", digit sequence (\\d\\.\\d), e.g. "6.0".
points <- as.numeric(unlist(
  str_extract_all(chess_info, "(?<=\\|)(\\d\\.\\d)")
))

# 5 Pre-tournament rating:
# The lookbehind (?<=R:\\s{1,2}) requires "R:" followed by 1-2 whitespace
# characters; the match itself is the 3-4 digit rating (\\d{3,4}). Anything
# after those digits is not part of the match.
pcr_rating <- as.numeric(unlist(
  str_extract_all(chess_info, "(?<=R:\\s{1,2})\\d{3,4}")
))

# data.frame() silently recycles vectors whose lengths divide evenly, so make
# sure every pattern matched exactly once per player before combining them.
field_lengths <- lengths(list(pair_num, name, state, points, pcr_rating))
stopifnot(length(unique(field_lengths)) == 1)

# Add parsed fields to the data frame
chess_info_parsed <- data.frame(pair_num, name, state, points, pcr_rating)
# Compute, for every player, the total and average pre-tournament rating of
# the opponents that player actually faced, and append both as columns to
# `chess_info_parsed`.
#
# The previous version summed the extracted opponent *pair numbers* (not their
# ratings) and chunked the flattened matches in fixed groups of 7, which
# misaligns every row after the first player with fewer than 7 played games.

# A played round is "|" + W/L/D + 2-3 whitespace characters + the opponent's
# pair number; only the pair number is part of the match.
opponent_pattern <- "(?<=\\|(W|L|D)\\s{2,3})\\d{1,2}"

# Lines that start with whitespace, a number and a "|" are treated as the
# per-player result lines (assumed to be one per player, in the same order as
# the rows of `chess_info_parsed` -- verified by the stopifnot() below).
is_player_row <- str_detect(chess_info, "^\\s+\\d+\\s+\\|")

# One integer vector of opponent pair numbers per player; players with
# unplayed rounds simply get a shorter vector.
opponent_pair_nums <- lapply(
  str_extract_all(chess_info[is_player_row], opponent_pattern),
  as.integer
)
stopifnot(length(opponent_pair_nums) == nrow(chess_info_parsed))

# Translate each opponent pair number into that opponent's pre-rating.
# match() looks the pair number up explicitly instead of relying on row order.
opponent_player_ratings <- lapply(
  opponent_pair_nums,
  function(ids) {
    chess_info_parsed$pcr_rating[match(ids, chess_info_parsed$pair_num)]
  }
)

# One row per player: sum and mean over the games actually played. A player
# with no played games gets a total of 0 and an average of NA.
rating_matrix <- cbind(
  opponent_total_ratings = vapply(opponent_player_ratings, sum, numeric(1)),
  opponent_average_ratings = vapply(
    opponent_player_ratings,
    function(ratings) if (length(ratings) > 0) mean(ratings) else NA_real_,
    numeric(1)
  )
)

chess_info_parsed <- cbind(chess_info_parsed, rating_matrix)
# Resolve the current working directory so the location of the exported file
# is easy to find while debugging.
path <- getwd()
# Write the final data frame to "chess_info_final.csv" in that directory.
csv_file <- file.path(path, "chess_info_final.csv")
write.csv(chess_info_parsed, file = csv_file)