Vladimir Nimchenko

Introduction:

The chess cross table data in the text file was unstructured and not suitable for analysis. Using regular expressions, I parsed the necessary fields in order to transform the data into the structure needed for analysis. I also performed the calculations needed to get the total and average rating of each player’s opponents. Finally, I merged the total and average opponent ratings with the fields parsed from the unstructured cross table file and wrote the result to a CSV file. The data is now organized in a structured and easy-to-read format.

Retrieve the data from GitHub

library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.2     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.5.0
## v ggplot2   3.4.2     v tibble    3.2.1
## v lubridate 1.9.2     v tidyr     1.3.0
## v purrr     1.0.1     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the raw cross table from GitHub; readLines() returns one character
# element per line of the text file.
chess_info <- readLines(
  con = "https://raw.githubusercontent.com/GitHub-Vlad/Data-Science-Projects/main/Data%20Transformation/Chess%20Tournament%20Results.txt"
)
## Warning in
## readLines("https://raw.githubusercontent.com/GitHub-Vlad/Data-Science-Projects/main/Data%20Transformation/Chess%20Tournament%20Results.txt"):
## incomplete final line found on
## 'https://raw.githubusercontent.com/GitHub-Vlad/Data-Science-Projects/main/Data%20Transformation/Chess%20Tournament%20Results.txt'

Use regular expressions to parse the fields necessary for analysis and add the fields to a data frame

# 1 Pair number
# The lookbehind requires 3-4 whitespace characters in front of a run of
# 1-2 digits, and the lookahead requires whitespace right after it.
# unlist() flattens the per-line match list into a single vector before the
# numeric conversion.
pair_num <- as.numeric(unlist(str_extract_all(chess_info, "(?<=\\s{3,4})\\d{1,2}(?=\\s)")))

# 2 Player name
# The lookbehind anchors on "<digit> | ", i.e. the end of the pair-number
# cell. The match starts at the first letter and runs up to (not including)
# the next "|", so names with a middle initial, three words, or a hyphen are
# kept whole instead of being cut off after the second word. str_trim()
# removes the padding that fills the rest of the cell.
name <- str_trim(unlist(str_extract_all(chess_info, "(?<=\\d\\s\\|\\s)[[:alpha:]][^|]*")))

# 3 Player state
# Exactly two uppercase letters that are preceded by 3-4 whitespace
# characters and followed by one whitespace character and a "|".
state <- unlist(str_extract_all(chess_info, "(?<=\\s{3,4})[[:upper:]]{2}(?=\\s\\|)"))

# 4 Total points
# A digit, a literal ".", and another digit located immediately after a "|".
points <- as.numeric(unlist(str_extract_all(chess_info, "(?<=\\|)(\\d\\.\\d)")))

# 5 Pre-tournament rating
# A run of 3-4 digits that follows "R:" and one or two whitespace characters.
pcr_rating <- as.numeric(unlist(str_extract_all(chess_info, "(?<=R:\\s{1,2})\\d{3,4}")))

# Combine the parsed fields into one data frame (one row per player).
chess_info_parsed <- data.frame(pair_num, name, state, points, pcr_rating)

Extract the opponent pair numbers from each round result

# Collect, across all lines, every 1-2 digit number that follows a "|", a
# W/L/D result code, and 2-3 whitespace characters, flattened into one vector.
# NOTE(review): these look like opponent pair numbers rather than ratings,
# despite the variable name -- confirm against the data.
opponent_player_ratings <- chess_info %>%
  str_extract_all("(?<=\\|(W|L|D)\\s{2,3})\\d{1,2}") %>%
  unlist() %>%
  as.numeric()

Calculate the total and average rating of each player's opponents across the seven rounds and store the results in a matrix

# Build a matrix that holds, for every player, the sum and the mean of the
# pre-tournament ratings of the opponents that player actually faced.
#
# The flattened opponent vector is not used here: it stores the numbers that
# follow each W/L/D code (opponent pair numbers, not ratings) and it loses
# track of which player an entry belongs to whenever someone has fewer than
# seven W/L/D results. Opponents are therefore re-read one player row at a
# time and translated into ratings through pcr_rating.

# Rows that open a player record: optional padding, the pair number, then "|".
player_rows <- chess_info[str_detect(chess_info, "^\\s*\\d+\\s*\\|")]
player_ids <- as.integer(str_extract(player_rows, "\\d+"))

# pcr_rating is looked up by position, so it must line up with the player rows.
stopifnot(length(pcr_rating) == length(player_rows))

# One integer vector of opponent pair numbers per player (W/L/D results only).
opponent_ids <- lapply(
  str_extract_all(player_rows, "(?<=\\|(W|L|D)\\s{2,3})\\d{1,2}"),
  as.integer
)

# One row per player instead of a hard-coded 64.
rating_matrix <- matrix(data = NA_real_, nrow = length(player_rows), ncol = 2)
colnames(rating_matrix) <- c("opponent_total_ratings", "opponent_average_ratings")

for (row_cnt in seq_along(opponent_ids)) {
  # Translate this player's opponent pair numbers into pre-tournament ratings.
  faced <- pcr_rating[match(opponent_ids[[row_cnt]], player_ids)]

  # A player without any W/L/D result keeps NA in both columns.
  if (length(faced) > 0) {
    rating_matrix[row_cnt, 1] <- sum(faced)
    # Average over the games actually played, not a fixed seven.
    rating_matrix[row_cnt, 2] <- mean(faced)
  }
}

Add the opponent_total_ratings and opponent_average_ratings columns to the chess_info_parsed data frame

# Append the two rating_matrix columns to the right of the parsed player
# fields; check.names = FALSE keeps the column names exactly as they are.
chess_info_parsed <- data.frame(
  chess_info_parsed,
  rating_matrix,
  check.names = FALSE
)

Write the chess_info_parsed data frame to a CSV file

# Capture the absolute path of the current working directory so the export
# location is easy to find while debugging.
path <- getwd()

# Save the assembled data frame as "chess_info_final.csv" in that directory.
write.csv(
  x = chess_info_parsed,
  file = file.path(path, "chess_info_final.csv")
)