DATA 607 - Project # 1

Vladimir Nimchenko

Introduction:

The chess cross table data in the text file was unstructured and not suitable for analysis. Using regular expressions, I parsed the necessary data and transformed it into a structure suitable for analysis. I also performed the calculations needed to get the total and average ratings of each player's opponents. Finally, I merged these totals and averages with the fields parsed from the unstructured cross table file and wrote the result to a CSV file. The data is now organized in a structured, easy-to-read format.

Retrieve the data from GitHub

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6     v purrr   0.3.4
## v tibble  3.1.8     v dplyr   1.0.9
## v tidyr   1.2.0     v stringr 1.4.1
## v readr   2.1.2     v forcats 0.5.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Read the raw tournament cross table from GitHub as a character vector with
# one element per line of the text file. Every later step parses this vector.
chess_info <- readLines("https://raw.githubusercontent.com/GitHub-Vlad/Data-Science/main/tournamentinfo.txt")

## Warning in readLines("https://raw.githubusercontent.com/GitHub-Vlad/Data-
## Science/main/tournamentinfo.txt"): incomplete final line found on 'https://
## raw.githubusercontent.com/GitHub-Vlad/Data-Science/main/tournamentinfo.txt'

Use regular expressions to parse the fields needed for analysis and add them to a data frame

# Parse the five per-player fields out of the raw lines with regular
# expressions. str_extract_all() returns one list element per input line, so
# each result is flattened with unlist() into a plain vector holding one value
# per player, in file order.

# 1 Player (pair) number:
# one or two digits that are preceded by three or four whitespace characters
# (lookbehind) and followed by a whitespace character (lookahead).
pair_num <- as.numeric(unlist(str_extract_all(
  chess_info,
  "(?<=\\s{3,4})\\d{1,2}(?=\\s)"
)))

# 2 Player name:
# everything between the "<digit> | " that ends the pair-number column
# (lookbehind) and the next "|", without the padding in front of that "|"
# (lazy match + lookahead). The previous pattern kept only the first two words
# of a name, so e.g. "MICHAEL J MARTIN" was cut down to "MICHAEL J"; taking the
# whole field keeps middle initials, suffixes and multi-word surnames.
name <- unlist(str_extract_all(
  chess_info,
  "(?<=\\d\\s\\|\\s)[^|]+?(?=\\s*\\|)"
))

# 3 Player state:
# two upper-case letters preceded by three or four whitespace characters
# (lookbehind) and followed by a whitespace character and "|" (lookahead).
state <- unlist(str_extract_all(
  chess_info,
  "(?<=\\s{3,4})[[:upper:]]{2}(?=\\s\\|)"
))

# 4 Total points:
# a digit, a literal ".", and another digit directly after a "|" (lookbehind).
points <- as.numeric(unlist(str_extract_all(
  chess_info,
  "(?<=\\|)(\\d\\.\\d)"
)))

# 5 Pre-tournament rating:
# three or four digits that follow "R:" and one or two whitespace characters
# (lookbehind). Anything after those digits is left out of the match.
pcr_rating <- as.numeric(unlist(str_extract_all(
  chess_info,
  "(?<=R:\\s{1,2})\\d{3,4}"
)))

# Combine the parsed fields into one data frame, one row per player.
# data.frame() stops with an error if the five vectors differ in length, which
# would mean one of the patterns missed or double-matched a player.
chess_info_parsed <- data.frame(pair_num, name, state, points, pcr_rating)

Create a list of the opponent numbers recorded for each game played

# Collect, across all lines of the file, every one- or two-digit number that
# follows a "|W", "|L" or "|D" result code and two or three whitespace
# characters, flattened into a single numeric vector.
# NOTE(review): despite the variable name, these look like the opponents' pair
# numbers rather than their ratings - confirm against the tournament file.
opponent_player_ratings <- chess_info %>%
  str_extract_all("(?<=\\|(W|L|D)\\s{2,3})\\d{1,2}") %>%
  unlist() %>%
  as.numeric()

Calculate the total and average rating of each player's opponents and store the two values in a matrix

# Build a two-column matrix holding, for every player, the total and the
# average pre-tournament rating of the opponents that player actually faced.
#
# The previous loop summed the flattened opponent numbers in fixed groups of
# seven. That added up the extracted numbers themselves instead of the
# opponents' ratings, and it fell out of step with the players as soon as
# someone had fewer than seven W/L/D entries (only those entries carry an
# opponent number). Here the opponents stay grouped per player, are translated
# to ratings through pair_num / pcr_rating, and are averaged over the games
# that player really played.

# Keep only the lines on which the pair-number pattern matches, so that the
# rows produced here line up with pair_num and with chess_info_parsed.
player_lines <- chess_info[
  str_detect(chess_info, "(?<=\\s{3,4})\\d{1,2}(?=\\s)")
]

# One numeric vector of opponent numbers per player line (W, L and D games
# only; entries without an opponent number contribute nothing).
opponents_by_player <- lapply(
  str_extract_all(player_lines, "(?<=\\|(W|L|D)\\s{2,3})\\d{1,2}"),
  as.numeric
)

# Fail loudly if the per-player vectors do not describe the same players.
stopifnot(
  length(player_lines) == length(pair_num),
  length(pair_num) == length(pcr_rating)
)

# Look every opponent number up in pair_num and add up the matching
# pre-ratings. An opponent number that is not found yields NA for that player.
opponent_total_ratings <- vapply(
  opponents_by_player,
  function(opponents) sum(pcr_rating[match(opponents, pair_num)]),
  numeric(1)
)

# Average over the number of games played, not a fixed seven. A player with no
# W/L/D games gets NaN (0 / 0) rather than a misleading number.
games_played <- lengths(opponents_by_player)
opponent_average_ratings <- opponent_total_ratings / games_played

# Same shape and column names as before: one row per player, two columns.
rating_matrix <- cbind(
  opponent_total_ratings = opponent_total_ratings,
  opponent_average_ratings = opponent_average_ratings
)

Add all the opponent_total_ratings and opponent_average_ratings to the chess_info_parsed data frame

# Append the two columns of rating_matrix to the parsed player fields,
# keeping the column names exactly as they are.
chess_info_parsed <- data.frame(
  chess_info_parsed,
  rating_matrix,
  check.names = FALSE
)

Writing the chess_info_parsed data frame to a csv file

# Resolve the current working directory once so the location of the exported
# file is easy to inspect while debugging.
path <- getwd()

# Write the combined data frame to "chess_info_final.csv" in that directory.
write.csv(
  x = chess_info_parsed,
  file = file.path(path, "chess_info_final.csv")
)