DESCRIPTION.

The purpose of this project is to analyze data from a TXT file with chess tournament results. The Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents are the fields that must be extracted in order to analyze the data and produce the output required for the project.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Get the raw tournament text from my GitHub account (one element per line)
raw_data <- readLines("https://raw.githubusercontent.com/vitugo23/DATA607/main/project1/Chess_tournament.txt")

# Extract the key fields with regular expressions. str_extract_all() returns
# one list element per input line; unlist() flattens all matches into a single
# vector in file order.

# Player number: 1-2 digits preceded by 3-4 whitespace characters and followed
# by whitespace
player_num <- as.numeric(unlist(str_extract_all(raw_data, "(?<=\\s{3,4})\\d{1,2}(?=\\s)")))

# Player name: the text after "<digit> | " up to the next "|".
# The letter class is spelled A-Za-z on purpose: the range A-z would also
# match the ASCII punctuation that sits between "Z" and "a" ([ \ ] ^ _ `).
# The match keeps the blanks that pad the name inside its column.
player_name <- unlist(str_extract_all(raw_data, "(?<=\\d\\s\\|\\s)([A-Za-z, -]*\\s){1,}[[:alpha:]]*(?=\\s*\\|)"))

# Player state: two upper-case letters directly followed by " |"
player_state <- unlist(str_extract_all(raw_data, "[[:upper:]]{2}(?=\\s\\|)"))

# Total points: a "d.d" number immediately after a "|"
total_pts <- as.numeric(unlist(str_extract_all(raw_data, "(?<=\\|)\\d\\.\\d")))

# Pre-rating: 3-4 digits after "R:" (plus 1-2 whitespace characters) that are
# followed by whitespace, or 3-4 digits followed by "P<digits>" and a "-"
player_pre_rat <- as.numeric(unlist(str_extract_all(raw_data, "(?<=R:\\s{1,2})(\\d{3,4}(?=\\s))|(\\d{3,4}(?=P\\d{1,2}\\s*-))")))

# data.frame() silently recycles shorter vectors whose length divides the
# longest one, so make sure every pattern matched the same number of times
# before combining the fields.
field_lengths <- lengths(list(player_num, player_name, player_state, total_pts, player_pre_rat))
if (length(unique(field_lengths)) != 1) {
  stop(
    "Extracted fields have different lengths: ",
    paste(field_lengths, collapse = ", "),
    call. = FALSE
  )
}

processed_data <- data.frame(player_num, player_name, player_state, total_pts, player_pre_rat)

# Review dataframe structure
str(processed_data)
## 'data.frame':    64 obs. of  5 variables:
##  $ player_num    : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ player_name   : chr  "GARY HUA                        " "DAKSHESH DARURI                 " "ADITYA BAJAJ                    " "PATRICK H SCHILLING             " ...
##  $ player_state  : chr  "ON" "MI" "MI" "MI" ...
##  $ total_pts     : num  6 6 6 5.5 5.5 5 5 5 5 5 ...
##  $ player_pre_rat: num  1794 1553 1384 1716 1655 ...
# Select the line of each player record that is scanned for opponents: every
# third line of the file, starting at line 5. The end of the sequence follows
# the actual file length instead of a fixed line number, so a file with more
# or fewer players is handled the same way.
secondary_rows <- raw_data[seq(5, length(raw_data), 3)]

# Opponent numbers, in file order. The first alternative captures the 1-2
# digit number that follows "|W", "|L" or "|D" (plus 2-3 whitespace
# characters) and precedes "|". The second alternative captures the four
# blanks that follow "|U", "|H", "|B" or "|X" (when not preceded by "->");
# as.numeric() turns those blanks into NA, so a round without an opponent
# still occupies a slot in the vector.
opponent_num <- as.numeric(unlist(str_extract_all(secondary_rows, "(?<=\\|(W|L|D)\\s{2,3})[[:digit:]]{1,2}(?=\\|)|((?<!->)(?<=\\|(U|H|B|X))\\s{4}(?=\\|))")))
# Calculate the PCR (average pre chess rating) of each player's opponents
rounds_per_player <- 7

# opponent_num is consumed in consecutive groups of rounds_per_player entries
# (one group per player), so its length must be a whole number of groups
if (length(opponent_num) %% rounds_per_player != 0) {
  stop(
    "opponent_num does not contain ", rounds_per_player,
    " entries per player",
    call. = FALSE
  )
}

# One row per player, one column per round; NA marks a round with no opponent
opponent_matrix <- matrix(opponent_num, ncol = rounds_per_player, byrow = TRUE)

# Replace every opponent number with that opponent's pre-rating. match() looks
# up each game individually, so the rating total and the game count below are
# always computed over the same set of games.
opponent_ratings <- opponent_matrix
opponent_ratings[] <- processed_data$player_pre_rat[
  match(opponent_matrix, processed_data$player_num)
]

# Games with a rated opponent, and the sum of those opponents' pre-ratings
games_played <- rowSums(!is.na(opponent_ratings))
total_opp_pcr <- rowSums(opponent_ratings, na.rm = TRUE)

# Matrix with one row per player and the columns total_opp_pcr / avg_opp_pcr
pcr_matrix <- cbind(
  total_opp_pcr = total_opp_pcr,
  avg_opp_pcr = total_opp_pcr / games_played
)

# Verify the matrix works as needed
head(pcr_matrix, 5)
##      total_opp_pcr avg_opp_pcr
## [1,]         11237    1605.286
## [2,]         10285    1469.286
## [3,]         10945    1563.571
## [4,]         11015    1573.571
## [5,]         10506    1500.857
# Round the average opponent rating (second matrix column) to the nearest
# whole number, then attach it to the player table as avg_opp_pcr
avg_col <- 2
pcr_matrix[, avg_col] <- round(pcr_matrix[, avg_col])

processed_data[["avg_opp_pcr"]] <- pcr_matrix[, avg_col]
# Build the copy that goes into the CSV: the extracted names still carry the
# trailing blanks that pad them in the source text, so strip them here.
# processed_data itself is left untouched.
export_data <- processed_data
export_data$player_name <- trimws(export_data$player_name)

# Get working directory path
path <- getwd()

# Export file to working directory. row.names = FALSE stops write.csv() from
# adding an unnamed leading column of row numbers to the file.
write.csv(
  export_data,
  file.path(path, "chess_processed_data.csv"),
  row.names = FALSE
)
head(processed_data, 5)
##   player_num                      player_name player_state total_pts
## 1          1 GARY HUA                                   ON       6.0
## 2          2 DAKSHESH DARURI                            MI       6.0
## 3          3 ADITYA BAJAJ                               MI       6.0
## 4          4 PATRICK H SCHILLING                        MI       5.5
## 5          5 HANSHI ZUO                                 MI       5.5
##   player_pre_rat avg_opp_pcr
## 1           1794        1605
## 2           1553        1469
## 3           1384        1564
## 4           1716        1574
## 5           1655        1501
library(ggplot2)
# Bar chart: number of players at each total-points value
ggplot(processed_data, aes(x = total_pts)) +
  geom_bar()

# Side-by-side bars for players 41-64: each player's own pre-rating (yellow)
# next to the average pre-rating of their opponents (green)
shown_players <- 41:64
rating_pairs <- rbind(
  processed_data$player_pre_rat[shown_players],
  processed_data$avg_opp_pcr[shown_players]
)
barplot(
  rating_pairs,
  beside = TRUE,
  col = c("yellow", "green"),
  xlab = "Player Pre-Rat. vs. Average Opponent Pre-Rat.",
  ylab = "Players pre-rating",
  names.arg = shown_players
)

Conclusion.

The data was successfully extracted from the text file, loaded into a data frame, and analyzed in order to get the information required in the project. The process was successfully achieved thanks to some of the Tidyverse functions as well as Regex.

Sources.

R for Data Science
Stack Overflow
GeeksforGeeks
DataCamp