The purpose of this project is to analyze the data in a TXT file of
chess tournament results. For every player, the Player’s Name, Player’s
State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess
Rating of Opponents are the pieces of information required to analyze the
data and produce the output required for the project.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Download the raw tournament text file from GitHub; readLines() returns one
# character element per line of the file.
raw_data <- readLines("https://raw.githubusercontent.com/vitugo23/DATA607/main/project1/Chess_tournament.txt")

# Extract the key fields with regular expressions. Each pattern is run against
# every line and unlist() flattens the per-line matches into a single vector,
# so each vector is expected to end up with one entry per player.

# Pair number: 1-2 digits preceded by 3-4 whitespace characters.
player_num <- as.numeric(unlist(str_extract_all(raw_data, "(?<=\\s{3,4})\\d{1,2}(?=\\s)")))

# Player name: the text following "<digit> | " up to the next "|".
# The character class is [A-Za-z ...] rather than [A-z ...]: the A-z range
# also admits the punctuation that sits between "Z" and "a". The match still
# includes the padding that fills the column, so it is trimmed afterwards.
player_name <- str_trim(unlist(str_extract_all(raw_data, "(?<=\\d\\s\\|\\s)([A-Za-z, -]*\\s){1,}[[:alpha:]]*(?=\\s*\\|)")))

# State: two upper-case letters immediately followed by " |".
player_state <- unlist(str_extract_all(raw_data, "[[:upper:]]{2}(?=\\s\\|)"))

# Total points: "d.d" directly after a "|".
total_pts <- as.numeric(unlist(str_extract_all(raw_data, "(?<=\\|)\\d\\.\\d")))

# Pre-rating: 3-4 digits after "R:", either followed by whitespace or by a
# provisional marker such as "P12" ahead of the "-" of the "->" arrow.
player_pre_rat <- as.numeric(unlist(str_extract_all(raw_data, "(?<=R:\\s{1,2})(\\d{3,4}(?=\\s))|(\\d{3,4}(?=P\\d{1,2}\\s*-))")))

# data.frame() silently recycles a vector whose length divides the others, so
# confirm every pattern produced the same number of matches before combining.
field_lengths <- lengths(list(player_num, player_name, player_state, total_pts, player_pre_rat))
stopifnot(length(unique(field_lengths)) == 1L)

processed_data <- data.frame(player_num, player_name, player_state, total_pts, player_pre_rat)

# Review the data frame structure
str(processed_data)
## 'data.frame': 64 obs. of 5 variables:
## $ player_num : num 1 2 3 4 5 6 7 8 9 10 ...
## $ player_name : chr "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" "PATRICK H SCHILLING" ...
## $ player_state : chr "ON" "MI" "MI" "MI" ...
## $ total_pts : num 6 6 6 5.5 5.5 5 5 5 5 5 ...
## $ player_pre_rat: num 1794 1553 1384 1716 1655 ...
# Keep every third line of the raw file starting at line 5; these are the
# lines the round-by-round pattern below is applied to. The end of the
# sequence is taken from the file itself instead of a hard-coded line count,
# so the selection follows the length of the input.
secondary_rows <- raw_data[seq(from = 5, to = length(raw_data), by = 3)]

# Per round cell, capture either the 1-2 digit opponent number that follows a
# W/L/D result, or the four blanks of a cell marked U/H/B/X. as.numeric()
# turns the blank matches into NA, so rounds without an opponent keep their
# position in the flattened vector.
opponent_num <- as.numeric(unlist(str_extract_all(
  secondary_rows,
  "(?<=\\|(W|L|D)\\s{2,3})[[:digit:]]{1,2}(?=\\|)|((?<!->)(?<=\\|(U|H|B|X))\\s{4}(?=\\|))"
)))
# Calculate the PCR (average pre chess rating) of each player's opponents.
#
# opponent_num is a flat vector holding the same number of round cells for
# every player, in player order. The player and round counts are derived from
# the data instead of being hard-coded, and the shape is validated so that a
# missed cell raises an error instead of silently shifting later players.
n_players <- nrow(processed_data)
n_rounds <- length(opponent_num) / n_players
stopifnot(n_players > 0, n_rounds == floor(n_rounds))

# One row per player, one column per round (NA = no opponent that round).
opponent_ids <- matrix(opponent_num, nrow = n_players, ncol = n_rounds, byrow = TRUE)

# Look up each opponent's pre-rating by pair number. match() resolves every
# cell on its own, so the numerator and the game count below always refer to
# the same set of cells (a %in% filter would count a repeated or unknown id
# differently in the sum and in the divisor).
opponent_ratings <- matrix(
  processed_data$player_pre_rat[match(opponent_ids, processed_data$player_num)],
  nrow = n_players,
  ncol = n_rounds
)

games_played <- rowSums(!is.na(opponent_ratings))
total_opp_pcr <- rowSums(opponent_ratings, na.rm = TRUE)

# Column 1: sum of opponents' pre-ratings; column 2: their average.
pcr_matrix <- cbind(
  total_opp_pcr = total_opp_pcr,
  avg_opp_pcr = total_opp_pcr / games_played
)

# Verify the matrix looks as expected
head(pcr_matrix, 5)
## total_opp_pcr avg_opp_pcr
## [1,] 11237 1605.286
## [2,] 10285 1469.286
## [3,] 10945 1563.571
## [4,] 11015 1573.571
## [5,] 10506 1500.857
# Round the average opponent rating to the nearest whole number
pcr_matrix[, 2] <- round(pcr_matrix[, 2], digits = 0)

# Attach the rounded averages under an explicit column name. Assigning the
# column directly avoids depending on the name cbind() would auto-generate
# from the expression text, and lets this step be re-run without error.
processed_data[["avg_opp_pcr"]] <- pcr_matrix[, 2]
# Get working directory path
path <- getwd()

# Export the processed data to the working directory. row.names = FALSE keeps
# the file to the data columns only; by default write.csv() would also write
# the data frame's row numbers as an extra unnamed first column.
write.csv(
  processed_data,
  file.path(path, "chess_processed_data.csv"),
  row.names = FALSE
)

# Preview the first rows of the exported data
head(processed_data, 5)
## player_num player_name player_state total_pts
## 1 1 GARY HUA ON 6.0
## 2 2 DAKSHESH DARURI MI 6.0
## 3 3 ADITYA BAJAJ MI 6.0
## 4 4 PATRICK H SCHILLING MI 5.5
## 5 5 HANSHI ZUO MI 5.5
## player_pre_rat avg_opp_pcr
## 1 1794 1605
## 2 1553 1469
## 3 1384 1564
## 4 1716 1574
## 5 1655 1501
library(ggplot2)
# Bar chart of how many players finished on each total-points value
ggplot(data = processed_data, mapping = aes(x = total_pts)) +
  geom_bar()

# Grouped bar chart comparing each player's own pre-rating with the average
# pre-rating of their opponents, for rows 41 to 64 of processed_data.
plot_rows <- 41:64

# Row 1 = player's pre-rating, row 2 = average opponent pre-rating; with
# beside = TRUE each column becomes one pair of adjacent bars.
rating_pairs <- rbind(
  processed_data$player_pre_rat[plot_rows],
  processed_data$avg_opp_pcr[plot_rows]
)

barplot(
  rating_pairs,
  beside = TRUE,
  col = c("yellow", "green"),
  # Without a legend the two colours cannot be told apart by the reader.
  legend.text = c("Player pre-rating", "Average opponent pre-rating"),
  args.legend = list(x = "topright", bty = "n"),
  # Leave headroom above the tallest bar so the legend does not cover it.
  ylim = c(0, 1.25 * max(rating_pairs, na.rm = TRUE)),
  main = "Player Pre-Rat. vs. Average Opponent Pre-Rat.",
  xlab = "Player number",
  ylab = "Pre-rating",
  names.arg = processed_data$player_num[plot_rows]
)
