In this project I will read a text file of chess tournament results into R, tidy the resulting table, and produce a csv file with the following information of all players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
#Loading Packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Reading the text file into R
rawdata <- read.table("C:\\Users\\chung\\Downloads\\tournamentinfo.txt", sep = "|", fill = TRUE)
head(rawdata)
## V1
## 1 -----------------------------------------------------------------------------------------
## 2 Pair
## 3 Num
## 4 -----------------------------------------------------------------------------------------
## 5 1
## 6 ON
## V2 V3 V4 V5 V6 V7 V8 V9
## 1
## 2 Player Name Total Round Round Round Round Round Round
## 3 USCF ID / Rtg (Pre->Post) Pts 1 2 3 4 5 6
## 4
## 5 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7 D 12
## 6 15445895 / R: 1794 ->1817 N:2 W B W B W B
## V10 V11
## 1 NA
## 2 Round NA
## 3 7 NA
## 4 NA
## 5 D 4 NA
## 6 W NA
#Removing the first three rows of header information, and every third row (used in the text file to seperate information from one player to the next). This leaves us with increments of two rows (each pair corresponds to the player and their performance in the tournament)
data <- rawdata[-c(1:3),-11]
data <- data[-grep('^----', data$V1),]
#Extracting player IDs and player prechess ratings from every second row.
staterow <- data[seq(2,128,2),]
#Removing unneeded columns
staterow <- staterow[,-c(3:10)]
#Splitting column V2 to extract the player id and prechess ratings
staterow <- staterow %>%
separate(V2, c('c1','c2','c3','c4'))
## Warning: Expected 4 pieces. Additional pieces discarded in 64 rows [1, 2, 3, 4, 5, 6, 7,
## 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
#Removing unneeded columns
staterow <- staterow[,c(-2,-4)]
#Removing provisional "P" and game count of each player if applicable.
staterow <- staterow %>%
separate(c4, c('pre.rating','c5'), sep = 'P')
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 54 rows [1, 2, 3, 4, 5,
## 6, 7, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 22, 23, ...].
#removing unneeded columns and renamming
staterow <- staterow[,-4]
staterow <- staterow %>%
rename(state = V1,
p.id = c2)
#Adding in a player number column so that we can reference the prechess ratings
staterow$player.number <- seq.int(nrow(staterow))
#Extracting player name and total points from every other row of the “data” dataframe
#Creating new dataframe with the desired rows
playerrow <- data[seq(1,128,2),]
#Extracting the opponent numebrs for each round
playerrow <- extract (playerrow, V4, c('result1','O1'), '([A-Z]{0,1}) +([0-9]{0,2})')
playerrow <- extract (playerrow, V5, c('result2','O2'), '([A-Z]{0,1}) +([0-9]{0,2})')
playerrow <- extract (playerrow, V6, c('result3','O3'), '([A-Z]{0,1}) +([0-9]{0,2})')
playerrow <- extract (playerrow, V7, c('result4','O4'), '([A-Z]{0,1}) +([0-9]{0,2})')
playerrow <- extract (playerrow, V8, c('result5','O5'), '([A-Z]{0,1}) +([0-9]{0,2})')
playerrow <- extract (playerrow, V9, c('result6','O6'), '([A-Z]{0,1}) +([0-9]{0,2})')
playerrow <- extract (playerrow, V10, c('result7','O7'), '([A-Z]{0,1}) +([0-9]{0,2})')
#Remove unneeded columns and rename
playerrow <- playerrow[,-c(4,6,8,10,12,14,16)]
playerrow <- playerrow %>%
rename(player.number = V1,
player.name = V2,
total.points = V3)
#Creating a reference table to use left join to join the preratings into the player row data table
reference <- staterow[,c(4,3)]
#Join opponent pre chess ratings into player row to be able to calculate the average of opponent ratings.
playerrow <- merge(playerrow,reference,by.x = "O1",by.y = "player.number", sort = FALSE, all.x=TRUE)
playerrow <- merge(playerrow,reference,by.x = "O2",by.y = "player.number", sort = FALSE, all.x=TRUE)
playerrow <- merge(playerrow,reference,by.x = "O3",by.y = "player.number", sort = FALSE, all.x=TRUE)
playerrow <- merge(playerrow,reference,by.x = "O4",by.y = "player.number", sort = FALSE, all.x=TRUE)
## Warning in merge.data.frame(playerrow, reference, by.x = "O4", by.y =
## "player.number", : column names 'pre.rating.x', 'pre.rating.y' are duplicated
## in the result
playerrow <- merge(playerrow,reference,by.x = "O5",by.y = "player.number", sort = FALSE, all.x=TRUE)
## Warning in merge.data.frame(playerrow, reference, by.x = "O5", by.y =
## "player.number", : column names 'pre.rating.x', 'pre.rating.y' are duplicated
## in the result
playerrow <- merge(playerrow,reference,by.x = "O6",by.y = "player.number", sort = FALSE, all.x=TRUE)
## Warning in merge.data.frame(playerrow, reference, by.x = "O6", by.y =
## "player.number", : column names 'pre.rating.x', 'pre.rating.y', 'pre.rating.x',
## 'pre.rating.y' are duplicated in the result
playerrow <- merge(playerrow,reference,by.x = "O7",by.y = "player.number", sort = FALSE, all.x=TRUE)
## Warning in merge.data.frame(playerrow, reference, by.x = "O7", by.y =
## "player.number", : column names 'pre.rating.x', 'pre.rating.y', 'pre.rating.x',
## 'pre.rating.y' are duplicated in the result
#Dropping unneeded columns and renaming to be able to calculate average prerating of opponents
playerrow <- playerrow[,-c(1:7)]
colnames(playerrow) <- c("player.number","name","total.points","O1PR","O2PR","O3PR","O4PR","O5PR","O6PR","O7PR")
#converting vairables to numeric to be able to calculate average.
playerrow <- playerrow %>%
mutate_at(c(1,3:10),as.numeric)
#calculating average pre-rating of opponents
playerrow <- mutate(playerrow, "avg_prerating_opp" = rowMeans(playerrow[,4:10], na.rm = TRUE))
#Removing opponent pre-ratings
playerrow <- playerrow[,-c(4:10)]
#Creating and formatting the final table to be exported to csv
output.tb <- merge(playerrow,staterow, by = "player.number")
#organizing variable positions to match the requirements of the project
output.tb <- output.tb[,c(2,5,3,7,4)]
#converting column pre.rating into number data type, and rounding avg_prerating_opp
output.tb <- output.tb %>%
mutate_at(c(4),as.numeric)
output.tb <- output.tb %>%
mutate(avg_prerating_opp = round(avg_prerating_opp))
head(output.tb)
## name state total.points pre.rating
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## avg_prerating_opp
## 1 1605
## 2 1469
## 3 1564
## 4 1574
## 5 1501
## 6 1519
write.csv(output.tb,"finalchessdf.csv")