Introduction

In this project I will read a text file of chess tournament results into R, tidy the resulting table, and produce a csv file with the following information of all players:

Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

#Loading Packages

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Reading the text file into R
rawdata <- read.table("C:\\Users\\chung\\Downloads\\tournamentinfo.txt", sep = "|", fill = TRUE)

head(rawdata)
##                                                                                          V1
## 1 -----------------------------------------------------------------------------------------
## 2                                                                                     Pair 
## 3                                                                                     Num  
## 4 -----------------------------------------------------------------------------------------
## 5                                                                                        1 
## 6                                                                                       ON 
##                                  V2    V3    V4    V5    V6    V7    V8    V9
## 1                                                                            
## 2  Player Name                      Total Round Round Round Round Round Round
## 3  USCF ID / Rtg (Pre->Post)         Pts    1     2     3     4     5     6  
## 4                                                                            
## 5  GARY HUA                         6.0   W  39 W  21 W  18 W  14 W   7 D  12
## 6  15445895 / R: 1794   ->1817      N:2   W     B     W     B     W     B    
##     V10 V11
## 1        NA
## 2 Round  NA
## 3   7    NA
## 4        NA
## 5 D   4  NA
## 6 W      NA
#Removing the first three rows of header information, and every third row (used in the text file to seperate information from one player to the next). This leaves us with increments of two rows (each pair corresponds to the player and their performance in the tournament)

data <- rawdata[-c(1:3),-11]

data <- data[-grep('^----', data$V1),]

#Extracting player IDs and player prechess ratings from every second row.

staterow <- data[seq(2,128,2),]

#Removing unneeded columns
staterow <- staterow[,-c(3:10)]

#Splitting column V2 to extract the player id and prechess ratings
staterow <- staterow %>%
  separate(V2, c('c1','c2','c3','c4'))
## Warning: Expected 4 pieces. Additional pieces discarded in 64 rows [1, 2, 3, 4, 5, 6, 7,
## 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
#Removing unneeded columns
staterow <- staterow[,c(-2,-4)]

#Removing provisional "P" and game count of each player if applicable.
staterow <- staterow %>%
  separate(c4, c('pre.rating','c5'), sep = 'P')
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 54 rows [1, 2, 3, 4, 5,
## 6, 7, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 22, 23, ...].
#removing unneeded columns and renamming 
staterow <- staterow[,-4]
staterow <- staterow %>%
  rename(state = V1,
         p.id = c2)

#Adding in a player number column so that we can reference the prechess ratings 

staterow$player.number <- seq.int(nrow(staterow))

#Extracting player name and total points from every other row of the “data” dataframe

#Creating new dataframe with the desired rows

playerrow <- data[seq(1,128,2),]
#Extracting the opponent numebrs for each round

playerrow <- extract (playerrow, V4, c('result1','O1'), '([A-Z]{0,1}) +([0-9]{0,2})')
playerrow <- extract (playerrow, V5, c('result2','O2'), '([A-Z]{0,1}) +([0-9]{0,2})')
playerrow <- extract (playerrow, V6, c('result3','O3'), '([A-Z]{0,1}) +([0-9]{0,2})')
playerrow <- extract (playerrow, V7, c('result4','O4'), '([A-Z]{0,1}) +([0-9]{0,2})')
playerrow <- extract (playerrow, V8, c('result5','O5'), '([A-Z]{0,1}) +([0-9]{0,2})')
playerrow <- extract (playerrow, V9, c('result6','O6'), '([A-Z]{0,1}) +([0-9]{0,2})')
playerrow <- extract (playerrow, V10, c('result7','O7'), '([A-Z]{0,1}) +([0-9]{0,2})')
#Remove unneeded columns and rename 

playerrow <- playerrow[,-c(4,6,8,10,12,14,16)]

playerrow <- playerrow %>%
  rename(player.number = V1,
         player.name = V2,
         total.points = V3)
#Creating a reference table to use left join to join the preratings into the player row data table

reference <- staterow[,c(4,3)]
#Join opponent pre chess ratings into player row to be able to calculate the average of opponent ratings.

playerrow <- merge(playerrow,reference,by.x = "O1",by.y = "player.number", sort = FALSE, all.x=TRUE)
playerrow <- merge(playerrow,reference,by.x = "O2",by.y = "player.number", sort = FALSE, all.x=TRUE)
playerrow <- merge(playerrow,reference,by.x = "O3",by.y = "player.number", sort = FALSE, all.x=TRUE)
playerrow <- merge(playerrow,reference,by.x = "O4",by.y = "player.number", sort = FALSE, all.x=TRUE)
## Warning in merge.data.frame(playerrow, reference, by.x = "O4", by.y =
## "player.number", : column names 'pre.rating.x', 'pre.rating.y' are duplicated
## in the result
playerrow <- merge(playerrow,reference,by.x = "O5",by.y = "player.number", sort = FALSE, all.x=TRUE)
## Warning in merge.data.frame(playerrow, reference, by.x = "O5", by.y =
## "player.number", : column names 'pre.rating.x', 'pre.rating.y' are duplicated
## in the result
playerrow <- merge(playerrow,reference,by.x = "O6",by.y = "player.number", sort = FALSE, all.x=TRUE)
## Warning in merge.data.frame(playerrow, reference, by.x = "O6", by.y =
## "player.number", : column names 'pre.rating.x', 'pre.rating.y', 'pre.rating.x',
## 'pre.rating.y' are duplicated in the result
playerrow <- merge(playerrow,reference,by.x = "O7",by.y = "player.number", sort = FALSE, all.x=TRUE)
## Warning in merge.data.frame(playerrow, reference, by.x = "O7", by.y =
## "player.number", : column names 'pre.rating.x', 'pre.rating.y', 'pre.rating.x',
## 'pre.rating.y' are duplicated in the result

#Dropping unneeded columns and renaming to be able to calculate average prerating of opponents

playerrow <- playerrow[,-c(1:7)]
colnames(playerrow) <- c("player.number","name","total.points","O1PR","O2PR","O3PR","O4PR","O5PR","O6PR","O7PR")
#converting vairables to numeric to be able to calculate average.

playerrow <- playerrow %>%
  mutate_at(c(1,3:10),as.numeric)

#calculating average pre-rating of opponents

playerrow <- mutate(playerrow, "avg_prerating_opp" = rowMeans(playerrow[,4:10], na.rm = TRUE))
#Removing opponent pre-ratings
playerrow <- playerrow[,-c(4:10)]

#Creating and formatting the final table to be exported to csv

output.tb <- merge(playerrow,staterow, by = "player.number")

#organizing variable positions to match the requirements of the project

output.tb <- output.tb[,c(2,5,3,7,4)]

#converting column pre.rating into number data type, and rounding avg_prerating_opp

output.tb <- output.tb %>%
  mutate_at(c(4),as.numeric)

output.tb <- output.tb %>%
  mutate(avg_prerating_opp = round(avg_prerating_opp))

head(output.tb)
##                                name  state total.points pre.rating
## 1  GARY HUA                            ON           6.0       1794
## 2  DAKSHESH DARURI                     MI           6.0       1553
## 3  ADITYA BAJAJ                        MI           6.0       1384
## 4  PATRICK H SCHILLING                 MI           5.5       1716
## 5  HANSHI ZUO                          MI           5.5       1655
## 6  HANSEN SONG                         OH           5.0       1686
##   avg_prerating_opp
## 1              1605
## 2              1469
## 3              1564
## 4              1574
## 5              1501
## 6              1519

Exporting final tb to csv

write.csv(output.tb,"finalchessdf.csv")