Libraries

library(tidyverse)
## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Preliminary Cleaning

# import file, convert to tibble
rawData <- as_tibble(read.delim('tournamentInfo.txt', header = FALSE, sep = '|', stringsAsFactors =FALSE))

# remove rows with dashes
rawData <- rawData %>% 
  filter(!str_detect(V1,'-----'))

# rename columns
colnames(rawData) = c('PAIR_NUM', 'PLAYER_NAME','TOTAL_PTS','R1', 'R2', 'R3','R4','R5','R6','R7', 'REMOVE')

# filter out rows/columns that do not contain data 
rawData <- rawData %>% 
  filter(str_detect(rawData$PAIR_NUM,'[A-Z]{2}|[[:digit:]]')) %>%
  select(-'REMOVE')

Combine the records

In order to make sure that we can aggregate the data in the proper way, we will need to alter the records so that each row contains all of the information for 1 player. To do this, we will:

# create 2 new columns
rawData$STATE <- 1:nrow(rawData)
rawData$PRE_RATING <- 1:nrow(rawData)

# update STATE column
rawData[str_detect(rawData$PAIR_NUM,'[[:digit:]]'),11] <- rawData[str_detect(rawData$PAIR_NUM,'[A-Z]{2}'),1]

# update PRE_RATING column
rawData[str_detect(rawData$PAIR_NUM,'[[:digit:]]'),12] <- rawData[str_detect(rawData$PAIR_NUM,'[A-Z]{2}'),2]

# remove columns without player number in first column
rawData <- rawData %>% 
  filter(str_detect(rawData$PAIR_NUM, '[[:digit:]]'))

Reformat data

Now that the data frame has 1 record per player, we need to extract out the pre-rating score and the opponents in each round.

# re-define pre-game rating
rawData$PRE_RATING <- str_extract(str_extract(rawData$PRE_RATING,'R\\:[[:space:]]+[[:digit:]]{1,5}'),'[[:digit:]]+')


# replace values in each of the rounds with just the indiviual that the player faced
cols <- c('R1','R2','R3','R4','R5','R6','R7')

for (c in cols){
  rawData[[c]] <- str_extract(rawData[[c]],'[[:digit:]]+')
}

Calculate the opponent scores

Before compiling all of the opponent scores, we will need to make sure that all of our columns containing numeric values are converted from strings.

newCols <- c('R1','R2','R3','R4','R5','R6','R7', 'PRE_RATING', 'TOTAL_PTS')
rawData[newCols] <- sapply(rawData[newCols],as.numeric)

Now that the data is numeric, we can start to compile the scores of the opponents. We will first create 2 empty lists - one that will end up storing the summed scores of all the opponents and one that will store the total number of opponents per player.

summaryList <- matrix(0, nrow(rawData), 1)
numOpponents <- matrix(0, nrow(rawData), 1)

Finally, we’ll create a loop that iterates through each of the player’s opponents and operations in the following manner:

for (c in cols){
  toAdd <- rawData[rawData[[c]],12]
  opponentList <- matrix(0,nrow(rawData),1)
  opponentList[!is.na(toAdd)] <- 1
  toAdd[is.na(toAdd)] <- 0
  summaryList <- summaryList + toAdd
  numOpponents <- numOpponents + opponentList
}

Now we will calculate the average pre-chess rating of each of the opponents.

rawData$AVG_OPPONENT_RATING <- summaryList$PRE_RATING / numOpponents

Final data set

Our final data set will include Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents and will be saved as a csv.

finalCols <- c('PLAYER_NAME', 'STATE', 'TOTAL_PTS', 'PRE_RATING', 'AVG_OPPONENT_RATING')

finalData <- rawData[,finalCols]
write.csv(finalData, 'finalData.csv')
head(finalData)
## # A tibble: 6 x 5
##   PLAYER_NAME              STATE   TOTAL_PTS PRE_RATING AVG_OPPONENT_RATIN~
##   <chr>                    <chr>       <dbl>      <dbl>               <dbl>
## 1 " GARY HUA             ~ "   ON~       6         1794               1605.
## 2 " DAKSHESH DARURI      ~ "   MI~       6         1553               1469.
## 3 " ADITYA BAJAJ         ~ "   MI~       6         1384               1564.
## 4 " PATRICK H SCHILLING  ~ "   MI~       5.5       1716               1574.
## 5 " HANSHI ZUO           ~ "   MI~       5.5       1655               1501.
## 6 " HANSEN SONG          ~ "   OH~       5         1686               1519.