library(tidyverse)
## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# import file, convert to tibble
rawData <- as_tibble(read.delim('tournamentInfo.txt', header = FALSE, sep = '|', stringsAsFactors =FALSE))
# remove rows with dashes
rawData <- rawData %>%
filter(!str_detect(V1,'-----'))
# rename columns
colnames(rawData) = c('PAIR_NUM', 'PLAYER_NAME','TOTAL_PTS','R1', 'R2', 'R3','R4','R5','R6','R7', 'REMOVE')
# filter out rows/columns that do not contain data
rawData <- rawData %>%
filter(str_detect(rawData$PAIR_NUM,'[A-Z]{2}|[[:digit:]]')) %>%
select(-'REMOVE')
In order to make sure that we can aggregate the data in the proper way, we will need to alter the records so that each row contains all of the information for 1 player. To do this, we will:
# create 2 new columns
rawData$STATE <- 1:nrow(rawData)
rawData$PRE_RATING <- 1:nrow(rawData)
# update STATE column
rawData[str_detect(rawData$PAIR_NUM,'[[:digit:]]'),11] <- rawData[str_detect(rawData$PAIR_NUM,'[A-Z]{2}'),1]
# update PRE_RATING column
rawData[str_detect(rawData$PAIR_NUM,'[[:digit:]]'),12] <- rawData[str_detect(rawData$PAIR_NUM,'[A-Z]{2}'),2]
# remove columns without player number in first column
rawData <- rawData %>%
filter(str_detect(rawData$PAIR_NUM, '[[:digit:]]'))
Now that the data frame has 1 record per player, we need to extract out the pre-rating score and the opponents in each round.
# re-define pre-game rating
rawData$PRE_RATING <- str_extract(str_extract(rawData$PRE_RATING,'R\\:[[:space:]]+[[:digit:]]{1,5}'),'[[:digit:]]+')
# replace values in each of the rounds with just the indiviual that the player faced
cols <- c('R1','R2','R3','R4','R5','R6','R7')
for (c in cols){
rawData[[c]] <- str_extract(rawData[[c]],'[[:digit:]]+')
}
Before compiling all of the opponent scores, we will need to make sure that all of our columns containing numeric values are converted from strings.
newCols <- c('R1','R2','R3','R4','R5','R6','R7', 'PRE_RATING', 'TOTAL_PTS')
rawData[newCols] <- sapply(rawData[newCols],as.numeric)
Now that the data is numeric, we can start to compile the scores of the opponents. We will first create 2 empty lists - one that will end up storing the summed scores of all the opponents and one that will store the total number of opponents per player.
summaryList <- matrix(0, nrow(rawData), 1)
numOpponents <- matrix(0, nrow(rawData), 1)
Finally, we’ll create a loop that iterates through each of the player’s opponents and operations in the following manner:
for (c in cols){
toAdd <- rawData[rawData[[c]],12]
opponentList <- matrix(0,nrow(rawData),1)
opponentList[!is.na(toAdd)] <- 1
toAdd[is.na(toAdd)] <- 0
summaryList <- summaryList + toAdd
numOpponents <- numOpponents + opponentList
}
Now we will calculate the average pre-chess rating of each of the opponents.
rawData$AVG_OPPONENT_RATING <- summaryList$PRE_RATING / numOpponents
Our final data set will include Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents and will be saved as a csv.
finalCols <- c('PLAYER_NAME', 'STATE', 'TOTAL_PTS', 'PRE_RATING', 'AVG_OPPONENT_RATING')
finalData <- rawData[,finalCols]
write.csv(finalData, 'finalData.csv')
head(finalData)
## # A tibble: 6 x 5
## PLAYER_NAME STATE TOTAL_PTS PRE_RATING AVG_OPPONENT_RATIN~
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 " GARY HUA ~ " ON~ 6 1794 1605.
## 2 " DAKSHESH DARURI ~ " MI~ 6 1553 1469.
## 3 " ADITYA BAJAJ ~ " MI~ 6 1384 1564.
## 4 " PATRICK H SCHILLING ~ " MI~ 5.5 1716 1574.
## 5 " HANSHI ZUO ~ " MI~ 5.5 1655 1501.
## 6 " HANSEN SONG ~ " OH~ 5 1686 1519.