Pull in data. Instead of pulling in line by line, I’m going to just pull it into one big chunk.
library(stringr)
library(dplyr)
# Read the entire file into one string; the byte size of the file is used as
# the number of characters to request.
raw.data <- readChar(
  con = "tournamentinfo.txt",
  nchars = file.size("tournamentinfo.txt")
)
Get rid of the separator rows of dashes (`-----`) and pull each player's data onto one line.
# Remove every newline first, then turn each 89-dash separator row into a
# newline, so the two physical lines of a record end up on one logical line.
data.raw.2 <- str_replace_all(raw.data, "\n", "")
data.raw.2 <- str_replace_all(data.raw.2, "-{89}", "\n")
# One element per record.
data.lines <- unlist(strsplit(data.raw.2, "\n"))
head(data.lines)
## [1] ""
## [2] "\r Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| \r Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | \r"
## [3] "\r 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|\r ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |\r"
## [4] "\r 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|\r MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |\r"
## [5] "\r 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|\r MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |\r"
## [6] "\r 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|\r MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |\r"
Remove the top line as it's blank. Split out some fields that are delimited in different ways and clean up the results.
# Drop the leading blank element and strip the carriage returns.
data.lines <- str_replace_all(data.lines[-1], "\r", "")
# Turn the two extra delimiters ("/" and "->") into "|"; a named vector makes
# str_replace_all() apply the replacements one after the other.
data.lines.2 <- str_replace_all(data.lines, c("/" = "|", "->" = "|"))
# Remove the padding around every "|" and trim the ends of each record.
data.lines.2 <- str_replace_all(data.lines.2, " *\\| *", "|")
data.lines.2 <- str_trim(data.lines.2)
head(data.lines.2)
## [1] "Pair|Player Name|Total|Round|Round|Round|Round|Round|Round|Round|Num|USCF ID|Rtg (Pre|Post)|Pts|1|2|3|4|5|6|7|"
## [2] "1|GARY HUA|6.0|W 39|W 21|W 18|W 14|W 7|D 12|D 4|ON|15445895|R: 1794|1817|N:2|W|B|W|B|W|B|W|"
## [3] "2|DAKSHESH DARURI|6.0|W 63|W 58|L 4|W 17|W 16|W 20|W 7|MI|14598900|R: 1553|1663|N:2|B|W|B|W|B|W|B|"
## [4] "3|ADITYA BAJAJ|6.0|L 8|W 61|W 25|W 21|W 11|W 13|W 12|MI|14959604|R: 1384|1640|N:2|W|B|W|B|W|B|W|"
## [5] "4|PATRICK H SCHILLING|5.5|W 23|D 28|W 2|W 26|D 5|W 19|D 1|MI|12616049|R: 1716|1744|N:2|W|B|W|B|W|B|B|"
## [6] "5|HANSHI ZUO|5.5|W 45|W 37|D 12|D 13|D 4|W 14|W 17|MI|14601533|R: 1655|1690|N:2|B|W|B|W|B|W|B|"
Now each line has its fields separated by a |. The headers don't all make sense, but I'll leave them for now. Let's pull just the data we need into a dataset. I'll keep the opponents as a comma-separated list for now and look up the opponents' ratings afterwards.
# Extract the fields needed for the final data set from one "|"-delimited
# record.
#
# Args:
#   v: A single record string whose fields are separated by "|". Positions
#      read: 1 = pair number, 2 = player name, 3 = total points,
#      4-10 = round results, 11 = state, 13 = pre-rating (e.g. "R: 1794").
#
# Returns:
#   A character vector of length 6: id, name, state, points, pre-rating and a
#   comma-separated list of opponent ids. The pre-rating is the first run of
#   digits in field 13 (so a suffix such as "P17" is dropped), or NA when the
#   field holds no digits. Each of the seven rounds contributes one slot to
#   the opponent list; rounds without an opponent number leave it empty.
pullData <- function(v) {
  # First run of digits in each element of x, or NA where there is none.
  firstNumber <- function(x) {
    pos <- regexpr("[0-9]+", x)
    found <- !is.na(pos) & pos > 0
    out <- rep(NA_character_, length(x))
    out[found] <- regmatches(x, pos)
    out
  }

  fields <- unlist(strsplit(v, "|", fixed = TRUE))

  # Extracting directly (instead of sapply() over str_extract_all()) keeps the
  # result a plain character vector even when a round has no opponent.
  opponents <- firstNumber(fields[4:10])
  opponents[is.na(opponents)] <- ""

  c(
    fields[1],                       # id
    fields[2],                       # name
    fields[11],                      # state
    fields[3],                       # points
    firstNumber(fields[13]),         # pre-rating
    paste(opponents, collapse = ",") # opponent list
  )
}
# Apply pullData() to every record and stack the results into a data frame
# (one row per record; every column is character at this point).
data.lines.3 <- data.frame(
  do.call(rbind, lapply(data.lines.2, pullData)),
  stringsAsFactors = FALSE
)
# Fix the column names and remove the first row, which was built from the
# header record.
colnames(data.lines.3) <- c(
  "ID", "Name", "State", "Points", "Pre-Rating", "OpponentList"
)
data.lines.3 <- data.lines.3[-1, ]
# With the header gone, ID and Pre-Rating are expected to hold only digits.
# Convert them explicitly with as.integer(): type.convert() without `as.is`
# warns on R >= 4.0. Points stays a character column, so values such as "6.0"
# are kept verbatim.
data.lines.3$ID <- as.integer(data.lines.3$ID)
data.lines.3[["Pre-Rating"]] <- as.integer(data.lines.3[["Pre-Rating"]])
# Set the row names to the ID
rownames(data.lines.3) <- data.lines.3$ID
head(data.lines.3)
## ID Name State Points Pre-Rating OpponentList
## 1 1 GARY HUA ON 6.0 1794 39,21,18,14,7,12,4
## 2 2 DAKSHESH DARURI MI 6.0 1553 63,58,4,17,16,20,7
## 3 3 ADITYA BAJAJ MI 6.0 1384 8,61,25,21,11,13,12
## 4 4 PATRICK H SCHILLING MI 5.5 1716 23,28,2,26,5,19,1
## 5 5 HANSHI ZUO MI 5.5 1655 45,37,12,13,4,14,17
## 6 6 HANSEN SONG OH 5.0 1686 34,29,11,35,10,27,21
OK, so now I just need a function to loop through the “OpponentList” and take an average of the opponent’s prerating
# Average the pre-ratings of the players named in a comma-separated id list.
#
# Args:
#   l:       A string of player ids separated by commas, e.g. "39,21,,14".
#            Empty entries are ignored.
#   players: Data frame with an `ID` column and a `Pre-Rating` column.
#            Defaults to the script-level `data.lines.3`, so existing
#            one-argument calls keep working.
#
# Returns:
#   The mean pre-rating of the listed opponents as a double. NA when the list
#   names no opponents, or when any listed id is not found in `players`.
opponentAverage <- function(l, players = data.lines.3) {
  ids <- unlist(strsplit(l, ",", fixed = TRUE))
  ids <- ids[!is.na(ids) & ids != ""]
  if (length(ids) == 0) {
    return(NA_real_)
  }
  # Look players up by their ID value rather than by row position, so the
  # result does not depend on the rows being sorted by ID.
  rows <- match(as.integer(ids), players[["ID"]])
  mean(players[rows, "Pre-Rating"])
}
# Compute the average opponent pre-rating for every player.
data.lines.3$OpponentAverage <- vapply(
  data.lines.3$OpponentList,
  opponentAverage,
  numeric(1),
  USE.NAMES = FALSE
)
# Drop ID and OpponentList, which are not wanted in the final data set.
data.final <- select(data.lines.3, -ID, -OpponentList)
head(data.final)
## Name State Points Pre-Rating OpponentAverage
## 1 GARY HUA ON 6.0 1794 1605.286
## 2 DAKSHESH DARURI MI 6.0 1553 1469.286
## 3 ADITYA BAJAJ MI 6.0 1384 1563.571
## 4 PATRICK H SCHILLING MI 5.5 1716 1573.571
## 5 HANSHI ZUO MI 5.5 1655 1500.857
## 6 HANSEN SONG OH 5.0 1686 1518.714
Looks pretty good! Let’s save our csv.
# Write the final data set to disk without the row-name column.
write.csv(
  x = data.final,
  file = "chess_player_list.csv",
  row.names = FALSE
)