Data 607 Project 1

Pull in data. Instead of pulling in line by line, I’m going to just pull it into one big chunk.

library(stringr)
library(dplyr)
# Read the entire file into one string; file.size() supplies the number of bytes to read
raw.data <- readChar('tournamentinfo.txt', nchars = file.size('tournamentinfo.txt'))

Get rid of the rows of dashes (`-----`) and pull each player’s data onto one line.

# Drop the original line breaks first, then turn every 89-dash separator row
# into a newline so that both rows of a player end up on the same line.
data.raw.2 <- str_replace_all(raw.data, '\n', '')
data.raw.2 <- str_replace_all(data.raw.2, '-{89}', '\n')
# raw.data is a single string, so the first (only) element holds all lines
data.lines <- strsplit(data.raw.2, '\n')[[1]]
head(data.lines)

## [1] ""                                                                                                                                                                                          
## [2] "\r Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| \r Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | \r"
## [3] "\r    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|\r   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |\r"  
## [4] "\r    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|\r   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |\r"  
## [5] "\r    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|\r   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |\r"  
## [6] "\r    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|\r   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |\r"

Remove the top line, as it’s blank. Split out some fields that are delimited in different ways and clean up the results.

# Drop the blank first element and strip the carriage returns left over from
# the Windows line endings.
data.lines <- str_replace_all(data.lines[-1], '\r', '')
# '/' and '->' separate sub-fields inside a cell; turn both into '|'
data.lines.2 <- str_replace_all(data.lines, '/|->', '|')
# Remove the padding around every delimiter, then around the line itself
data.lines.2 <- str_replace_all(data.lines.2, ' *\\| *', '|')
data.lines.2 <- str_trim(data.lines.2)
head(data.lines.2)

## [1] "Pair|Player Name|Total|Round|Round|Round|Round|Round|Round|Round|Num|USCF ID|Rtg (Pre|Post)|Pts|1|2|3|4|5|6|7|" 
## [2] "1|GARY HUA|6.0|W  39|W  21|W  18|W  14|W   7|D  12|D   4|ON|15445895|R: 1794|1817|N:2|W|B|W|B|W|B|W|"           
## [3] "2|DAKSHESH DARURI|6.0|W  63|W  58|L   4|W  17|W  16|W  20|W   7|MI|14598900|R: 1553|1663|N:2|B|W|B|W|B|W|B|"    
## [4] "3|ADITYA BAJAJ|6.0|L   8|W  61|W  25|W  21|W  11|W  13|W  12|MI|14959604|R: 1384|1640|N:2|W|B|W|B|W|B|W|"       
## [5] "4|PATRICK H SCHILLING|5.5|W  23|D  28|W   2|W  26|D   5|W  19|D   1|MI|12616049|R: 1716|1744|N:2|W|B|W|B|W|B|B|"
## [6] "5|HANSHI ZUO|5.5|W  45|W  37|D  12|D  13|D   4|W  14|W  17|MI|14601533|R: 1655|1690|N:2|B|W|B|W|B|W|B|"

Now each of the lines has its fields separated by a |. The headers don’t all make sense, but I’ll leave them for now. Let’s pull just the data we need into a data set. I’ll keep the opponents as a comma-separated list for now and pull the opponents’ ratings afterwards.

# Extract the fields needed for the final data set from one '|'-delimited
# player record (a single element of data.lines.2).
#
# Args:
#   v: a single string such as
#      "1|GARY HUA|6.0|W  39|...|ON|15445895|R: 1794|1817|N:2|W|...".
#
# Returns:
#   A character vector of length 6: id, name, state, points, pre-rating and a
#   comma separated list of opponent ids. A round without an opponent number
#   (bye, unplayed game) contributes an empty entry, so the list always has
#   seven positions. The pre-rating is NA when field 13 does not contain
#   "R: <digits>" (as is the case for the header row).
pullData <- function(v) {
  data <- strsplit(v, '|', fixed = TRUE)[[1]]
  id <- data[1]
  name <- data[2]
  state <- data[11]
  points <- data[3]

  # Field 13 looks like "R: 1794" or "R: 1641P17": keep only the digits that
  # directly follow "R:", which drops the provisional-games suffix.
  rating_field <- data[13]
  prerating <- NA_character_
  if (grepl('R:[[:space:]]*[0-9]+', rating_field)) {
    prerating <- sub('^.*R:[[:space:]]*([0-9]+).*$', '\\1', rating_field)
  }

  # Fields 4-10 are the seven rounds ("W  39", "D  12", "U", ...). Take the
  # first number of each round; rounds without one become "".
  rounds <- data[4:10]
  opponents <- ifelse(
    grepl('[0-9]', rounds),
    sub('^[^0-9]*([0-9]+).*$', '\\1', rounds),
    ''
  )
  opponentString <- paste(opponents, collapse = ',')

  c(id, name, state, points, prerating, opponentString)
}

# Apply the function over our list to create a data frame (one row per line)
data.lines.3 <- data.frame(do.call(rbind, lapply(data.lines.2, pullData)), stringsAsFactors = FALSE)

# Fix column names and remove the first line, as that is the header
colnames(data.lines.3) <- c('ID', 'Name', 'State', 'Points', 'Pre-Rating', 'OpponentList')
data.lines.3 <- data.lines.3[-1, ]

# Now that we removed the header we can fix our numeric data.
# as.is = TRUE is given explicitly: type.convert() warns when it is omitted
# (R >= 4.0.0), and it guarantees character input is never turned into a factor.
data.lines.3$ID <- type.convert(data.lines.3$ID, as.is = TRUE)
data.lines.3[['Pre-Rating']] <- type.convert(data.lines.3[['Pre-Rating']], as.is = TRUE)
# Set the row names to the ID
rownames(data.lines.3) <- data.lines.3$ID

head(data.lines.3)

##   ID                Name State Points Pre-Rating         OpponentList
## 1  1            GARY HUA    ON    6.0       1794   39,21,18,14,7,12,4
## 2  2     DAKSHESH DARURI    MI    6.0       1553   63,58,4,17,16,20,7
## 3  3        ADITYA BAJAJ    MI    6.0       1384  8,61,25,21,11,13,12
## 4  4 PATRICK H SCHILLING    MI    5.5       1716    23,28,2,26,5,19,1
## 5  5          HANSHI ZUO    MI    5.5       1655  45,37,12,13,4,14,17
## 6  6         HANSEN SONG    OH    5.0       1686 34,29,11,35,10,27,21

OK, so now I just need a function to loop through the “OpponentList” and take the average of the opponents’ pre-ratings.

# Average the pre-ratings of the players named in a comma separated id list.
#
# Args:
#   l: a single string of player ids separated by commas, e.g. "39,21,,14".
#      Empty entries (rounds without an opponent) are ignored.
#   players: data frame with an 'ID' and a 'Pre-Rating' column used for the
#      lookup. Defaults to data.lines.3, resolved when the function is called.
#
# Returns:
#   The mean pre-rating as a double. NaN when the list contains no ids, NA
#   when an id is not present in `players`.
opponentAverage <- function(l, players = data.lines.3) {
  ids <- strsplit(l, ',', fixed = TRUE)[[1]]
  ids <- as.integer(ids[ids != ""])
  # Look the ratings up by the ID column rather than by row position, so the
  # result does not depend on the rows being stored in ID order.
  ratings <- players[['Pre-Rating']][match(ids, players[['ID']])]
  mean(round(as.double(ratings), 1))
}

# Compute the average opponent pre-rating for every player (one value per row)
data.lines.3$OpponentAverage <- vapply(
  data.lines.3$OpponentList,
  opponentAverage,
  numeric(1),
  USE.NAMES = FALSE
)

# Drop ID and OpponentList, as we don't want those in our final data set.
data.final <- select(data.lines.3, -ID, -OpponentList)
head(data.final)

##                  Name State Points Pre-Rating OpponentAverage
## 1            GARY HUA    ON    6.0       1794        1605.286
## 2     DAKSHESH DARURI    MI    6.0       1553        1469.286
## 3        ADITYA BAJAJ    MI    6.0       1384        1563.571
## 4 PATRICK H SCHILLING    MI    5.5       1716        1573.571
## 5          HANSHI ZUO    MI    5.5       1655        1500.857
## 6         HANSEN SONG    OH    5.0       1686        1518.714

Looks pretty good! Let’s save our csv.

# Write the final table to disk without the row-name column
write.csv(data.final, file = 'chess_player_list.csv', row.names = FALSE)

Data 607 Project 1

Steven Ellingson

September 10, 2019