#These are the packages used, placed in a fake code environment because gdata complains about perl not being in my PATH file and that takes up a full screen worth of space
library(stringr)
library(gdata)
library(dplyr)
library(magrittr)
First, load the tournament results text file into a character vector, one element per line of the file.
# readLines() warns "incomplete final line found" when the file does not end
# with a newline character. The warning is harmless here: the last line is
# still read and returned along with all the others.
results <- readLines(con = "tournamentinfo.txt")
## Warning in readLines("tournamentinfo.txt"): incomplete final line found on
## 'tournamentinfo.txt'
Next, parse the top and bottom lines of each competitor's entry. Parsing starts at line 5 because the lines before it contain no useful data.
# Build player.raw: one row per competitor, parsed from the raw text lines in
# `results`. Rows are written at the index of the text line they came from, so
# the rows in between stay NA and are removed in a later cleanup step.

# Seed a one-row, 12-column data frame (holding the values 1..12) so that rows
# can be assigned by index inside the loop. This placeholder row is not player
# data and has to be dropped afterwards.
player.raw<-t(c(1:12))
# Convert to a data frame; without this R kept treating player.raw as a vector.
player.raw<-as.data.frame(player.raw)
# Matches one or more digits, optionally followed by a single decimal point and
# further digits.
numbfind<-"[0-9]+\\.{0,1}[0-9]*"
# Matches the first run of capital letters. Only used on the second line of
# each entry to pick up the state; names are taken from the |-delimited split
# instead. "[A-Z]{2}" would probably also work.
charfind<-"[A-Z]+"
names(player.raw)=c("ID","Name","Points","op1","op2","op3","op4","op5","op6","op7","State","Rank")
# Entries repeat every three text lines starting at line 5: a first data line
# (i mod 3 == 2), a second data line (i mod 3 == 0), and a third line that is
# ignored. mod() is not base R; presumably it is magrittr's alias for %%.
for(i in 5:length(results)){
if(mod(i,3)==2){
# First line of an entry: extract the first number from every |-delimited
# field, then fill in the name separately. Columns 11 and 12 (State, Rank) are
# overwritten on the next iteration.
# Split the line on "|" so each field can be handled on its own.
topsplit<-str_split(results[i],"\\|")
# Fields without a number come back as NA. rbind(..., NA) pads the extracted
# values with one extra NA so the assignment fills all 12 columns; without it
# the assignment raised an error.
player.raw[i,]<-rbind(unlist(sapply(topsplit, FUN = function(x) str_match(x,numbfind))),NA)
# The name is the second field; it has to be set explicitly because the line
# above only keeps numbers. trim() presumably comes from gdata and strips the
# surrounding whitespace.
player.raw[i,2]<-trim(unlist(topsplit)[2])
}else if(mod(i,3)==0){
# Second line of an entry: its values belong to the row written on the
# previous iteration, hence the i-1 row index throughout this branch.
# State: the first run of capital letters on the line.
player.raw[i-1,11]<-str_match(results[i],charfind)
# Pre-tournament rank: the second number found on the line. The line is split
# on "P" first so that a provisional rating written as xPy yields x and y as
# separate numbers and only x is picked up as the rating.
player.raw[i-1,12]<-as.numeric(unlist(str_match_all(unlist(str_split(results[i],"P")),numbfind)))[2]
}
# No else branch: the remaining line of each group of three is not needed.
}
Now for some cleanup: player.raw has 195 rows, and most of them contain nothing but NA.
# player.raw contains many rows that the parsing loop never filled; those rows
# have NA in the ID column. Keep only the rows whose ID is present.
player.rev <- player.raw[!is.na(player.raw[, 1]), ]
# The first remaining row is the 1..12 initialization row, not a player, so
# drop it. Negative indexing is used instead of 2:nrow(player.rev) because
# 2:1 counts backwards: with only the initialization row left it would select
# a nonexistent row 2 (all NA) plus row 1, instead of returning an empty table.
player.rev <- player.rev[-1, ]
Now creating two functions to replace the opponent’s number with the opponent’s rating.
# Look up the rating of a single player.
#   x  - row index of the player in df (the player number)
#   df - the player data frame; column 12 holds the rating
# Returns the value in column 12 of row x, converted to numeric.
rankreplacer <- function(x, df) {
  as.numeric(df[x, 12])
}
#well, that feels unnecessary, although this allows for a sapply
# Translate one player's opponent numbers into the opponents' ratings.
#   y  - row index of the player in df
#   df - the player data frame; columns 4 to 10 hold the opponent numbers
# Returns a numeric vector of ratings, one per opponent column.
rowrankreplacer <- function(y, df) {
  # The opponent numbers are stored as strings (they originate from a string
  # split), so convert them before using them as row indices.
  opponent_ids <- as.numeric(df[y, 4:10])
  ratings <- lapply(opponent_ids, function(id) rankreplacer(id, df))
  # The second as.numeric() guards against the values reverting to strings.
  as.numeric(unlist(ratings))
}
I was unable to get sapply, vapply or apply to run rowrankreplacer on each row in turn, so there is one more loop:
# Work on a copy so that every lookup reads the original opponent numbers and
# ratings from player.rev while player.finalrev is being overwritten.
player.finalrev <- player.rev
# Replace each player's opponent numbers (columns 4 to 10) with the opponents'
# ratings, one row at a time. The loop runs over the actual number of rows
# rather than a hard-coded 64, so it still works when the number of players
# in the input file changes.
for (i in seq_len(nrow(player.rev))) {
  player.finalrev[i, 4:10] <- rowrankreplacer(i, player.rev)
}
Now create a final column holding the average of each player's opponents' ratings.
# Add aveop: the mean of each player's opponent ratings, skipping missing
# opponents (na.rm = TRUE). The op columns may still be stored as strings,
# hence the as.numeric(). rowwise() makes mean() run once per player;
# ungroup() then removes the row-wise grouping so that it does not silently
# carry over into later operations on player.finalrev.
player.finalrev %<>%
  rowwise() %>%
  mutate(
    aveop = mean(
      as.numeric(c(op1, op2, op3, op4, op5, op6, op7)),
      na.rm = TRUE
    )
  ) %>%
  ungroup()
Finally, creating the table that is written to the csv and writing it to the csv.
# Assemble the export table by picking the five required columns in output
# order: name (2), state (11), points (3), pre-tournament rating (12) and
# average opponent rating (13).
player.table <- player.finalrev[, c(2, 11, 3, 12, 13)]
names(player.table) <- c(
  "Player Name", "State", "Score", "Pre-Rating", "Average Opponent Rating"
)
# Write the table to a csv file in the current working directory.
write.csv(player.table, "Project 1 Table.csv")
Finally, in case the reader doesn’t want to run all the code, the table:
# Display the final table; a bare object name at top level auto-prints it.
player.table
## # A tibble: 64 x 5
## `Player Name` State Score `Pre-Rating` `Average Opponent Rating`
## <chr> <chr> <chr> <chr> <dbl>
## 1 GARY HUA ON 6.0 1794 1605.
## 2 DAKSHESH DARURI MI 6.0 1553 1469.
## 3 ADITYA BAJAJ MI 6.0 1384 1564.
## 4 PATRICK H SCHILLING MI 5.5 1716 1574.
## 5 HANSHI ZUO MI 5.5 1655 1501.
## 6 HANSEN SONG OH 5.0 1686 1519.
## 7 GARY DEE SWATHELL MI 5.0 1649 1372.
## 8 EZEKIEL HOUGHTON MI 5.0 1641 1468.
## 9 STEFANO LEE ON 5.0 1411 1523.
## 10 ANVIT RAO MI 5.0 1365 1554.
## # ... with 54 more rows