Objectives

Create a dataframe summarizing a chess tournament from the raw tournament results

Output dataframe to CSV file

Initialization and read-in

library(pander)
library(stringr)
library(htmlTable)
chess <- read.csv("https://raw.githubusercontent.com/TheFedExpress/Data/master/chess.txt", sep= "|", stringsAsFactors = FALSE, header = FALSE) #the boxes are delimited by pipes, so we make a semi-structured dataframe reading it in this way
colnames(chess) <- c("Pairnum", "playerinfo", "total", "r1", "r2", "r3", "r4", "r5", "r6", "r7")
#the playerinfo column right now contains multiple variables that will be parsed out later
chess <- chess[,1:10]
head(chess,8)

One row per player

chess_new <- chess[grepl(".+---.+",chess$Pairnum)==FALSE, 1:10]
rows <-nrow(chess_new)
chess_new$index <- 1:nrow(chess_new)
for (col in seq(1,10)){ 
  for (row in seq(2,rows,2)){ 
      chess_new[row,col] <-paste(chess_new[row-1, col], chess_new[row,col], sep = " ")
      }
  }
chess_new <- chess_new[chess_new$index%%2 == 0 & chess_new$index != 2,]
#After the loop combines two rows on the even row, we remove the unecessary odd rows
head(chess_new)
#the dataset now contains one row per player

Parse out the necessary info

#Unlist is used below to avoid the dataframe vectors being replaced by lists

chess_new[,4:10] <- unlist(lapply(chess_new[,4:10], function(x) str_extract(x, "\\d{1,2}")))
#This extracts the opponent ID for each player for each round

chess_new$name <- unlist(lapply(chess_new$playerinfo, function(x) str_extract(x, "[[:alpha:]\\s\\-]+")))
#This extracts the player name and creates a new column

chess_new$temp <- unlist(lapply(chess_new$playerinfo, function(x) str_replace_all(x, ' ', '')))
#because the structure of the rating info isn't standardized, we make it more manageable by removing spaces

chess_new$rating_start <- unlist(lapply(chess_new$temp, function(x)  str_sub (str_extract(x, "R:\\d{1,}"),3)))
#create a new column containing each player's starting rating

chess_new$rating_end <- unlist(lapply(chess_new$temp, function(x)  str_sub (str_extract(x, "->\\d{1,}"),3)))
#create a new columns with each player's ending rating

chess_new[,4:10] <- lapply(chess_new[,4:10], function(x) as.integer(x))
#We want to use the opponent IDs to retrieve the index of the player rating vector so they must be integers

chess_new[,4:10] <- lapply(chess_new[,4:10], function(x) chess_new$rating_start[x])
#Instead of looping through the player ratings vector multiple times for each player, this quickly retrieves opponents ratings

chess_new[,4:10] <- lapply(chess_new[,4:10], as.numeric)
chess_new$avg_opponent <- round(rowMeans(chess_new[,4:10], na.rm = TRUE))
#Get each player's average opponent rating by averaging the row

chess_new$state <- unlist(lapply(chess_new$Pairnum, function(x) str_extract(x, "[[:alpha:]]+")))
#Create a new variable containing the player's home state

chess_new$points <- as.numeric(unlist(lapply(chess_new$total, function(x) str_extract(x, "\\d+\\.\\d+"))))
#Finally, we extract the player's total points for the tournament

head(chess_new)

Create final dataset

chess_final <- chess_new[, c("name", "state", "points", "rating_start", "avg_opponent")]
pander(head(chess_final), caption = "Just the rownames need changing")
Just the rownames need changing
  name state points rating_start avg_opponent
6 GARY HUA ON 6 1794 1605
9 DAKSHESH DARURI MI 6 1553 1469
12 ADITYA BAJAJ MI 6 1384 1564
15 PATRICK H SCHILLING MI 5.5 1716 1574
18 HANSHI ZUO MI 5.5 1655 1501
21 HANSEN SONG OH 5 1686 1519
row.names(chess_final) <- 1:nrow(chess_final)
#After subsetting the dataset, it has retained its old rownames.  We change that here.
pander(head(chess_final), caption = "Final Dataset")
Final Dataset
name state points rating_start avg_opponent
GARY HUA ON 6 1794 1605
DAKSHESH DARURI MI 6 1553 1469
ADITYA BAJAJ MI 6 1384 1564
PATRICK H SCHILLING MI 5.5 1716 1574
HANSHI ZUO MI 5.5 1655 1501
HANSEN SONG OH 5 1686 1519
write.csv(chess_final, "player_info.csv")