Create a dataframe summarizing a chess tournament from the raw tournament results
Output dataframe to CSV file
library(pander)
library(stringr)
library(htmlTable)
chess <- read.csv("https://raw.githubusercontent.com/TheFedExpress/Data/master/chess.txt", sep= "|", stringsAsFactors = FALSE, header = FALSE) #the boxes are delimited by pipes, so we make a semi-structured dataframe reading it in this way
colnames(chess) <- c("Pairnum", "playerinfo", "total", "r1", "r2", "r3", "r4", "r5", "r6", "r7")
#the playerinfo column right now contains multiple variables that will be parsed out later
chess <- chess[,1:10]
head(chess,8)
chess_new <- chess[grepl(".+---.+",chess$Pairnum)==FALSE, 1:10]
rows <-nrow(chess_new)
chess_new$index <- 1:nrow(chess_new)
for (col in seq(1,10)){
for (row in seq(2,rows,2)){
chess_new[row,col] <-paste(chess_new[row-1, col], chess_new[row,col], sep = " ")
}
}
chess_new <- chess_new[chess_new$index%%2 == 0 & chess_new$index != 2,]
#After the loop combines two rows on the even row, we remove the unecessary odd rows
head(chess_new)
#the dataset now contains one row per player
#Unlist is used below to avoid the dataframe vectors being replaced by lists
chess_new[,4:10] <- unlist(lapply(chess_new[,4:10], function(x) str_extract(x, "\\d{1,2}")))
#This extracts the opponent ID for each player for each round
chess_new$name <- unlist(lapply(chess_new$playerinfo, function(x) str_extract(x, "[[:alpha:]\\s\\-]+")))
#This extracts the player name and creates a new column
chess_new$temp <- unlist(lapply(chess_new$playerinfo, function(x) str_replace_all(x, ' ', '')))
#because the structure of the rating info isn't standardized, we make it more manageable by removing spaces
chess_new$rating_start <- unlist(lapply(chess_new$temp, function(x) str_sub (str_extract(x, "R:\\d{1,}"),3)))
#create a new column containing each player's starting rating
chess_new$rating_end <- unlist(lapply(chess_new$temp, function(x) str_sub (str_extract(x, "->\\d{1,}"),3)))
#create a new columns with each player's ending rating
chess_new[,4:10] <- lapply(chess_new[,4:10], function(x) as.integer(x))
#We want to use the opponent IDs to retrieve the index of the player rating vector so they must be integers
chess_new[,4:10] <- lapply(chess_new[,4:10], function(x) chess_new$rating_start[x])
#Instead of looping through the player ratings vector multiple times for each player, this quickly retrieves opponents ratings
chess_new[,4:10] <- lapply(chess_new[,4:10], as.numeric)
chess_new$avg_opponent <- round(rowMeans(chess_new[,4:10], na.rm = TRUE))
#Get each player's average opponent rating by averaging the row
chess_new$state <- unlist(lapply(chess_new$Pairnum, function(x) str_extract(x, "[[:alpha:]]+")))
#Create a new variable containing the player's home state
chess_new$points <- as.numeric(unlist(lapply(chess_new$total, function(x) str_extract(x, "\\d+\\.\\d+"))))
#Finally, we extract the player's total points for the tournament
head(chess_new)
chess_final <- chess_new[, c("name", "state", "points", "rating_start", "avg_opponent")]
pander(head(chess_final), caption = "Just the rownames need changing")
| name | state | points | rating_start | avg_opponent | |
|---|---|---|---|---|---|
| 6 | GARY HUA | ON | 6 | 1794 | 1605 |
| 9 | DAKSHESH DARURI | MI | 6 | 1553 | 1469 |
| 12 | ADITYA BAJAJ | MI | 6 | 1384 | 1564 |
| 15 | PATRICK H SCHILLING | MI | 5.5 | 1716 | 1574 |
| 18 | HANSHI ZUO | MI | 5.5 | 1655 | 1501 |
| 21 | HANSEN SONG | OH | 5 | 1686 | 1519 |
row.names(chess_final) <- 1:nrow(chess_final)
#After subsetting the dataset, it has retained its old rownames. We change that here.
pander(head(chess_final), caption = "Final Dataset")
| name | state | points | rating_start | avg_opponent |
|---|---|---|---|---|
| GARY HUA | ON | 6 | 1794 | 1605 |
| DAKSHESH DARURI | MI | 6 | 1553 | 1469 |
| ADITYA BAJAJ | MI | 6 | 1384 | 1564 |
| PATRICK H SCHILLING | MI | 5.5 | 1716 | 1574 |
| HANSHI ZUO | MI | 5.5 | 1655 | 1501 |
| HANSEN SONG | OH | 5 | 1686 | 1519 |
write.csv(chess_final, "player_info.csv")