library(stringr)
library(DT)

Load Data

# Data hosted in a text file on my GitHub.
# sep = "," keeps each text line in a single column; this presumably relies on
# the file containing no commas -- TODO confirm.
raw_chess_data <- read.table(
  url("https://gist.githubusercontent.com/trishitanath334/1ad12c228de8510e25c2edbcf1a69b89/raw/c66cbd8848d847ae035543c4140f89dd58fade3f/tournamentinfo.txt"),
  sep = ","
)

# Drop the first four rows (the header block) so the data starts at the first
# player record. A negative index is used instead of 5:nrow(...) because
# 5:nrow(...) counts backwards when fewer than five rows were read.
raw_chess_data <- raw_chess_data[-seq_len(4), ]
head(raw_chess_data) # preview
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [5] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "-----------------------------------------------------------------------------------------"

Extract Variables from data

Player IDs

# regx = one or two digits that are immediately followed by whitespace and '|'
id_regx <- "\\d{1,2}(?=\\s\\|)"
player_IDs <- unlist(
  str_extract_all(string = unlist(raw_chess_data), pattern = id_regx)
)

Player names

# regx = uppercase letters, followed by space, (at least two matches)
name_regx <- "([[:upper:]]+\\s){2,}"
player_names <- unlist(str_extract_all(unlist(raw_chess_data), name_regx))
# The pattern ends on a whitespace character, so every match carries trailing
# whitespace. Store the trimmed value (previously it was only printed) so the
# clean names are what end up in the final table.
player_names <- str_trim(player_names, side = "right")
player_names
##  [1] "GARY HUA"                 "DAKSHESH DARURI"         
##  [3] "ADITYA BAJAJ"             "PATRICK H SCHILLING"     
##  [5] "HANSHI ZUO"               "HANSEN SONG"             
##  [7] "GARY DEE SWATHELL"        "EZEKIEL HOUGHTON"        
##  [9] "STEFANO LEE"              "ANVIT RAO"               
## [11] "CAMERON WILLIAM MC LEMAN" "KENNETH J TACK"          
## [13] "TORRANCE HENRY JR"        "BRADLEY SHAW"            
## [15] "ZACHARY JAMES HOUGHTON"   "MIKE NIKITIN"            
## [17] "RONALD GRZEGORCZYK"       "DAVID SUNDEEN"           
## [19] "DIPANKAR ROY"             "JASON ZHENG"             
## [21] "DINH DANG BUI"            "EUGENE L MCCLURE"        
## [23] "ALAN BUI"                 "MICHAEL R ALDRICH"       
## [25] "LOREN SCHWIEBERT"         "MAX ZHU"                 
## [27] "GAURAV GIDWANI"           "SOFIA ADINA"             
## [29] "CHIEDOZIE OKORIE"         "GEORGE AVERY JONES"      
## [31] "RISHI SHETTY"             "JOSHUA PHILIP MATHEWS"   
## [33] "JADE GE"                  "MICHAEL JEFFERY THOMAS"  
## [35] "JOSHUA DAVID LEE"         "SIDDHARTH JHA"           
## [37] "AMIYATOSH PWNANANDAM"     "BRIAN LIU"               
## [39] "JOEL R HENDON"            "FOREST ZHANG"            
## [41] "KYLE WILLIAM MURPHY"      "JARED GE"                
## [43] "ROBERT GLEN VASEY"        "JUSTIN D SCHILLING"      
## [45] "DEREK YAN"                "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT"              "DANIEL KHAIN"            
## [49] "MICHAEL J MARTIN"         "SHIVAM JHA"              
## [51] "TEJAS AYYAGARI"           "ETHAN GUO"               
## [53] "JOSE C YBARRA"            "LARRY HODGE"             
## [55] "ALEX KONG"                "MARISA RICCI"            
## [57] "MICHAEL LU"               "VIRAJ MOHILE"            
## [59] "SEAN M MC CORMICK"        "JULIA SHEN"              
## [61] "JEZZEL FARKAS"            "ASHWIN BALAJI"           
## [63] "THOMAS JOSEPH HOSMER"     "BEN LI"
# regx = two uppercase letters, followed by space, followed by a '|'

state_regx <- "([[:upper:]]){2}\\s(?=\\|)"
player_states <- unlist(str_extract_all(unlist(raw_chess_data), state_regx))
# The match includes the whitespace character before the '|'. Store the
# trimmed value (previously it was only printed) so the final table holds the
# bare two-letter codes.
player_states <- str_trim(player_states, side = "right")
player_states
##  [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI"
## [16] "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI" "MI" "ON"
## [31] "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [46] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [61] "ON" "MI" "MI" "MI"

Total Points

# regx = a digit, a literal '.', and another digit (e.g. "6.0")

points_regx <- "\\d\\.\\d"
chess_points <- unlist(
  str_extract_all(string = unlist(raw_chess_data), pattern = points_regx)
)
chess_points
##  [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" "4.5" "4.5"
## [13] "4.5" "4.5" "4.5" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0"
## [25] "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5"
## [37] "3.5" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "2.5" "2.5"
## [49] "2.5" "2.5" "2.5" "2.5" "2.0" "2.0" "2.0" "2.0" "2.0" "2.0" "2.0" "1.5"
## [61] "1.5" "1.0" "1.0" "1.0"

Player’s Pre-Ratings

# regx = 3 or 4 digits that
#   - are not preceded by '>' plus a whitespace character,
#   - are preceded by 1 or 2 whitespace characters, or by whitespace and ':',
#   - are followed by whitespace or the letter 'P'.

pre_regx <- "(?<!\\>\\s)(?<=\\s{1,2}|\\s\\:)(\\d{3,4}(?=\\s|P))"
# Extract, flatten and trim in one step; the values stay character strings.
player_prerating <- str_trim(
  unlist(str_extract_all(unlist(raw_chess_data), pre_regx))
)

Opponents' Average Pre-Rating

# regx = whatever sits directly in front of a '|': either a run of digits (an
# opponent's pair number) or a single blank (field with no number).
opponents_regx <- "(\\d{1,}|[[:blank:]]{1})(?=\\|)"
opponents <- unlist(str_extract_all(unlist(raw_chess_data), opponents_regx))
opponents[opponents == " "] <- NA

# Each text line containing '|' contributes ten captures: three leading fields
# followed by the seven round fields. Lay the captures out one text line per
# row, keep every other row starting with the first (the line of each record
# that carries the opponent numbers), and keep columns 4 to 10 (rounds 1 to 7).
# The number of players is derived from the data instead of being hard-coded.
fields_per_line <- 10
capture_grid <- matrix(opponents, ncol = fields_per_line, byrow = TRUE)
result_rows <- seq(1, nrow(capture_grid), by = 2)

# One row per player, one column per round; NA marks a round without opponent.
opponents <- matrix(
  as.numeric(capture_grid[result_rows, 4:10, drop = FALSE]),
  ncol = 7
)

# Match the opponent to the player's id
#
# Row i of `opponents` holds the pair numbers player i faced. They are used as
# positions into `player_prerating`, which assumes the players are stored in
# pair-number order -- TODO confirm. NA entries (rounds without an opponent)
# index to NA and are dropped by na.rm = TRUE before averaging.
chess_avg <- vapply(
  seq_along(player_IDs),
  function(i) {
    mean(as.numeric(player_prerating[opponents[i, ]]), na.rm = TRUE)
  },
  numeric(1)
)

Generate Data Table

# Combine the extracted vectors into one data frame; the column names are
# taken from the variable names.
data_table <- data.frame(
  player_names,
  player_states,
  chess_points,
  player_prerating,
  chess_avg
)

# Render the data frame as a DT widget using the Scroller extension.
datatable(
  data_table,
  extensions = "Scroller",
  options = list(deferRender = TRUE, scrollY = 200, scroller = TRUE)
)

Write data to csv file

# Save the assembled table as ChessData.csv, relative to the working directory.
write.csv(
  x = data_table,
  file = "ChessData.csv"
)