library(stringr)
library(DT)
# The tournament cross table is hosted as a plain-text file in a GitHub gist.
chess_data_url <- paste0(
  "https://gist.githubusercontent.com/trishitanath334/",
  "1ad12c228de8510e25c2edbcf1a69b89/raw/",
  "c66cbd8848d847ae035543c4140f89dd58fade3f/tournamentinfo.txt"
)
# Read the file as raw lines. readLines() performs no field, quote or comment
# parsing, so a line containing ',', a quote character or '#' is kept intact
# (read.table(sep = ",") would split or truncate such a line).
raw_chess_data <- readLines(chess_data_url)
# read.table() skipped blank lines by default; keep that behaviour.
raw_chess_data <- raw_chess_data[nzchar(trimws(raw_chess_data))]
# Drop the first four lines (not just the top row): they precede the first
# player record -- presumably the column-header block; verify against the file.
raw_chess_data <- raw_chess_data[-seq_len(4)]
head(raw_chess_data) # preview
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [5] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [6] "-----------------------------------------------------------------------------------------"
# Player IDs: one or two digits that are immediately followed by a whitespace
# character and a '|'. The lookahead keeps the " |" out of the match itself.
id_regx <- "\\d{1,2}(?=\\s\\|)"
player_IDs <- unlist(raw_chess_data)
player_IDs <- str_extract_all(player_IDs, id_regx)
player_IDs <- unlist(player_IDs)
# Player names: two or more consecutive tokens, each made of one or more
# uppercase letters followed by a single whitespace character.
# NOTE(review): a token followed by anything other than whitespace (e.g. a
# hyphen in a double-barrelled surname) ends the match, so such names would be
# cut short -- verify against the source data.
name_regx <- "([[:upper:]]+\\s){2,}"
player_names <- unlist(str_extract_all(unlist(raw_chess_data), name_regx))
# Every match ends with the whitespace that follows the last token. Store the
# trimmed result (previously it was only printed and then discarded, so the
# trailing whitespace leaked into the final table and CSV).
player_names <- str_trim(player_names, side = "right")
player_names
## [1] "GARY HUA" "DAKSHESH DARURI"
## [3] "ADITYA BAJAJ" "PATRICK H SCHILLING"
## [5] "HANSHI ZUO" "HANSEN SONG"
## [7] "GARY DEE SWATHELL" "EZEKIEL HOUGHTON"
## [9] "STEFANO LEE" "ANVIT RAO"
## [11] "CAMERON WILLIAM MC LEMAN" "KENNETH J TACK"
## [13] "TORRANCE HENRY JR" "BRADLEY SHAW"
## [15] "ZACHARY JAMES HOUGHTON" "MIKE NIKITIN"
## [17] "RONALD GRZEGORCZYK" "DAVID SUNDEEN"
## [19] "DIPANKAR ROY" "JASON ZHENG"
## [21] "DINH DANG BUI" "EUGENE L MCCLURE"
## [23] "ALAN BUI" "MICHAEL R ALDRICH"
## [25] "LOREN SCHWIEBERT" "MAX ZHU"
## [27] "GAURAV GIDWANI" "SOFIA ADINA"
## [29] "CHIEDOZIE OKORIE" "GEORGE AVERY JONES"
## [31] "RISHI SHETTY" "JOSHUA PHILIP MATHEWS"
## [33] "JADE GE" "MICHAEL JEFFERY THOMAS"
## [35] "JOSHUA DAVID LEE" "SIDDHARTH JHA"
## [37] "AMIYATOSH PWNANANDAM" "BRIAN LIU"
## [39] "JOEL R HENDON" "FOREST ZHANG"
## [41] "KYLE WILLIAM MURPHY" "JARED GE"
## [43] "ROBERT GLEN VASEY" "JUSTIN D SCHILLING"
## [45] "DEREK YAN" "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT" "DANIEL KHAIN"
## [49] "MICHAEL J MARTIN" "SHIVAM JHA"
## [51] "TEJAS AYYAGARI" "ETHAN GUO"
## [53] "JOSE C YBARRA" "LARRY HODGE"
## [55] "ALEX KONG" "MARISA RICCI"
## [57] "MICHAEL LU" "VIRAJ MOHILE"
## [59] "SEAN M MC CORMICK" "JULIA SHEN"
## [61] "JEZZEL FARKAS" "ASHWIN BALAJI"
## [63] "THOMAS JOSEPH HOSMER" "BEN LI"
# Player states: exactly two uppercase letters followed by one whitespace
# character, with a '|' right after (checked by lookahead, not captured).
state_regx <- "([[:upper:]]){2}\\s(?=\\|)"
player_states <- unlist(str_extract_all(unlist(raw_chess_data), state_regx))
# The match includes the whitespace before the '|'. Store the trimmed result
# (previously it was only printed and then discarded, so every state kept a
# trailing space in the final table and CSV).
player_states <- str_trim(player_states, side = "right")
player_states
## [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI"
## [16] "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI" "MI" "ON"
## [31] "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [46] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [61] "ON" "MI" "MI" "MI"
# Total points: a single digit, a literal '.', and another single digit
# (e.g. "6.0"). Values are kept as character strings.
points_regx <- "\\d\\.\\d"
chess_points <- unlist(raw_chess_data)
chess_points <- str_extract_all(chess_points, points_regx)
chess_points <- unlist(chess_points)
chess_points
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" "4.5" "4.5"
## [13] "4.5" "4.5" "4.5" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0"
## [25] "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5"
## [37] "3.5" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "2.5" "2.5"
## [49] "2.5" "2.5" "2.5" "2.5" "2.0" "2.0" "2.0" "2.0" "2.0" "2.0" "2.0" "1.5"
## [61] "1.5" "1.0" "1.0" "1.0"
# Pre-tournament rating. The pattern reads, left to right:
#   (?<!\>\s)         - not preceded by '>' plus a whitespace character
#   (?<=\s{1,2}|\s\:) - preceded by one or two whitespace characters, or by a
#                       whitespace character and a ':'
#   \d{3,4}(?=\s|P)   - three or four digits that are followed by whitespace or
#                       the letter 'P' (the follower is not part of the match)
pre_regx <- "(?<!\\>\\s)(?<=\\s{1,2}|\\s\\:)(\\d{3,4}(?=\\s|P))"
player_prerating <- unlist(raw_chess_data)
player_prerating <- str_extract_all(player_prerating, pre_regx)
player_prerating <- unlist(player_prerating)
# Strip any surrounding whitespace; values stay character strings here.
player_prerating <- str_trim(player_prerating)
# Opponent pair numbers, one row per player and one column per round.
#
# For every '|' in the text the regex captures what sits immediately before
# it: either a run of digits or a single blank. Blank cells become NA.
opponents_regx <- "(\\d{1,}|[[:blank:]]{1})(?=\\|)"
opponents <- unlist(str_extract_all(unlist(raw_chess_data), opponents_regx))
opponents[opponents == " "] <- NA

# Layout assumption carried over from the original stride-based indexing:
# every line that contains pipes yields exactly 10 cells, each player occupies
# two such lines, and the opponent numbers are cells 4-10 of the first line.
cells_per_line <- 10
round_cells <- 4:10
# Fail loudly instead of silently misaligning rounds if the layout differs.
stopifnot(length(opponents) %% cells_per_line == 0)

# One row per text line, one column per cell.
cell_grid <- matrix(as.numeric(opponents), ncol = cells_per_line, byrow = TRUE)
# Keep the first line of each player (odd rows) and the round cells only.
# The row count is derived from the data rather than hard-coded.
is_first_line <- seq_len(nrow(cell_grid)) %% 2 == 1
opponents <- cell_grid[is_first_line, round_cells, drop = FALSE]
# Average pre-tournament rating of each player's opponents.
# Opponent numbers are used directly as positions into player_prerating, so
# this relies on the ratings being stored in pair-number order (player 1
# first) -- NOTE(review): confirm against the source data.
# NA opponents (blank cells) index to NA and are dropped by na.rm = TRUE.
prerating_values <- as.numeric(player_prerating)
chess_avg <- vapply(
  seq_along(player_IDs),
  function(i) mean(prerating_values[opponents[i, ]], na.rm = TRUE),
  numeric(1)
)
# Assemble the per-player summary: name, state, total points, pre-rating and
# average opponent pre-rating. Column names match the source vector names.
data_table <- data.frame(
  player_names = player_names,
  player_states = player_states,
  chess_points = chess_points,
  player_prerating = player_prerating,
  chess_avg = chess_avg
)
# Show the result as an interactive, scrollable DT widget.
datatable(
  data_table,
  extensions = "Scroller",
  options = list(deferRender = TRUE, scrollY = 200, scroller = TRUE)
)
# Export the same table to a CSV file in the working directory.
write.csv(data_table, file = "ChessData.csv")