Read in text file as string and import Libraries

library(stringr)
library(readr)
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
# Load the entire tournament file into one character string.
mystring <- read_file(file = "tournamentinfo.txt")

Extract Column Names and state names using regular expressions

# Header labels: the first 11 word tokens found in the file.
column_names <- unlist(str_extract_all(mystring, "\\w+"))[1:11]

# State codes appear as a run of capitals followed by one blank and a "|"
# (e.g. "ON |"). Extract every such fragment and split it on whitespace so
# the code itself is the first token of each piece.
states_2 <- str_split(
  unlist(str_extract_all(mystring, "[A-Z]+[:blank:]\\|")),
  "\\s+"
)

# Take the first token of every fragment in a single vectorised pass.
# This replaces an append() loop over a hard-coded 1:64, so the result
# follows the number of fragments actually found (and is simply empty
# when there are none).
states <- vapply(
  states_2,
  function(tokens) tokens[1],
  character(1),
  USE.NAMES = FALSE
)

column_names
##  [1] "Pair"   "Player" "Name"   "Total"  "Round"  "Round"  "Round" 
##  [8] "Round"  "Round"  "Round"  "Round"

Extract data in between the | |

# Collect, for every line, the text from its first "|" to the end of the
# line (the "$" alternative also contributes an empty end-of-input match),
# then keep exactly slots 1..400 of the result.
trial <- mystring %>%
  str_extract_all("\\|.*|$") %>%
  unlist() %>%
  .[seq_len(400)]

# Entries 3..130 are the ones treated as data; break each into its
# "|"-separated fields.
splited_trial <- trial[3:130] %>%
  str_split("\\|")

# Preview the first five parsed rows.
print(splited_trial[1:5])
## [[1]]
##  [1] ""                                 
##  [2] " GARY HUA                        "
##  [3] "6.0  "                            
##  [4] "W  39"                            
##  [5] "W  21"                            
##  [6] "W  18"                            
##  [7] "W  14"                            
##  [8] "W   7"                            
##  [9] "D  12"                            
## [10] "D   4"                            
## [11] ""                                 
## 
## [[2]]
##  [1] ""                                 
##  [2] " 15445895 / R: 1794   ->1817     "
##  [3] "N:2  "                            
##  [4] "W    "                            
##  [5] "B    "                            
##  [6] "W    "                            
##  [7] "B    "                            
##  [8] "W    "                            
##  [9] "B    "                            
## [10] "W    "                            
## [11] ""                                 
## 
## [[3]]
##  [1] ""                                 
##  [2] " DAKSHESH DARURI                 "
##  [3] "6.0  "                            
##  [4] "W  63"                            
##  [5] "W  58"                            
##  [6] "L   4"                            
##  [7] "W  17"                            
##  [8] "W  16"                            
##  [9] "W  20"                            
## [10] "W   7"                            
## [11] ""                                 
## 
## [[4]]
##  [1] ""                                 
##  [2] " 14598900 / R: 1553   ->1663     "
##  [3] "N:2  "                            
##  [4] "B    "                            
##  [5] "W    "                            
##  [6] "B    "                            
##  [7] "W    "                            
##  [8] "B    "                            
##  [9] "W    "                            
## [10] "B    "                            
## [11] ""                                 
## 
## [[5]]
##  [1] ""                                 
##  [2] " ADITYA BAJAJ                    "
##  [3] "6.0  "                            
##  [4] "L   8"                            
##  [5] "W  61"                            
##  [6] "W  25"                            
##  [7] "W  21"                            
##  [8] "W  11"                            
##  [9] "W  13"                            
## [10] "W  12"                            
## [11] ""

Create blank vectors to populate with chess names, aggregated matches played, player starting rank, and player chess points

# Re-extract the "|"-delimited tail of every line (this time keeping all
# matches) and split entries 3..130 into their "|"-separated fields.
trial <- mystring %>%
  str_extract_all("\\|.*|$") %>%
  unlist()
splited_trial <- trial[3:130] %>%
  str_split("\\|")

# Start every result vector empty; the parsing step that follows fills them.
chess_names <- NULL
matches <- NULL
start_ranks <- NULL
chess_points <- NULL

# Peek at the second field of the fourth row.
print(splited_trial[[4]][2])
## [1] " 14598900 / R: 1553   ->1663     "
# Parse the per-player fields out of `splited_trial`.
#
# Rows alternate:
#   odd rows  - field 2 is the player's name, field 3 the total points,
#               fields 4-10 one result per round (e.g. "W  39");
#   even rows - field 2 is the "<id> / R: <pre> -><post>" rating string.
#
# Results (all still character at this stage):
#   chess_names  - raw name field, one per odd row
#   chess_points - raw points field, one per odd row
#   matches      - second whitespace-separated token of each round field,
#                  7 per player in row order ("" when nothing follows the
#                  result letter)
#   start_ranks  - the digits that follow "R:" on each even row
#
# Everything is extracted in vectorised passes instead of append() calls in
# nested loops, so an empty `splited_trial` gives empty results rather than
# failing on 1:0. `splited_trial` itself is left unmodified.
row_index <- seq_along(splited_trial)
player_rows <- splited_trial[row_index %% 2 == 1]
rating_rows <- splited_trial[row_index %% 2 == 0]

# Pull the n-th field out of every row in `rows`.
nth_field <- function(rows, n) {
  vapply(rows, function(fields) fields[n], character(1), USE.NAMES = FALSE)
}

chess_names <- nth_field(player_rows, 2)
chess_points <- nth_field(player_rows, 3)

# First isolate ": <digits>", then keep only the digits.
start_ranks <- str_extract(
  str_extract(nth_field(rating_rows, 2), ":\\s+[[:digit:]]+"),
  "[[:digit:]]+"
)

# For each player, split the seven round fields on blanks and keep the
# token after the result letter.
matches <- unlist(lapply(player_rows, function(fields) {
  vapply(
    str_split(fields[4:10], "[:blank:]+"),
    function(tokens) tokens[2],
    character(1)
  )
}))

Trim and Clean data

# Names: strip the padding around each name.
chess_names <- str_trim(chess_names)

# Points and starting ratings: text -> numeric.
chess_points <- as.numeric(str_trim(chess_points))
start_ranks <- as.numeric(start_ranks)

# Opponent numbers: blank entries become NA before the numeric conversion.
matches <- as.numeric(replace(matches, which(matches == ""), NA))

Split aggreagated matches data into matches by player

# Average starting rating of each player's opponents.
#
# `matches` holds 7 opponent numbers per player (NA where a round had no
# opponent), so chunking it in runs of 7 gives one vector per player.
matches_by_player <- split(matches, ceiling(seq_along(matches) / 7))

# Translate every opponent number into that opponent's starting rating.
# An opponent number is a position in `start_ranks`, so a positional lookup
# replaces the earlier regex test against a hard-coded 1:64 table (a
# substring match, which let numbers outside the table pass through
# unchanged). match() turns NA and out-of-range numbers into NA.
temp_df <- as.data.frame(lapply(
  matches_by_player,
  function(opponents) start_ranks[match(opponents, seq_along(start_ranks))]
))

## find means of the rank of opponents
average_opp_rank <- round(colMeans(temp_df, na.rm = TRUE), 0)

Display all the vectors that were created and combine them into a dataframe

# Reduce the averages to a bare vector (drops names and other attributes).
average_opp_rank <- as.vector(average_opp_rank, mode = "any")

# Show the cleaned player names.
print(chess_names)
##  [1] "GARY HUA"                   "DAKSHESH DARURI"           
##  [3] "ADITYA BAJAJ"               "PATRICK H SCHILLING"       
##  [5] "HANSHI ZUO"                 "HANSEN SONG"               
##  [7] "GARY DEE SWATHELL"          "EZEKIEL HOUGHTON"          
##  [9] "STEFANO LEE"                "ANVIT RAO"                 
## [11] "CAMERON WILLIAM MC LEMAN"   "KENNETH J TACK"            
## [13] "TORRANCE HENRY JR"          "BRADLEY SHAW"              
## [15] "ZACHARY JAMES HOUGHTON"     "MIKE NIKITIN"              
## [17] "RONALD GRZEGORCZYK"         "DAVID SUNDEEN"             
## [19] "DIPANKAR ROY"               "JASON ZHENG"               
## [21] "DINH DANG BUI"              "EUGENE L MCCLURE"          
## [23] "ALAN BUI"                   "MICHAEL R ALDRICH"         
## [25] "LOREN SCHWIEBERT"           "MAX ZHU"                   
## [27] "GAURAV GIDWANI"             "SOFIA ADINA STANESCU-BELLU"
## [29] "CHIEDOZIE OKORIE"           "GEORGE AVERY JONES"        
## [31] "RISHI SHETTY"               "JOSHUA PHILIP MATHEWS"     
## [33] "JADE GE"                    "MICHAEL JEFFERY THOMAS"    
## [35] "JOSHUA DAVID LEE"           "SIDDHARTH JHA"             
## [37] "AMIYATOSH PWNANANDAM"       "BRIAN LIU"                 
## [39] "JOEL R HENDON"              "FOREST ZHANG"              
## [41] "KYLE WILLIAM MURPHY"        "JARED GE"                  
## [43] "ROBERT GLEN VASEY"          "JUSTIN D SCHILLING"        
## [45] "DEREK YAN"                  "JACOB ALEXANDER LAVALLEY"  
## [47] "ERIC WRIGHT"                "DANIEL KHAIN"              
## [49] "MICHAEL J MARTIN"           "SHIVAM JHA"                
## [51] "TEJAS AYYAGARI"             "ETHAN GUO"                 
## [53] "JOSE C YBARRA"              "LARRY HODGE"               
## [55] "ALEX KONG"                  "MARISA RICCI"              
## [57] "MICHAEL LU"                 "VIRAJ MOHILE"              
## [59] "SEAN M MC CORMICK"          "JULIA SHEN"                
## [61] "JEZZEL FARKAS"              "ASHWIN BALAJI"             
## [63] "THOMAS JOSEPH HOSMER"       "BEN LI"
print(start_ranks)
##  [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610
## [15] 1220 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507
## [29] 1602 1522 1494 1441 1449 1399 1438 1355  980 1423 1436 1348 1403 1332
## [43] 1283 1199 1242  377 1362 1382 1291 1056 1011  935 1393 1270 1186 1153
## [57] 1092  917  853  967  955 1530 1175 1163
print(chess_points)
##  [1] 6.0 6.0 6.0 5.5 5.5 5.0 5.0 5.0 5.0 5.0 4.5 4.5 4.5 4.5 4.5 4.0 4.0
## [18] 4.0 4.0 4.0 4.0 4.0 4.0 4.0 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5
## [35] 3.5 3.5 3.5 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.5 2.5 2.5 2.5 2.5
## [52] 2.5 2.0 2.0 2.0 2.0 2.0 2.0 2.0 1.5 1.5 1.0 1.0 1.0
print(states)
##  [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"
# Assemble the final table, one column per vector.
# Building it with tibble() keeps each vector's own type. Binding the
# vectors with cbind() first goes through a character matrix, which turns
# the points and rating columns into text.
finished_chess_rankings <- tibble(
  chess_names = chess_names,
  states = states,
  chess_points = chess_points,
  start_ranks = start_ranks,
  average_opp_rank = average_opp_rank
)

finished_chess_rankings
## # A tibble: 64 x 5
##    chess_names         states chess_points start_ranks average_opp_rank
##    <chr>               <chr>  <chr>        <chr>       <chr>           
##  1 GARY HUA            ON     6            1794        1605            
##  2 DAKSHESH DARURI     MI     6            1553        1469            
##  3 ADITYA BAJAJ        MI     6            1384        1564            
##  4 PATRICK H SCHILLING MI     5.5          1716        1574            
##  5 HANSHI ZUO          MI     5.5          1655        1501            
##  6 HANSEN SONG         OH     5            1686        1519            
##  7 GARY DEE SWATHELL   MI     5            1649        1372            
##  8 EZEKIEL HOUGHTON    MI     5            1641        1468            
##  9 STEFANO LEE         ON     5            1411        1523            
## 10 ANVIT RAO           MI     5            1365        1554            
## # ... with 54 more rows
# Save the final table to disk as CSV using write.csv()'s default settings.
write.csv(x = finished_chess_rankings, file = "finished_chess_rankings.csv")