Import the libraries and read in the text file as a single string

library(stringr)
library(readr)
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
# Read the whole tournament report into one string so that the regular
# expressions used below can be run against the full text at once.
mystring <- read_file("tournamentinfo.txt")

Extract Column Names and state names using regular expressions

# Header tokens: the first 11 runs of word characters in the report. Note that
# "\\w+" splits the two-word header "Player Name" into two separate tokens.
column_names <- unlist(str_extract_all(mystring, "\\w+"))[1:11]

# A state code is matched as a run of capital letters followed by exactly one
# blank and a "|". Each match is then split on whitespace so that its first
# piece is the bare code.
states_2 <- str_split(
  unlist(str_extract_all(mystring, "[A-Z]+[:blank:]\\|")),
  "\\s+"
)

# Take the first piece of every match in a single pass. The result sizes itself
# from the data instead of assuming exactly 64 players, and no vector is grown
# inside a loop.
states <- vapply(states_2, function(pieces) pieces[1], character(1))
states
##  [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"
# Print the header tokens extracted from the top of the report.
column_names
##  [1] "Pair"   "Player" "Name"   "Total"  "Round"  "Round"  "Round" 
##  [8] "Round"  "Round"  "Round"  "Round"

Extract the data between the `|` delimiters

# The pattern is an alternation: either a "|" followed by the rest of its line,
# or the end-of-input anchor. The single input string yields a one-element
# list, so its first element is the full vector of matches.
trial <- str_extract_all(mystring, "\\|.*|$")[[1]]
# Elements 3 through 130 are split into their "|"-delimited fields.
splited_trial <- str_split(trial[seq(3, 130)], "\\|")
# Preview the first five split rows.
splited_trial[seq_len(5)]
## [[1]]
##  [1] ""                                 
##  [2] " GARY HUA                        "
##  [3] "6.0  "                            
##  [4] "W  39"                            
##  [5] "W  21"                            
##  [6] "W  18"                            
##  [7] "W  14"                            
##  [8] "W   7"                            
##  [9] "D  12"                            
## [10] "D   4"                            
## [11] ""                                 
## 
## [[2]]
##  [1] ""                                 
##  [2] " 15445895 / R: 1794   ->1817     "
##  [3] "N:2  "                            
##  [4] "W    "                            
##  [5] "B    "                            
##  [6] "W    "                            
##  [7] "B    "                            
##  [8] "W    "                            
##  [9] "B    "                            
## [10] "W    "                            
## [11] ""                                 
## 
## [[3]]
##  [1] ""                                 
##  [2] " DAKSHESH DARURI                 "
##  [3] "6.0  "                            
##  [4] "W  63"                            
##  [5] "W  58"                            
##  [6] "L   4"                            
##  [7] "W  17"                            
##  [8] "W  16"                            
##  [9] "W  20"                            
## [10] "W   7"                            
## [11] ""                                 
## 
## [[4]]
##  [1] ""                                 
##  [2] " 14598900 / R: 1553   ->1663     "
##  [3] "N:2  "                            
##  [4] "B    "                            
##  [5] "W    "                            
##  [6] "B    "                            
##  [7] "W    "                            
##  [8] "B    "                            
##  [9] "W    "                            
## [10] "B    "                            
## [11] ""                                 
## 
## [[5]]
##  [1] ""                                 
##  [2] " ADITYA BAJAJ                    "
##  [3] "6.0  "                            
##  [4] "L   8"                            
##  [5] "W  61"                            
##  [6] "W  25"                            
##  [7] "W  21"                            
##  [8] "W  11"                            
##  [9] "W  13"                            
## [10] "W  12"                            
## [11] ""

Create blank vectors to populate with chess names, aggregated matches played, player starting rank, and player chess points

# Re-derive the split rows so this section can be run on its own.
trial <- unlist(str_extract_all(mystring, "\\|.*|$"))
splited_trial <- str_split(trial[3:130], "\\|")
splited_trial[[4]][2]
## [1] " 14598900 / R: 1553   ->1663     "

# Rows come in pairs: odd rows hold the player name, total points and the
# per-round results; even rows hold the id / rating text. The extraction below
# is vectorised, so nothing is grown element by element and `splited_trial`
# itself is left unmodified.
stopifnot(length(splited_trial) %% 2 == 0)
row_index <- seq_along(splited_trial)
name_rows <- splited_trial[row_index %% 2 == 1]
rating_rows <- splited_trial[row_index %% 2 == 0]

# Field 2 of a name row is the (still padded) name, field 3 the total points.
chess_names <- vapply(name_rows, function(fields) fields[2], character(1))
chess_points <- vapply(name_rows, function(fields) fields[3], character(1))

# Starting rating: the digits that follow ":" in field 2 of a rating row.
# Splitting e.g. ": 1794" on whitespace gives ":" and "1794"; keep the second
# piece. Rows without a match yield NA.
rating_fields <- vapply(rating_rows, function(fields) fields[2], character(1))
start_ranks <- vapply(
  str_split(str_extract(rating_fields, ":\\s+[[:digit:]]+"), "\\s+"),
  function(pieces) pieces[2],
  character(1)
)

# Opponent numbers: fields 4 to 10 of a name row are the seven round results
# (e.g. "W  39"). The piece after the blanks is the opponent's pair number; it
# is "" when the round has no opponent. Results are concatenated player by
# player, seven entries each.
matches <- unlist(lapply(name_rows, function(fields) {
  vapply(
    str_split(fields[4:10], "[:blank:]+"),
    function(pieces) pieces[2],
    character(1)
  )
}))

Alternate example of how I could have used vectors and lapply instead of for loops.

# Rows alternate name row / rating row, so a recycled logical mask selects the
# odd and even positions over the whole list. The previous hard-coded upper
# bound of 127 stopped one short of the final (128th) row and therefore dropped
# the last player's rating.
odds <- splited_trial[c(TRUE, FALSE)]
evens <- splited_trial[c(FALSE, TRUE)]
# Second field of every rating row, as a plain character vector.
vector_ranks <- vapply(evens, `[[`, character(1), 2)
# ": 1794" splits into ":" and "1794", so the flattened result alternates
# separator / rating.
vector_ranks <- unlist(
  str_split(str_extract(vector_ranks, ":\\s+[[:digit:]]+"), "\\s+")
)
# Keep every second element, i.e. the ratings.
vector_ranks[c(FALSE, TRUE)]
##  [1] "1794" "1553" "1384" "1716" "1655" "1686" "1649" "1641" "1411" "1365"
## [11] "1712" "1663" "1666" "1610" "1220" "1604" "1629" "1600" "1564" "1595"
## [21] "1563" "1555" "1363" "1229" "1745" "1579" "1552" "1507" "1602" "1522"
## [31] "1494" "1441" "1449" "1399" "1438" "1355" "980"  "1423" "1436" "1348"
## [41] "1403" "1332" "1283" "1199" "1242" "377"  "1362" "1382" "1291" "1056"
## [51] "1011" "935"  "1393" "1270" "1186" "1153" "1092" "917"  "853"  "967" 
## [61] "955"  "1530" "1175" "1163"

Trim and Clean data

# Names: strip the surrounding whitespace.
chess_names <- str_trim(chess_names)
# Points and starting ratings: convert the text values to numbers.
chess_points <- as.numeric(str_trim(chess_points))
start_ranks <- as.numeric(start_ranks)
# Opponent numbers: make empty entries explicit NAs, then convert to numbers.
matches[matches %in% ""] <- NA
matches <- as.numeric(matches)

Split the aggregated matches data into matches by player

# `matches` holds 7 consecutive round entries per player, and an opponent's
# pair number is also that opponent's position in `start_ranks`. Indexing
# `start_ranks` by `matches` therefore looks up every opponent's starting
# rating directly (NA entries stay NA). This replaces the earlier regex/ifelse
# lookup against a hard-coded 1:64 id column.
rounds_per_player <- 7
stopifnot(length(matches) == rounds_per_player * length(start_ranks))

# One column per player, one row per round.
opponent_ranks <- matrix(start_ranks[matches], nrow = rounds_per_player)

## find means of the rank of opponents
# Rounds without an opponent are NA and are left out of the mean.
average_opp_rank <- round(colMeans(opponent_ranks, na.rm = TRUE), 0)

Display all the vectors that were created and combine them into a dataframe

# Reduce the averages to a plain vector, then print the player names.
average_opp_rank <- as.vector(average_opp_rank)
chess_names
##  [1] "GARY HUA"                   "DAKSHESH DARURI"           
##  [3] "ADITYA BAJAJ"               "PATRICK H SCHILLING"       
##  [5] "HANSHI ZUO"                 "HANSEN SONG"               
##  [7] "GARY DEE SWATHELL"          "EZEKIEL HOUGHTON"          
##  [9] "STEFANO LEE"                "ANVIT RAO"                 
## [11] "CAMERON WILLIAM MC LEMAN"   "KENNETH J TACK"            
## [13] "TORRANCE HENRY JR"          "BRADLEY SHAW"              
## [15] "ZACHARY JAMES HOUGHTON"     "MIKE NIKITIN"              
## [17] "RONALD GRZEGORCZYK"         "DAVID SUNDEEN"             
## [19] "DIPANKAR ROY"               "JASON ZHENG"               
## [21] "DINH DANG BUI"              "EUGENE L MCCLURE"          
## [23] "ALAN BUI"                   "MICHAEL R ALDRICH"         
## [25] "LOREN SCHWIEBERT"           "MAX ZHU"                   
## [27] "GAURAV GIDWANI"             "SOFIA ADINA STANESCU-BELLU"
## [29] "CHIEDOZIE OKORIE"           "GEORGE AVERY JONES"        
## [31] "RISHI SHETTY"               "JOSHUA PHILIP MATHEWS"     
## [33] "JADE GE"                    "MICHAEL JEFFERY THOMAS"    
## [35] "JOSHUA DAVID LEE"           "SIDDHARTH JHA"             
## [37] "AMIYATOSH PWNANANDAM"       "BRIAN LIU"                 
## [39] "JOEL R HENDON"              "FOREST ZHANG"              
## [41] "KYLE WILLIAM MURPHY"        "JARED GE"                  
## [43] "ROBERT GLEN VASEY"          "JUSTIN D SCHILLING"        
## [45] "DEREK YAN"                  "JACOB ALEXANDER LAVALLEY"  
## [47] "ERIC WRIGHT"                "DANIEL KHAIN"              
## [49] "MICHAEL J MARTIN"           "SHIVAM JHA"                
## [51] "TEJAS AYYAGARI"             "ETHAN GUO"                 
## [53] "JOSE C YBARRA"              "LARRY HODGE"               
## [55] "ALEX KONG"                  "MARISA RICCI"              
## [57] "MICHAEL LU"                 "VIRAJ MOHILE"              
## [59] "SEAN M MC CORMICK"          "JULIA SHEN"                
## [61] "JEZZEL FARKAS"              "ASHWIN BALAJI"             
## [63] "THOMAS JOSEPH HOSMER"       "BEN LI"
# Print each player's starting rating.
start_ranks
##  [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610
## [15] 1220 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507
## [29] 1602 1522 1494 1441 1449 1399 1438 1355  980 1423 1436 1348 1403 1332
## [43] 1283 1199 1242  377 1362 1382 1291 1056 1011  935 1393 1270 1186 1153
## [57] 1092  917  853  967  955 1530 1175 1163
# Print each player's total points.
chess_points
##  [1] 6.0 6.0 6.0 5.5 5.5 5.0 5.0 5.0 5.0 5.0 4.5 4.5 4.5 4.5 4.5 4.0 4.0
## [18] 4.0 4.0 4.0 4.0 4.0 4.0 4.0 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5
## [35] 3.5 3.5 3.5 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.5 2.5 2.5 2.5 2.5
## [52] 2.5 2.0 2.0 2.0 2.0 2.0 2.0 2.0 1.5 1.5 1.0 1.0 1.0
# Print each player's state code.
states
##  [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"
# Assemble the result with one typed column per vector. Going through cbind()
# would first build a character matrix, turning the points and both rating
# columns into text (and into quoted strings in the CSV). tibble() also errors
# if the vectors do not all describe the same number of players.
finished_chess_rankings <- tibble(
  chess_names = chess_names,
  states = states,
  chess_points = chess_points,
  start_ranks = start_ranks,
  average_opp_rank = average_opp_rank
)

write.csv(finished_chess_rankings, file = "finished_chess_rankings.csv")

Curious experimentation

## Sort by rank
# Convert the rating columns to integers *before* sorting: if they arrive as
# text, order() compares them as strings and places e.g. "1011" ahead of "377".
rankings_for_plot <- finished_chess_rankings
rankings_for_plot$start_ranks <- as.integer(rankings_for_plot$start_ranks)
rankings_for_plot$average_opp_rank <-
  as.integer(rankings_for_plot$average_opp_rank)
# Ascending by starting rating.
rankings_for_plot <- rankings_for_plot[order(rankings_for_plot$start_ranks), ]
str(rankings_for_plot)
## Classes 'tbl_df', 'tbl' and 'data.frame':    64 obs. of  5 variables:
##  $ chess_names     : chr  "TEJAS AYYAGARI" "SHIVAM JHA" "MICHAEL LU" "MARISA RICCI" ...
##  $ states          : chr  "MI" "MI" "MI" "MI" ...
##  $ chess_points    : chr  "2.5" "2.5" "2" "2" ...
##  $ start_ranks     : int  1011 1056 1092 1153 1163 1175 1186 1199 1220 1229 ...
##  $ average_opp_rank: int  1356 1296 1363 1414 1263 1350 1406 1327 1484 1357 ...
library(plyr)
library(ggplot2)
# Scatter plot of starting rating against average opponent rating, with a
# linear-model ("lm") smoothing line drawn on top.
ggplot(
  data = rankings_for_plot,
  mapping = aes(x = start_ranks, y = average_opp_rank)
) +
  geom_point() +
  geom_smooth(method = "lm")