Packages used

library(tidyverse)
library(stringr)
library(readr)

Import the raw data from GitHub and parse it. This code discards most of the data I don't need, keeping only the total points, rounds 1-7, the state, the name, and the pre-tournament rating. The pre- and post-tournament ratings are still joined together; I still need to figure out how to separate them.

# Download the tournament report and read it with read.csv(). The first three
# lines of the file are skipped and no header row is assumed, so the columns
# get R's default names (V1, ...).
# NOTE(review): the report appears to contain no commas, so every text line
# should land in a single column -- confirm against the raw file.
my_data_raw <- read.csv(
  "https://raw.githubusercontent.com/jonburns2454/DATA607/main/project%201%20data.txt",
  header = FALSE, # spelled out: T/F are ordinary variables and can be reassigned
  skip = 3
)

# Split every line at "-" and collect the pieces in a character matrix
# (one row per input line, padded with "" where a line has fewer pieces).
chess_data <- str_split(my_data_raw[, ], "-", simplify = TRUE)

# print(chess_data[1:20])

Pulling all of the data we need

# Names
# Scan every cell of chess_data for a run of non-digits followed by at least
# two space-separated words. unlist() flattens the per-cell matches into one
# character vector; cells without a match contribute nothing, so no blank-row
# filtering is needed afterwards.
name_matches <- str_extract_all(
  as.vector(chess_data),
  "\\D+\\w+[[:space:]]\\w+([[:space:]]\\w+)*"
)
Names <- unlist(name_matches)

# Drop everything except letters, digits and spaces (this removes the leading
# "|"), then trim the padding left around each name.
Names <- str_trim(gsub("[^[:alnum:] ]", "", Names))

print(Names)
##  [1] "GARY HUA"                 "DAKSHESH DARURI"
##  [3] "ADITYA BAJAJ"             "PATRICK H SCHILLING"
##  [5] "HANSHI ZUO"               "HANSEN SONG"
##  [7] "GARY DEE SWATHELL"        "EZEKIEL HOUGHTON"
##  [9] "STEFANO LEE"              "ANVIT RAO"
## [11] "CAMERON WILLIAM MC LEMAN" "KENNETH J TACK"
## [13] "TORRANCE HENRY JR"        "BRADLEY SHAW"
## [15] "ZACHARY JAMES HOUGHTON"   "MIKE NIKITIN"
## [17] "RONALD GRZEGORCZYK"       "DAVID SUNDEEN"
## [19] "DIPANKAR ROY"             "JASON ZHENG"
## [21] "DINH DANG BUI"            "EUGENE L MCCLURE"
## [23] "ALAN BUI"                 "MICHAEL R ALDRICH"
## [25] "LOREN SCHWIEBERT"         "MAX ZHU"
## [27] "GAURAV GIDWANI"           "SOFIA ADINA STANESCU"
## [29] "CHIEDOZIE OKORIE"         "GEORGE AVERY JONES"
## [31] "RISHI SHETTY"             "JOSHUA PHILIP MATHEWS"
## [33] "JADE GE"                  "MICHAEL JEFFERY THOMAS"
## [35] "JOSHUA DAVID LEE"         "SIDDHARTH JHA"
## [37] "AMIYATOSH PWNANANDAM"     "BRIAN LIU"
## [39] "JOEL R HENDON"            "FOREST ZHANG"
## [41] "KYLE WILLIAM MURPHY"      "JARED GE"
## [43] "ROBERT GLEN VASEY"        "JUSTIN D SCHILLING"
## [45] "DEREK YAN"                "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT"              "DANIEL KHAIN"
## [49] "MICHAEL J MARTIN"         "SHIVAM JHA"
## [51] "TEJAS AYYAGARI"           "ETHAN GUO"
## [53] "JOSE C YBARRA"            "LARRY HODGE"
## [55] "ALEX KONG"                "MARISA RICCI"
## [57] "MICHAEL LU"               "VIRAJ MOHILE"
## [59] "SEAN M MC CORMICK"        "JULIA SHEN"
## [61] "JEZZEL FARKAS"            "ASHWIN BALAJI"
## [63] "THOMAS JOSEPH HOSMER"     "BEN LI"
# State data
# A state/province code is matched as two upper-case letters immediately
# followed by one whitespace character and "|" (the lookahead keeps the
# delimiter out of the result).
# The text column is passed as a character vector; handing stringr the whole
# data frame forced a coercion and raised
# "argument is not an atomic vector; coercing".
States <- unlist(
  str_extract_all(my_data_raw[[1]], "[[:upper:]]{2}(?=\\s\\|)")
)
# Total points
# Scores are matched as digit-dot-digit (e.g. "6.0") and converted to numeric
# so they can be sorted and used in arithmetic instead of being kept as text.
Total_Points <- as.numeric(
  unlist(str_extract_all(my_data_raw[[1]], "\\d\\.\\d"))
)

# Pre-rating
# Take the first run of digits that follows the literal "R:" on each line.
# The lookbehind anchors on "R:" itself; the earlier pattern "[R:]" was a
# character class matching either "R" or ":".
# The text column is passed as a character vector to avoid the
# "argument is not an atomic vector; coercing" warning.
rating_text <- unlist(
  str_extract_all(my_data_raw[[1]], "(?<=R:)\\s*\\d+")
)

# Strip the padding and store the ratings as integers so they can be averaged.
Pre_Ratings <- as.integer(str_trim(rating_text))

print(Pre_Ratings)
##  [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610 1220
## [16] 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507 1602 1522
## [31] 1494 1441 1449 1399 1438 1355  980 1423 1436 1348 1403 1332 1283 1199 1242
## [46]  377 1362 1382 1291 1056 1011  935 1393 1270 1186 1153 1092  917  853  967
## [61]  955 1530 1175 1163
# Data frame creation
# Combine the extracted vectors into one row per player. data.frame() stops
# with an error if the vectors differ in length, which guards against a field
# having been extracted for only some players.
# Pair_Num is the row position, derived from the data rather than hard-coded
# as 1:64, and is placed in front of Names.
# (The average opponent pre-rating is not computed here.)
Tournament_df <- data.frame(Names, States, Total_Points, Pre_Ratings) %>%
  mutate(Pair_Num = row_number(), .before = Names)

Opponents: I started this section, but I could not figure out how to compute the final opponent rating.

# Opponents <- unlist(str_extract_all(my_data_raw[,], "\\d{2}+[|]|\\d{1}+[|]", simplify = T))
# 
# Opponents_Matrix <- matrix(ncol = 7)
# 
# Opponents <- unlist(str_extract_all(Opponents[,], "\\d+", simplify=TRUE))
# Opponents_Filled_Matrix <- Opponents[rowSums(Opponents=="")!=ncol(Opponents)]
# 
# print(Opponents)
# Show the finished table as a data.table (auto-printed at top level).
Tournament_df %>% data.table::data.table()
##     Pair_Num                      Names States Total_Points Pre_Ratings
##  1:        1                   GARY HUA     ON          6.0        1794
##  2:        2            DAKSHESH DARURI     MI          6.0        1553
##  3:        3               ADITYA BAJAJ     MI          6.0        1384
##  4:        4        PATRICK H SCHILLING     MI          5.5        1716
##  5:        5                 HANSHI ZUO     MI          5.5        1655
##  6:        6                HANSEN SONG     OH          5.0        1686
##  7:        7          GARY DEE SWATHELL     MI          5.0        1649
##  8:        8           EZEKIEL HOUGHTON     MI          5.0        1641
##  9:        9                STEFANO LEE     ON          5.0        1411
## 10:       10                  ANVIT RAO     MI          5.0        1365
## 11:       11   CAMERON WILLIAM MC LEMAN     MI          4.5        1712
## 12:       12             KENNETH J TACK     MI          4.5        1663
## 13:       13          TORRANCE HENRY JR     MI          4.5        1666
## 14:       14               BRADLEY SHAW     MI          4.5        1610
## 15:       15     ZACHARY JAMES HOUGHTON     MI          4.5        1220
## 16:       16               MIKE NIKITIN     MI          4.0        1604
## 17:       17         RONALD GRZEGORCZYK     MI          4.0        1629
## 18:       18              DAVID SUNDEEN     MI          4.0        1600
## 19:       19               DIPANKAR ROY     MI          4.0        1564
## 20:       20                JASON ZHENG     MI          4.0        1595
## 21:       21              DINH DANG BUI     ON          4.0        1563
## 22:       22           EUGENE L MCCLURE     MI          4.0        1555
## 23:       23                   ALAN BUI     ON          4.0        1363
## 24:       24          MICHAEL R ALDRICH     MI          4.0        1229
## 25:       25           LOREN SCHWIEBERT     MI          3.5        1745
## 26:       26                    MAX ZHU     ON          3.5        1579
## 27:       27             GAURAV GIDWANI     MI          3.5        1552
## 28:       28       SOFIA ADINA STANESCU     MI          3.5        1507
## 29:       29           CHIEDOZIE OKORIE     MI          3.5        1602
## 30:       30         GEORGE AVERY JONES     ON          3.5        1522
## 31:       31               RISHI SHETTY     MI          3.5        1494
## 32:       32      JOSHUA PHILIP MATHEWS     ON          3.5        1441
## 33:       33                    JADE GE     MI          3.5        1449
## 34:       34     MICHAEL JEFFERY THOMAS     MI          3.5        1399
## 35:       35           JOSHUA DAVID LEE     MI          3.5        1438
## 36:       36              SIDDHARTH JHA     MI          3.5        1355
## 37:       37       AMIYATOSH PWNANANDAM     MI          3.5         980
## 38:       38                  BRIAN LIU     MI          3.0        1423
## 39:       39              JOEL R HENDON     MI          3.0        1436
## 40:       40               FOREST ZHANG     MI          3.0        1348
## 41:       41        KYLE WILLIAM MURPHY     MI          3.0        1403
## 42:       42                   JARED GE     MI          3.0        1332
## 43:       43          ROBERT GLEN VASEY     MI          3.0        1283
## 44:       44         JUSTIN D SCHILLING     MI          3.0        1199
## 45:       45                  DEREK YAN     MI          3.0        1242
## 46:       46   JACOB ALEXANDER LAVALLEY     MI          3.0         377
## 47:       47                ERIC WRIGHT     MI          2.5        1362
## 48:       48               DANIEL KHAIN     MI          2.5        1382
## 49:       49           MICHAEL J MARTIN     MI          2.5        1291
## 50:       50                 SHIVAM JHA     MI          2.5        1056
## 51:       51             TEJAS AYYAGARI     MI          2.5        1011
## 52:       52                  ETHAN GUO     MI          2.5         935
## 53:       53              JOSE C YBARRA     MI          2.0        1393
## 54:       54                LARRY HODGE     MI          2.0        1270
## 55:       55                  ALEX KONG     MI          2.0        1186
## 56:       56               MARISA RICCI     MI          2.0        1153
## 57:       57                 MICHAEL LU     MI          2.0        1092
## 58:       58               VIRAJ MOHILE     MI          2.0         917
## 59:       59          SEAN M MC CORMICK     MI          2.0         853
## 60:       60                 JULIA SHEN     MI          1.5         967
## 61:       61              JEZZEL FARKAS     ON          1.5         955
## 62:       62              ASHWIN BALAJI     MI          1.0        1530
## 63:       63       THOMAS JOSEPH HOSMER     MI          1.0        1175
## 64:       64                     BEN LI     MI          1.0        1163
##     Pair_Num                      Names States Total_Points Pre_Ratings