# Read the tournament text file into a character vector, one element per line.
data <- readLines("tournamentinfo.txt")
## Warning in readLines("tournamentinfo.txt"): incomplete final line found on
## 'tournamentinfo.txt'
# Separator rows in the file consist of exactly this run of dashes.
dashes <- "-----------------------------------------------------------------------------------------"
# Get rid of the separator rows. A logical filter is used instead of setdiff()
# because setdiff() also drops duplicated elements; losing a repeated line
# would silently shift the odd/even row pairing below.
data <- data[data != dashes]
# Make whitespace more manageable: delete every run of two or more spaces.
# Single spaces (e.g. between the words of a name) are kept.
data <- str_replace_all(data, "[ ]{2,}", "")
# Consecutive rows are about the same person, so split them apart:
# df1 takes the 1st, 3rd, 5th, ... rows and df2 the 2nd, 4th, 6th, ... rows
# (the logical index is recycled along the vector).
df1 <- data[c(TRUE, FALSE)]
df2 <- data[c(FALSE, TRUE)]

# Field extraction. tail(x, -1) skips the first element of df1/df2 so only the
# remaining rows are parsed.
#
# "(?=\\|)" is a lookahead for a literal "|" field delimiter. The pipe must be
# escaped: an unescaped "(?=|)" is an alternation of two empty patterns, which
# always succeeds and therefore constrains nothing.
#
# str_extract() (first match, one string per row, NA when nothing matches) is
# used instead of str_extract_all(), which returns a list that trimws() would
# have to coerce back to character.

# get names: a run of capitals, spaces and hyphens ending at a "|"
names <- trimws(str_extract(tail(df1, -1), "[A-Z- ]{2,}(?=\\|)"))
# get state: two capitals followed by optional whitespace and a "|"
states <- trimws(str_extract(tail(df2, -1), "[A-Z]{2}(?=\\s*\\|)"))
# get points: the "." is escaped so that it only matches a decimal point
points <- trimws(str_extract(tail(df1, -1), "[0-9]+\\.[0-9]+(?=\\s*\\|)"))
# get prerating: the first run of digits immediately followed by "P" or "-"
prerating <- trimws(str_extract(tail(df2, -1), "[0-9]+(?=P|-)"))

# Build the data frame directly so each column keeps its own type; cbind()
# would first force everything into a character matrix.
fdf <- data.frame(
  names = names,
  states = states,
  points = as.numeric(points),
  prerating = as.numeric(prerating),
  stringsAsFactors = FALSE
)

# get avg_opp_pre
# Need the ratings of the opponents played and how many games were played.
# Opponent ids are the digit runs that directly follow a capital letter and
# are followed by the "|" delimiter (escaped; an unescaped "(?=|)" is a no-op).
opp_ids <- str_extract_all(tail(df1, -1), "(?<=[A-Z])[0-9]+(?=\\s*\\|)")
games_played <- lengths(opp_ids)
opp_ids <- lapply(opp_ids, as.numeric)

# Sum each player's opponents' pre-ratings. Opponent ids are used as row
# indices into fdf.
# NOTE(review): this assumes row i of fdf is the player with id i - confirm
# against the input file.
opp_prerating <- fdf$prerating
sums <- vapply(
  opp_ids,
  function(ids) sum(opp_prerating[ids]),
  numeric(1)
)

# A player with no recorded opponents gets NA rather than the NaN that 0 / 0
# would produce.
avg_opp_pre <- ifelse(games_played > 0, sums / games_played, NA_real_)
fdf$avg_opp_pre <- avg_opp_pre
fdf
##                         names states points prerating avg_opp_pre
## 1                    GARY HUA     ON    6.0      1794    1605.286
## 2             DAKSHESH DARURI     MI    6.0      1553    1469.286
## 3                ADITYA BAJAJ     MI    6.0      1384    1563.571
## 4         PATRICK H SCHILLING     MI    5.5      1716    1573.571
## 5                  HANSHI ZUO     MI    5.5      1655    1500.857
## 6                 HANSEN SONG     OH    5.0      1686    1518.714
## 7           GARY DEE SWATHELL     MI    5.0      1649    1372.143
## 8            EZEKIEL HOUGHTON     MI    5.0      1641    1468.429
## 9                 STEFANO LEE     ON    5.0      1411    1523.143
## 10                  ANVIT RAO     MI    5.0      1365    1554.143
## 11   CAMERON WILLIAM MC LEMAN     MI    4.5      1712    1467.571
## 12             KENNETH J TACK     MI    4.5      1663    1506.167
## 13          TORRANCE HENRY JR     MI    4.5      1666    1497.857
## 14               BRADLEY SHAW     MI    4.5      1610    1515.000
## 15     ZACHARY JAMES HOUGHTON     MI    4.5      1220    1483.857
## 16               MIKE NIKITIN     MI    4.0      1604    1385.800
## 17         RONALD GRZEGORCZYK     MI    4.0      1629    1498.571
## 18              DAVID SUNDEEN     MI    4.0      1600    1480.000
## 19               DIPANKAR ROY     MI    4.0      1564    1426.286
## 20                JASON ZHENG     MI    4.0      1595    1410.857
## 21              DINH DANG BUI     ON    4.0      1563    1470.429
## 22           EUGENE L MCCLURE     MI    4.0      1555    1300.333
## 23                   ALAN BUI     ON    4.0      1363    1213.857
## 24          MICHAEL R ALDRICH     MI    4.0      1229    1357.000
## 25           LOREN SCHWIEBERT     MI    3.5      1745    1363.286
## 26                    MAX ZHU     ON    3.5      1579    1506.857
## 27             GAURAV GIDWANI     MI    3.5      1552    1221.667
## 28 SOFIA ADINA STANESCU-BELLU     MI    3.5      1507    1522.143
## 29           CHIEDOZIE OKORIE     MI    3.5      1602    1313.500
## 30         GEORGE AVERY JONES     ON    3.5      1522    1144.143
## 31               RISHI SHETTY     MI    3.5      1494    1259.857
## 32      JOSHUA PHILIP MATHEWS     ON    3.5      1441    1378.714
## 33                    JADE GE     MI    3.5      1449    1276.857
## 34     MICHAEL JEFFERY THOMAS     MI    3.5      1399    1375.286
## 35           JOSHUA DAVID LEE     MI    3.5      1438    1149.714
## 36              SIDDHARTH JHA     MI    3.5      1355    1388.167
## 37       AMIYATOSH PWNANANDAM     MI    3.5       980    1384.800
## 38                  BRIAN LIU     MI    3.0      1423    1539.167
## 39              JOEL R HENDON     MI    3.0      1436    1429.571
## 40               FOREST ZHANG     MI    3.0      1348    1390.571
## 41        KYLE WILLIAM MURPHY     MI    3.0      1403    1248.500
## 42                   JARED GE     MI    3.0      1332    1149.857
## 43          ROBERT GLEN VASEY     MI    3.0      1283    1106.571
## 44         JUSTIN D SCHILLING     MI    3.0      1199    1327.000
## 45                  DEREK YAN     MI    3.0      1242    1152.000
## 46   JACOB ALEXANDER LAVALLEY     MI    3.0       377    1357.714
## 47                ERIC WRIGHT     MI    2.5      1362    1392.000
## 48               DANIEL KHAIN     MI    2.5      1382    1355.800
## 49           MICHAEL J MARTIN     MI    2.5      1291    1285.800
## 50                 SHIVAM JHA     MI    2.5      1056    1296.000
## 51             TEJAS AYYAGARI     MI    2.5      1011    1356.143
## 52                  ETHAN GUO     MI    2.5       935    1494.571
## 53              JOSE C YBARRA     MI    2.0      1393    1345.333
## 54                LARRY HODGE     MI    2.0      1270    1206.167
## 55                  ALEX KONG     MI    2.0      1186    1406.000
## 56               MARISA RICCI     MI    2.0      1153    1414.400
## 57                 MICHAEL LU     MI    2.0      1092    1363.000
## 58               VIRAJ MOHILE     MI    2.0       917    1391.000
## 59          SEAN M MC CORMICK     MI    2.0       853    1319.000
## 60                 JULIA SHEN     MI    1.5       967    1330.200
## 61              JEZZEL FARKAS     ON    1.5       955    1327.286
## 62              ASHWIN BALAJI     MI    1.0      1530    1186.000
## 63       THOMAS JOSEPH HOSMER     MI    1.0      1175    1350.200
## 64                     BEN LI     MI    1.0      1163    1263.000
# Write the table without R's automatic row names. With the default
# (row.names = TRUE) write.csv() emits an extra first column with an empty
# header, which read_csv() reports as a missing column name and fills in
# as 'X1'.
write.csv(fdf, "tournament_info.csv", row.names = FALSE)
# verify
# NOTE(review): the recorded output below was adjusted by hand to reflect the
# removal of the X1 column - re-run to regenerate and confirm.
df <- read_csv("tournament_info.csv")
## Parsed with column specification:
## cols(
##   names = col_character(),
##   states = col_character(),
##   points = col_double(),
##   prerating = col_double(),
##   avg_opp_pre = col_double()
## )
df
## # A tibble: 64 x 5
##    names               states points prerating avg_opp_pre
##    <chr>               <chr>   <dbl>     <dbl>       <dbl>
##  1 GARY HUA            ON        6        1794       1605.
##  2 DAKSHESH DARURI     MI        6        1553       1469.
##  3 ADITYA BAJAJ        MI        6        1384       1564.
##  4 PATRICK H SCHILLING MI        5.5      1716       1574.
##  5 HANSHI ZUO          MI        5.5      1655       1501.
##  6 HANSEN SONG         OH        5        1686       1519.
##  7 GARY DEE SWATHELL   MI        5        1649       1372.
##  8 EZEKIEL HOUGHTON    MI        5        1641       1468.
##  9 STEFANO LEE         ON        5        1411       1523.
## 10 ANVIT RAO           MI        5        1365       1554.
## # … with 54 more rows