R Markdown

This is an R Markdown document for Sean’s first DATA 607 Project where the intent is to extract data on chess matches from a cross table and produce a CSV file that captures the players name, state, total score, pre-rating, and the average of their opponents pre-ratings.

# First I want to remove any arrows as the dash in "->" is going to mess us up later.
file_str_2 <- str_replace_all(file_str, "->", "|")

# Here we're using gsub to remove any dashes surrounded by letters.
file_str_3 <- gsub("(?<=[A-Z])-(?=[A-Z])", " ", file_str_2, perl = TRUE)

# Extract any text that isn't a dash and make it a list.
extraction <- str_extract_all(file_str_3, '([^-]+)')
rows <- unlist(extraction)

# Clean up the list items for further processing.
rows_2 <- str_replace_all(rows, "/", "|")
rows_3 <- str_remove_all(rows_2, "\n")

# Shorten each row to remove unnecessary data.
rows_4 <- sub("^([^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|).*$", "\\1", rows_3)

# Break up each string by a delimiter to create a list of lists
rows_5 <- str_split(rows_4, "\\|")

#convert a list of lists by row into a data frame.
df <- as.data.frame(do.call(rbind, rows_5))

# Change the order of the columns. 
df2 <- df[,c("V1", "V2", "V11", "V3", "V13", "V4", "V5", "V6", "V7", "V8", "V9", "V10")]

# Rename and reorder all the columns.
colnames(df2) <- c("Pair", "Player Name", "State", "Point Total", "Pre-rating", "Round 1", "Round 2", "Round 3", "Round 4", "Round 5", "Round 6", "Round 7")

# Removing the first row as it doesn't contain player data and removing any characters that are not numbers in the Pre-rating and Round [#] columns.
df3 <- df2[-1,]
df3$`Pre-rating` <- as.numeric(str_extract(df3$`Pre-rating`, "\\d+"))
df3$`Round 1` <- as.integer(str_remove_all(df3$`Round 1`, "[^0-9]"))
df3$`Round 2` <- as.integer(str_remove_all(df3$`Round 2`, "[^0-9]"))
df3$`Round 3` <- as.integer(str_remove_all(df3$`Round 3`, "[^0-9]"))
df3$`Round 4` <- as.integer(str_remove_all(df3$`Round 4`, "[^0-9]"))
df3$`Round 5` <- as.integer(str_remove_all(df3$`Round 5`, "[^0-9]"))
df3$`Round 6` <- as.integer(str_remove_all(df3$`Round 6`, "[^0-9]"))
df3$`Round 7` <- as.integer(str_remove_all(df3$`Round 7`, "[^0-9]"))

# Replacing the index number for each opponent with their corresponding score for mean calculations.
df3$`Point Total`<- as.numeric(df3$`Point Total`)
df3$`Round 1` <- as.numeric(df3$`Pre-rating`[df3$`Round 1`])
df3$`Round 2` <- as.numeric(df3$`Pre-rating`[df3$`Round 2`])
df3$`Round 3` <- as.numeric(df3$`Pre-rating`[df3$`Round 3`])
df3$`Round 4` <- as.numeric(df3$`Pre-rating`[df3$`Round 4`])
df3$`Round 5` <- as.numeric(df3$`Pre-rating`[df3$`Round 5`])
df3$`Round 6` <- as.numeric(df3$`Pre-rating`[df3$`Round 6`])
df3$`Round 7` <- as.numeric(df3$`Pre-rating`[df3$`Round 7`])

# Finally we calculate the mean the average pre-rating of the opponents they faced.
for (i in 1:nrow(df3)){
  x <- c(df3$`Round 1`[i], df3$`Round 2`[i], df3$`Round 3`[i], df3$`Round 4`[i], df3$`Round 5`[i], df3$`Round 6`[i], df3$`Round 7`[i])
  df3$`Average Opponent Rating`[i] <- mean(na.exclude(x))
}

# Final steps are formatting and removing columns, then putting the df into a .csv format.
df3$`Average Opponent Rating` <- as.integer(df3$`Average Opponent Rating`)

df4 <- df3[,c(-1, -6, -7, -8, -9, -10, -11, -12)]
df4$`Player Name` <- gsub("^\\s+|\\s+$", "", df4$`Player Name`)
df4$State <- trimws(df4$State)

df5 <- data.frame(df4, row.names = NULL)

print(df5)
##                   Player.Name State Point.Total Pre.rating
## 1                    GARY HUA    ON         6.0       1794
## 2             DAKSHESH DARURI    MI         6.0       1553
## 3                ADITYA BAJAJ    MI         6.0       1384
## 4         PATRICK H SCHILLING    MI         5.5       1716
## 5                  HANSHI ZUO    MI         5.5       1655
## 6                 HANSEN SONG    OH         5.0       1686
## 7           GARY DEE SWATHELL    MI         5.0       1649
## 8            EZEKIEL HOUGHTON    MI         5.0       1641
## 9                 STEFANO LEE    ON         5.0       1411
## 10                  ANVIT RAO    MI         5.0       1365
## 11   CAMERON WILLIAM MC LEMAN    MI         4.5       1712
## 12             KENNETH J TACK    MI         4.5       1663
## 13          TORRANCE HENRY JR    MI         4.5       1666
## 14               BRADLEY SHAW    MI         4.5       1610
## 15     ZACHARY JAMES HOUGHTON    MI         4.5       1220
## 16               MIKE NIKITIN    MI         4.0       1604
## 17         RONALD GRZEGORCZYK    MI         4.0       1629
## 18              DAVID SUNDEEN    MI         4.0       1600
## 19               DIPANKAR ROY    MI         4.0       1564
## 20                JASON ZHENG    MI         4.0       1595
## 21              DINH DANG BUI    ON         4.0       1563
## 22           EUGENE L MCCLURE    MI         4.0       1555
## 23                   ALAN BUI    ON         4.0       1363
## 24          MICHAEL R ALDRICH    MI         4.0       1229
## 25           LOREN SCHWIEBERT    MI         3.5       1745
## 26                    MAX ZHU    ON         3.5       1579
## 27             GAURAV GIDWANI    MI         3.5       1552
## 28 SOFIA ADINA STANESCU BELLU    MI         3.5       1507
## 29           CHIEDOZIE OKORIE    MI         3.5       1602
## 30         GEORGE AVERY JONES    ON         3.5       1522
## 31               RISHI SHETTY    MI         3.5       1494
## 32      JOSHUA PHILIP MATHEWS    ON         3.5       1441
## 33                    JADE GE    MI         3.5       1449
## 34     MICHAEL JEFFERY THOMAS    MI         3.5       1399
## 35           JOSHUA DAVID LEE    MI         3.5       1438
## 36              SIDDHARTH JHA    MI         3.5       1355
## 37       AMIYATOSH PWNANANDAM    MI         3.5        980
## 38                  BRIAN LIU    MI         3.0       1423
## 39              JOEL R HENDON    MI         3.0       1436
## 40               FOREST ZHANG    MI         3.0       1348
## 41        KYLE WILLIAM MURPHY    MI         3.0       1403
## 42                   JARED GE    MI         3.0       1332
## 43          ROBERT GLEN VASEY    MI         3.0       1283
## 44         JUSTIN D SCHILLING    MI         3.0       1199
## 45                  DEREK YAN    MI         3.0       1242
## 46   JACOB ALEXANDER LAVALLEY    MI         3.0        377
## 47                ERIC WRIGHT    MI         2.5       1362
## 48               DANIEL KHAIN    MI         2.5       1382
## 49           MICHAEL J MARTIN    MI         2.5       1291
## 50                 SHIVAM JHA    MI         2.5       1056
## 51             TEJAS AYYAGARI    MI         2.5       1011
## 52                  ETHAN GUO    MI         2.5        935
## 53              JOSE C YBARRA    MI         2.0       1393
## 54                LARRY HODGE    MI         2.0       1270
## 55                  ALEX KONG    MI         2.0       1186
## 56               MARISA RICCI    MI         2.0       1153
## 57                 MICHAEL LU    MI         2.0       1092
## 58               VIRAJ MOHILE    MI         2.0        917
## 59          SEAN M MC CORMICK    MI         2.0        853
## 60                 JULIA SHEN    MI         1.5        967
## 61              JEZZEL FARKAS    ON         1.5        955
## 62              ASHWIN BALAJI    MI         1.0       1530
## 63       THOMAS JOSEPH HOSMER    MI         1.0       1175
## 64                     BEN LI    MI         1.0       1163
##    Average.Opponent.Rating
## 1                     1605
## 2                     1469
## 3                     1563
## 4                     1573
## 5                     1500
## 6                     1518
## 7                     1372
## 8                     1468
## 9                     1523
## 10                    1554
## 11                    1467
## 12                    1506
## 13                    1497
## 14                    1515
## 15                    1483
## 16                    1385
## 17                    1498
## 18                    1480
## 19                    1426
## 20                    1410
## 21                    1470
## 22                    1300
## 23                    1213
## 24                    1357
## 25                    1363
## 26                    1506
## 27                    1221
## 28                    1522
## 29                    1313
## 30                    1144
## 31                    1259
## 32                    1378
## 33                    1276
## 34                    1375
## 35                    1149
## 36                    1388
## 37                    1384
## 38                    1539
## 39                    1429
## 40                    1390
## 41                    1248
## 42                    1149
## 43                    1106
## 44                    1327
## 45                    1152
## 46                    1357
## 47                    1392
## 48                    1355
## 49                    1285
## 50                    1296
## 51                    1356
## 52                    1494
## 53                    1345
## 54                    1206
## 55                    1406
## 56                    1414
## 57                    1363
## 58                    1391
## 59                    1319
## 60                    1330
## 61                    1327
## 62                    1186
## 63                    1350
## 64                    1263
write.csv(df5, file = "chess_data.csv", row.names = FALSE)