This is an R Markdown document for Sean’s first DATA 607 Project where the intent is to extract data on chess matches from a cross table and produce a CSV file that captures the players name, state, total score, pre-rating, and the average of their opponents pre-ratings.
# First I want to remove any arrows as the dash in "->" is going to mess us up later.
file_str_2 <- str_replace_all(file_str, "->", "|")
# Here we're using gsub to remove any dashes surrounded by letters.
file_str_3 <- gsub("(?<=[A-Z])-(?=[A-Z])", " ", file_str_2, perl = TRUE)
# Extract any text that isn't a dash and make it a list.
extraction <- str_extract_all(file_str_3, '([^-]+)')
rows <- unlist(extraction)
# Clean up the list items for further processing.
rows_2 <- str_replace_all(rows, "/", "|")
rows_3 <- str_remove_all(rows_2, "\n")
# Shorten each row to remove unnecessary data.
rows_4 <- sub("^([^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|[^|]+\\|).*$", "\\1", rows_3)
# Break up each string by a delimiter to create a list of lists
rows_5 <- str_split(rows_4, "\\|")
#convert a list of lists by row into a data frame.
df <- as.data.frame(do.call(rbind, rows_5))
# Change the order of the columns.
df2 <- df[,c("V1", "V2", "V11", "V3", "V13", "V4", "V5", "V6", "V7", "V8", "V9", "V10")]
# Rename and reorder all the columns.
colnames(df2) <- c("Pair", "Player Name", "State", "Point Total", "Pre-rating", "Round 1", "Round 2", "Round 3", "Round 4", "Round 5", "Round 6", "Round 7")
# Removing the first row as it doesn't contain player data and removing any characters that are not numbers in the Pre-rating and Round [#] columns.
df3 <- df2[-1,]
df3$`Pre-rating` <- as.numeric(str_extract(df3$`Pre-rating`, "\\d+"))
df3$`Round 1` <- as.integer(str_remove_all(df3$`Round 1`, "[^0-9]"))
df3$`Round 2` <- as.integer(str_remove_all(df3$`Round 2`, "[^0-9]"))
df3$`Round 3` <- as.integer(str_remove_all(df3$`Round 3`, "[^0-9]"))
df3$`Round 4` <- as.integer(str_remove_all(df3$`Round 4`, "[^0-9]"))
df3$`Round 5` <- as.integer(str_remove_all(df3$`Round 5`, "[^0-9]"))
df3$`Round 6` <- as.integer(str_remove_all(df3$`Round 6`, "[^0-9]"))
df3$`Round 7` <- as.integer(str_remove_all(df3$`Round 7`, "[^0-9]"))
# Replacing the index number for each opponent with their corresponding score for mean calculations.
df3$`Point Total`<- as.numeric(df3$`Point Total`)
df3$`Round 1` <- as.numeric(df3$`Pre-rating`[df3$`Round 1`])
df3$`Round 2` <- as.numeric(df3$`Pre-rating`[df3$`Round 2`])
df3$`Round 3` <- as.numeric(df3$`Pre-rating`[df3$`Round 3`])
df3$`Round 4` <- as.numeric(df3$`Pre-rating`[df3$`Round 4`])
df3$`Round 5` <- as.numeric(df3$`Pre-rating`[df3$`Round 5`])
df3$`Round 6` <- as.numeric(df3$`Pre-rating`[df3$`Round 6`])
df3$`Round 7` <- as.numeric(df3$`Pre-rating`[df3$`Round 7`])
# Finally we calculate the mean the average pre-rating of the opponents they faced.
for (i in 1:nrow(df3)){
x <- c(df3$`Round 1`[i], df3$`Round 2`[i], df3$`Round 3`[i], df3$`Round 4`[i], df3$`Round 5`[i], df3$`Round 6`[i], df3$`Round 7`[i])
df3$`Average Opponent Rating`[i] <- mean(na.exclude(x))
}
# Final steps are formatting and removing columns, then putting the df into a .csv format.
df3$`Average Opponent Rating` <- as.integer(df3$`Average Opponent Rating`)
df4 <- df3[,c(-1, -6, -7, -8, -9, -10, -11, -12)]
df4$`Player Name` <- gsub("^\\s+|\\s+$", "", df4$`Player Name`)
df4$State <- trimws(df4$State)
df5 <- data.frame(df4, row.names = NULL)
print(df5)
## Player.Name State Point.Total Pre.rating
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## 7 GARY DEE SWATHELL MI 5.0 1649
## 8 EZEKIEL HOUGHTON MI 5.0 1641
## 9 STEFANO LEE ON 5.0 1411
## 10 ANVIT RAO MI 5.0 1365
## 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712
## 12 KENNETH J TACK MI 4.5 1663
## 13 TORRANCE HENRY JR MI 4.5 1666
## 14 BRADLEY SHAW MI 4.5 1610
## 15 ZACHARY JAMES HOUGHTON MI 4.5 1220
## 16 MIKE NIKITIN MI 4.0 1604
## 17 RONALD GRZEGORCZYK MI 4.0 1629
## 18 DAVID SUNDEEN MI 4.0 1600
## 19 DIPANKAR ROY MI 4.0 1564
## 20 JASON ZHENG MI 4.0 1595
## 21 DINH DANG BUI ON 4.0 1563
## 22 EUGENE L MCCLURE MI 4.0 1555
## 23 ALAN BUI ON 4.0 1363
## 24 MICHAEL R ALDRICH MI 4.0 1229
## 25 LOREN SCHWIEBERT MI 3.5 1745
## 26 MAX ZHU ON 3.5 1579
## 27 GAURAV GIDWANI MI 3.5 1552
## 28 SOFIA ADINA STANESCU BELLU MI 3.5 1507
## 29 CHIEDOZIE OKORIE MI 3.5 1602
## 30 GEORGE AVERY JONES ON 3.5 1522
## 31 RISHI SHETTY MI 3.5 1494
## 32 JOSHUA PHILIP MATHEWS ON 3.5 1441
## 33 JADE GE MI 3.5 1449
## 34 MICHAEL JEFFERY THOMAS MI 3.5 1399
## 35 JOSHUA DAVID LEE MI 3.5 1438
## 36 SIDDHARTH JHA MI 3.5 1355
## 37 AMIYATOSH PWNANANDAM MI 3.5 980
## 38 BRIAN LIU MI 3.0 1423
## 39 JOEL R HENDON MI 3.0 1436
## 40 FOREST ZHANG MI 3.0 1348
## 41 KYLE WILLIAM MURPHY MI 3.0 1403
## 42 JARED GE MI 3.0 1332
## 43 ROBERT GLEN VASEY MI 3.0 1283
## 44 JUSTIN D SCHILLING MI 3.0 1199
## 45 DEREK YAN MI 3.0 1242
## 46 JACOB ALEXANDER LAVALLEY MI 3.0 377
## 47 ERIC WRIGHT MI 2.5 1362
## 48 DANIEL KHAIN MI 2.5 1382
## 49 MICHAEL J MARTIN MI 2.5 1291
## 50 SHIVAM JHA MI 2.5 1056
## 51 TEJAS AYYAGARI MI 2.5 1011
## 52 ETHAN GUO MI 2.5 935
## 53 JOSE C YBARRA MI 2.0 1393
## 54 LARRY HODGE MI 2.0 1270
## 55 ALEX KONG MI 2.0 1186
## 56 MARISA RICCI MI 2.0 1153
## 57 MICHAEL LU MI 2.0 1092
## 58 VIRAJ MOHILE MI 2.0 917
## 59 SEAN M MC CORMICK MI 2.0 853
## 60 JULIA SHEN MI 1.5 967
## 61 JEZZEL FARKAS ON 1.5 955
## 62 ASHWIN BALAJI MI 1.0 1530
## 63 THOMAS JOSEPH HOSMER MI 1.0 1175
## 64 BEN LI MI 1.0 1163
## Average.Opponent.Rating
## 1 1605
## 2 1469
## 3 1563
## 4 1573
## 5 1500
## 6 1518
## 7 1372
## 8 1468
## 9 1523
## 10 1554
## 11 1467
## 12 1506
## 13 1497
## 14 1515
## 15 1483
## 16 1385
## 17 1498
## 18 1480
## 19 1426
## 20 1410
## 21 1470
## 22 1300
## 23 1213
## 24 1357
## 25 1363
## 26 1506
## 27 1221
## 28 1522
## 29 1313
## 30 1144
## 31 1259
## 32 1378
## 33 1276
## 34 1375
## 35 1149
## 36 1388
## 37 1384
## 38 1539
## 39 1429
## 40 1390
## 41 1248
## 42 1149
## 43 1106
## 44 1327
## 45 1152
## 46 1357
## 47 1392
## 48 1355
## 49 1285
## 50 1296
## 51 1356
## 52 1494
## 53 1345
## 54 1206
## 55 1406
## 56 1414
## 57 1363
## 58 1391
## 59 1319
## 60 1330
## 61 1327
## 62 1186
## 63 1350
## 64 1263
write.csv(df5, file = "chess_data.csv", row.names = FALSE)