Input: A text file with chess tournament results where the information has some structure.
Output: A .CSV file with the following information for all of the players:
# Load packages
library(stringr)
library(data.table)
# Read in data file skipping header block
tournament <- read.csv("https://raw.githubusercontent.com/ilyakats/CUNY-DATA607/master/tournamentinfo.txt",
sep = ",", skip = 3)
# Sample raw input data
head(tournament)
## X.........................................................................................
## 1 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 2 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## 3 -----------------------------------------------------------------------------------------
## 4 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## 5 MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## 6 -----------------------------------------------------------------------------------------
# Remove all lines not containing pipes
tournament <- tournament[str_detect(tournament[,1], "\\|"), ]
# Combine first and second lines of each entry into a single line
tournament <- str_c(tournament[seq(1, length(tournament), 2)], tournament[seq(2, length(tournament), 2)], sep = "")
# Split into columns by pipes and at the same time split ratings
# Trim any whitespaces
tournament <- data.frame(str_split_fixed(tournament, "\\||/|->", n = Inf))
tournament <- data.frame(lapply(tournament, trimws))
# Drop last column
tournament <- tournament[, 1:ncol(tournament)-1]
# Change column names to meaningful values
colnames(tournament) <- c("playerId", "name", "points",
"game1", "game2", "game3", "game4", "game5", "game6", "game7",
"state", "uscfId" , "ratingPre", "ratingPost", "nValue",
"color1", "color2", "color3", "color4", "color5", "color6", "color7")
# Normalize rating pre match - Remove 'R:' and provisional value
tournament$ratingPre <- as.numeric(str_replace(tournament$ratingPre, "R: *([:digit:]+)(P.*)?", "\\1"))
# Normalize rating post match - Remove provisional value
tournament$ratingPost <- as.numeric(str_replace(tournament$ratingPost, "([:digit:]+)(P.*)?", "\\1"))
# Convert ID to numeric
tournament$playerId <- as.numeric(as.character(tournament$playerId))
Note: The table below lists only results of the first game.
| ID | Name | Points | Game 1 | State | USCF ID | Rating Pre | Rating Post | Color 1 |
|---|---|---|---|---|---|---|---|---|
| 1 | GARY HUA | 6.0 | W 39 | ON | 15445895 | 1794 | 1817 | W |
| 2 | DAKSHESH DARURI | 6.0 | W 63 | MI | 14598900 | 1553 | 1663 | B |
| 3 | ADITYA BAJAJ | 6.0 | L 8 | MI | 14959604 | 1384 | 1640 | W |
| 4 | PATRICK H SCHILLING | 5.5 | W 23 | MI | 12616049 | 1716 | 1744 | W |
| 5 | HANSHI ZUO | 5.5 | W 45 | MI | 14601533 | 1655 | 1690 | B |
| 6 | HANSEN SONG | 5.0 | W 34 | OH | 15055204 | 1686 | 1687 | W |
| 7 | GARY DEE SWATHELL | 5.0 | W 57 | MI | 11146376 | 1649 | 1673 | W |
| 8 | EZEKIEL HOUGHTON | 5.0 | W 3 | MI | 15142253 | 1641 | 1657 | B |
| 9 | STEFANO LEE | 5.0 | W 25 | ON | 14954524 | 1411 | 1564 | W |
| 10 | ANVIT RAO | 5.0 | D 16 | MI | 14150362 | 1365 | 1544 | W |
| 11 | CAMERON WILLIAM MC LEMAN | 4.5 | D 38 | MI | 12581589 | 1712 | 1696 | B |
| 12 | KENNETH J TACK | 4.5 | W 42 | MI | 12681257 | 1663 | 1670 | W |
| 13 | TORRANCE HENRY JR | 4.5 | W 36 | MI | 15082995 | 1666 | 1662 | B |
| 14 | BRADLEY SHAW | 4.5 | W 54 | MI | 10131499 | 1610 | 1618 | W |
| 15 | ZACHARY JAMES HOUGHTON | 4.5 | D 19 | MI | 15619130 | 1220 | 1416 | B |
| 16 | MIKE NIKITIN | 4.0 | D 10 | MI | 10295068 | 1604 | 1613 | B |
| 17 | RONALD GRZEGORCZYK | 4.0 | W 48 | MI | 10297702 | 1629 | 1610 | W |
| 18 | DAVID SUNDEEN | 4.0 | W 47 | MI | 11342094 | 1600 | 1600 | B |
| 19 | DIPANKAR ROY | 4.0 | D 15 | MI | 14862333 | 1564 | 1570 | W |
| 20 | JASON ZHENG | 4.0 | L 40 | MI | 14529060 | 1595 | 1569 | W |
| 21 | DINH DANG BUI | 4.0 | W 43 | ON | 15495066 | 1563 | 1562 | B |
| 22 | EUGENE L MCCLURE | 4.0 | W 64 | MI | 12405534 | 1555 | 1529 | W |
| 23 | ALAN BUI | 4.0 | L 4 | ON | 15030142 | 1363 | 1371 | B |
| 24 | MICHAEL R ALDRICH | 4.0 | L 28 | MI | 13469010 | 1229 | 1300 | B |
| 25 | LOREN SCHWIEBERT | 3.5 | L 9 | MI | 12486656 | 1745 | 1681 | B |
| 26 | MAX ZHU | 3.5 | W 49 | ON | 15131520 | 1579 | 1564 | B |
| 27 | GAURAV GIDWANI | 3.5 | W 51 | MI | 14476567 | 1552 | 1539 | W |
| 28 | SOFIA ADINA STANESCU-BELLU | 3.5 | W 24 | MI | 14882954 | 1507 | 1513 | W |
| 29 | CHIEDOZIE OKORIE | 3.5 | W 50 | MI | 15323285 | 1602 | 1508 | B |
| 30 | GEORGE AVERY JONES | 3.5 | L 52 | ON | 12577178 | 1522 | 1444 | W |
| 31 | RISHI SHETTY | 3.5 | L 58 | MI | 15131618 | 1494 | 1444 | B |
| 32 | JOSHUA PHILIP MATHEWS | 3.5 | W 61 | ON | 14073750 | 1441 | 1433 | W |
| 33 | JADE GE | 3.5 | W 60 | MI | 14691842 | 1449 | 1421 | B |
| 34 | MICHAEL JEFFERY THOMAS | 3.5 | L 6 | MI | 15051807 | 1399 | 1400 | B |
| 35 | JOSHUA DAVID LEE | 3.5 | L 46 | MI | 14601397 | 1438 | 1392 | W |
| 36 | SIDDHARTH JHA | 3.5 | L 13 | MI | 14773163 | 1355 | 1367 | W |
| 37 | AMIYATOSH PWNANANDAM | 3.5 | B | MI | 15489571 | 980 | 1077 | |
| 38 | BRIAN LIU | 3.0 | D 11 | MI | 15108523 | 1423 | 1439 | W |
| 39 | JOEL R HENDON | 3.0 | L 1 | MI | 12923035 | 1436 | 1413 | B |
| 40 | FOREST ZHANG | 3.0 | W 20 | MI | 14892710 | 1348 | 1346 | B |
| 41 | KYLE WILLIAM MURPHY | 3.0 | W 59 | MI | 15761443 | 1403 | 1341 | B |
| 42 | JARED GE | 3.0 | L 12 | MI | 14462326 | 1332 | 1256 | B |
| 43 | ROBERT GLEN VASEY | 3.0 | L 21 | MI | 14101068 | 1283 | 1244 | W |
| 44 | JUSTIN D SCHILLING | 3.0 | B | MI | 15323504 | 1199 | 1199 | |
| 45 | DEREK YAN | 3.0 | L 5 | MI | 15372807 | 1242 | 1191 | W |
| 46 | JACOB ALEXANDER LAVALLEY | 3.0 | W 35 | MI | 15490981 | 377 | 1076 | B |
| 47 | ERIC WRIGHT | 2.5 | L 18 | MI | 12533115 | 1362 | 1341 | W |
| 48 | DANIEL KHAIN | 2.5 | L 17 | MI | 14369165 | 1382 | 1335 | B |
| 49 | MICHAEL J MARTIN | 2.5 | L 26 | MI | 12531685 | 1291 | 1259 | W |
| 50 | SHIVAM JHA | 2.5 | L 29 | MI | 14773178 | 1056 | 1111 | W |
| 51 | TEJAS AYYAGARI | 2.5 | L 27 | MI | 15205474 | 1011 | 1097 | B |
| 52 | ETHAN GUO | 2.5 | W 30 | MI | 14918803 | 935 | 1092 | B |
| 53 | JOSE C YBARRA | 2.0 | H | MI | 12578849 | 1393 | 1359 | |
| 54 | LARRY HODGE | 2.0 | L 14 | MI | 12836773 | 1270 | 1200 | B |
| 55 | ALEX KONG | 2.0 | L 62 | MI | 15412571 | 1186 | 1163 | W |
| 56 | MARISA RICCI | 2.0 | H | MI | 14679887 | 1153 | 1140 | |
| 57 | MICHAEL LU | 2.0 | L 7 | MI | 15113330 | 1092 | 1079 | B |
| 58 | VIRAJ MOHILE | 2.0 | W 31 | MI | 14700365 | 917 | 941 | W |
| 59 | SEAN M MC CORMICK | 2.0 | L 41 | MI | 12841036 | 853 | 878 | W |
| 60 | JULIA SHEN | 1.5 | L 33 | MI | 14579262 | 967 | 984 | W |
| 61 | JEZZEL FARKAS | 1.5 | L 32 | ON | 15771592 | 955 | 979 | B |
| 62 | ASHWIN BALAJI | 1.0 | W 55 | MI | 15219542 | 1530 | 1535 | B |
| 63 | THOMAS JOSEPH HOSMER | 1.0 | L 2 | MI | 15057092 | 1175 | 1125 | W |
| 64 | BEN LI | 1.0 | L 22 | MI | 15006561 | 1163 | 1112 | B |
# Melt tournament data to work on individual games data
games <- melt(tournament,
id=c("playerId", "name"),
measure=c("game1", "game2", "game3", "game4", "game5", "game6", "game7"))
# Split results into result value and opponent id
games <- cbind(games[, 1:3],data.frame(str_split_fixed(games[, 4], " +", n = Inf)))
# Assign friendly names
colnames(games) <- c("playerId", "name", "gameNo", "result", "opponentId")
# Clean up game number column
games$gameNo <- str_replace(games$gameNo, "game", "")
| Player ID | Name | Game | Result | Opponent ID |
|---|---|---|---|---|
| 1 | GARY HUA | 1 | W | 39 |
| 2 | DAKSHESH DARURI | 1 | W | 63 |
| 3 | ADITYA BAJAJ | 1 | L | 8 |
| 4 | PATRICK H SCHILLING | 1 | W | 23 |
| 5 | HANSHI ZUO | 1 | W | 45 |
| 6 | HANSEN SONG | 1 | W | 34 |
Key to the result field:
# Merge tournament and games information
# The merge drops any games without a W/L/D decision (opponent ID is blank)
games_expand <- data.table(merge(games, tournament, by.x = "opponentId", by.y = "playerId"))
# Calculate average pre-tournament rating
avg_pre_rating <- games_expand[, mean(ratingPre), by = playerId]
# Data cleanup
colnames(avg_pre_rating) <- c("playerId", "oppAverage")
avg_pre_rating$oppAverage <- round(avg_pre_rating$oppAverage)
# Add average to the tournament data
tournament <- merge(tournament, avg_pre_rating, by = "playerId")
| ID | Name | Points | State | Pre-Tournament Rating | Opp Avg Rating |
|---|---|---|---|---|---|
| 1 | GARY HUA | 6.0 | ON | 1794 | 1605 |
| 2 | DAKSHESH DARURI | 6.0 | MI | 1553 | 1469 |
| 3 | ADITYA BAJAJ | 6.0 | MI | 1384 | 1564 |
| 4 | PATRICK H SCHILLING | 5.5 | MI | 1716 | 1574 |
| 5 | HANSHI ZUO | 5.5 | MI | 1655 | 1501 |
| 6 | HANSEN SONG | 5.0 | OH | 1686 | 1519 |
| 7 | GARY DEE SWATHELL | 5.0 | MI | 1649 | 1372 |
| 8 | EZEKIEL HOUGHTON | 5.0 | MI | 1641 | 1468 |
| 9 | STEFANO LEE | 5.0 | ON | 1411 | 1523 |
| 10 | ANVIT RAO | 5.0 | MI | 1365 | 1554 |
| 11 | CAMERON WILLIAM MC LEMAN | 4.5 | MI | 1712 | 1468 |
| 12 | KENNETH J TACK | 4.5 | MI | 1663 | 1506 |
| 13 | TORRANCE HENRY JR | 4.5 | MI | 1666 | 1498 |
| 14 | BRADLEY SHAW | 4.5 | MI | 1610 | 1515 |
| 15 | ZACHARY JAMES HOUGHTON | 4.5 | MI | 1220 | 1484 |
| 16 | MIKE NIKITIN | 4.0 | MI | 1604 | 1386 |
| 17 | RONALD GRZEGORCZYK | 4.0 | MI | 1629 | 1499 |
| 18 | DAVID SUNDEEN | 4.0 | MI | 1600 | 1480 |
| 19 | DIPANKAR ROY | 4.0 | MI | 1564 | 1426 |
| 20 | JASON ZHENG | 4.0 | MI | 1595 | 1411 |
| 21 | DINH DANG BUI | 4.0 | ON | 1563 | 1470 |
| 22 | EUGENE L MCCLURE | 4.0 | MI | 1555 | 1300 |
| 23 | ALAN BUI | 4.0 | ON | 1363 | 1214 |
| 24 | MICHAEL R ALDRICH | 4.0 | MI | 1229 | 1357 |
| 25 | LOREN SCHWIEBERT | 3.5 | MI | 1745 | 1363 |
| 26 | MAX ZHU | 3.5 | ON | 1579 | 1507 |
| 27 | GAURAV GIDWANI | 3.5 | MI | 1552 | 1222 |
| 28 | SOFIA ADINA STANESCU-BELLU | 3.5 | MI | 1507 | 1522 |
| 29 | CHIEDOZIE OKORIE | 3.5 | MI | 1602 | 1314 |
| 30 | GEORGE AVERY JONES | 3.5 | ON | 1522 | 1144 |
| 31 | RISHI SHETTY | 3.5 | MI | 1494 | 1260 |
| 32 | JOSHUA PHILIP MATHEWS | 3.5 | ON | 1441 | 1379 |
| 33 | JADE GE | 3.5 | MI | 1449 | 1277 |
| 34 | MICHAEL JEFFERY THOMAS | 3.5 | MI | 1399 | 1375 |
| 35 | JOSHUA DAVID LEE | 3.5 | MI | 1438 | 1150 |
| 36 | SIDDHARTH JHA | 3.5 | MI | 1355 | 1388 |
| 37 | AMIYATOSH PWNANANDAM | 3.5 | MI | 980 | 1385 |
| 38 | BRIAN LIU | 3.0 | MI | 1423 | 1539 |
| 39 | JOEL R HENDON | 3.0 | MI | 1436 | 1430 |
| 40 | FOREST ZHANG | 3.0 | MI | 1348 | 1391 |
| 41 | KYLE WILLIAM MURPHY | 3.0 | MI | 1403 | 1248 |
| 42 | JARED GE | 3.0 | MI | 1332 | 1150 |
| 43 | ROBERT GLEN VASEY | 3.0 | MI | 1283 | 1107 |
| 44 | JUSTIN D SCHILLING | 3.0 | MI | 1199 | 1327 |
| 45 | DEREK YAN | 3.0 | MI | 1242 | 1152 |
| 46 | JACOB ALEXANDER LAVALLEY | 3.0 | MI | 377 | 1358 |
| 47 | ERIC WRIGHT | 2.5 | MI | 1362 | 1392 |
| 48 | DANIEL KHAIN | 2.5 | MI | 1382 | 1356 |
| 49 | MICHAEL J MARTIN | 2.5 | MI | 1291 | 1286 |
| 50 | SHIVAM JHA | 2.5 | MI | 1056 | 1296 |
| 51 | TEJAS AYYAGARI | 2.5 | MI | 1011 | 1356 |
| 52 | ETHAN GUO | 2.5 | MI | 935 | 1495 |
| 53 | JOSE C YBARRA | 2.0 | MI | 1393 | 1345 |
| 54 | LARRY HODGE | 2.0 | MI | 1270 | 1206 |
| 55 | ALEX KONG | 2.0 | MI | 1186 | 1406 |
| 56 | MARISA RICCI | 2.0 | MI | 1153 | 1414 |
| 57 | MICHAEL LU | 2.0 | MI | 1092 | 1363 |
| 58 | VIRAJ MOHILE | 2.0 | MI | 917 | 1391 |
| 59 | SEAN M MC CORMICK | 2.0 | MI | 853 | 1319 |
| 60 | JULIA SHEN | 1.5 | MI | 967 | 1330 |
| 61 | JEZZEL FARKAS | 1.5 | ON | 955 | 1327 |
| 62 | ASHWIN BALAJI | 1.0 | MI | 1530 | 1186 |
| 63 | THOMAS JOSEPH HOSMER | 1.0 | MI | 1175 | 1350 |
| 64 | BEN LI | 1.0 | MI | 1163 | 1263 |
write.table(tournament[, c(2, 3, 11, 13, 23)], file="tournament_summary.csv", sep = ",",
row.names = FALSE,
col.names = c("Name", "Points", "State", "Pre-Tournament Rating", "Opp Avg Rating"))
Comments
This project relies on input data to be in a certain format which, given uncommon input format, may limit parsing code’s usefulness and make it unreliable in reproducibility. For example, the input text file must have a header block consisting of 3 lines and each player’s information must be split over 2 lines.