Project 1: Tournament Data

Input: A text file with chess tournament results where the information has some structure.

Output: A .CSV file with the following information for all of the players:

Setup and Data Import

# Load packages
library(stringr)
library(data.table)

# Read in data file skipping header block
tournament <- read.csv("https://raw.githubusercontent.com/ilyakats/CUNY-DATA607/master/tournamentinfo.txt", 
                       sep = ",", skip = 3)

# Sample raw input data
head(tournament)
##   X.........................................................................................
## 1      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 2     ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 3  -----------------------------------------------------------------------------------------
## 4      2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## 5     MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## 6  -----------------------------------------------------------------------------------------

Data Parsing: Part 1

# Remove all lines not containing pipes
tournament <- tournament[str_detect(tournament[,1], "\\|"), ]

# Combine first and second lines of each entry into a single line
tournament <- str_c(tournament[seq(1, length(tournament), 2)], tournament[seq(2, length(tournament), 2)], sep = "")

# Split into columns by pipes and at the same time split ratings
# Trim any whitespaces
tournament <- data.frame(str_split_fixed(tournament, "\\||/|->", n = Inf))
tournament <- data.frame(lapply(tournament, trimws))

# Drop last column
tournament <- tournament[, 1:ncol(tournament)-1]

# Change column names to meaningful values
colnames(tournament) <- c("playerId", "name", "points", 
                          "game1", "game2", "game3", "game4", "game5", "game6", "game7",
                          "state", "uscfId" , "ratingPre", "ratingPost", "nValue",
                          "color1", "color2", "color3", "color4", "color5", "color6", "color7")

# Normalize rating pre match - Remove 'R:' and provisional value
tournament$ratingPre <- as.numeric(str_replace(tournament$ratingPre, "R: *([:digit:]+)(P.*)?", "\\1"))

# Normalize rating post match - Remove provisional value
tournament$ratingPost <- as.numeric(str_replace(tournament$ratingPost, "([:digit:]+)(P.*)?", "\\1"))

# Convert ID to numeric
tournament$playerId <- as.numeric(as.character(tournament$playerId))

Note: The table below lists only results of the first game.

Table 1. Main data table.
ID Name Points Game 1 State USCF ID Rating Pre Rating Post Color 1
1 GARY HUA 6.0 W 39 ON 15445895 1794 1817 W
2 DAKSHESH DARURI 6.0 W 63 MI 14598900 1553 1663 B
3 ADITYA BAJAJ 6.0 L 8 MI 14959604 1384 1640 W
4 PATRICK H SCHILLING 5.5 W 23 MI 12616049 1716 1744 W
5 HANSHI ZUO 5.5 W 45 MI 14601533 1655 1690 B
6 HANSEN SONG 5.0 W 34 OH 15055204 1686 1687 W
7 GARY DEE SWATHELL 5.0 W 57 MI 11146376 1649 1673 W
8 EZEKIEL HOUGHTON 5.0 W 3 MI 15142253 1641 1657 B
9 STEFANO LEE 5.0 W 25 ON 14954524 1411 1564 W
10 ANVIT RAO 5.0 D 16 MI 14150362 1365 1544 W
11 CAMERON WILLIAM MC LEMAN 4.5 D 38 MI 12581589 1712 1696 B
12 KENNETH J TACK 4.5 W 42 MI 12681257 1663 1670 W
13 TORRANCE HENRY JR 4.5 W 36 MI 15082995 1666 1662 B
14 BRADLEY SHAW 4.5 W 54 MI 10131499 1610 1618 W
15 ZACHARY JAMES HOUGHTON 4.5 D 19 MI 15619130 1220 1416 B
16 MIKE NIKITIN 4.0 D 10 MI 10295068 1604 1613 B
17 RONALD GRZEGORCZYK 4.0 W 48 MI 10297702 1629 1610 W
18 DAVID SUNDEEN 4.0 W 47 MI 11342094 1600 1600 B
19 DIPANKAR ROY 4.0 D 15 MI 14862333 1564 1570 W
20 JASON ZHENG 4.0 L 40 MI 14529060 1595 1569 W
21 DINH DANG BUI 4.0 W 43 ON 15495066 1563 1562 B
22 EUGENE L MCCLURE 4.0 W 64 MI 12405534 1555 1529 W
23 ALAN BUI 4.0 L 4 ON 15030142 1363 1371 B
24 MICHAEL R ALDRICH 4.0 L 28 MI 13469010 1229 1300 B
25 LOREN SCHWIEBERT 3.5 L 9 MI 12486656 1745 1681 B
26 MAX ZHU 3.5 W 49 ON 15131520 1579 1564 B
27 GAURAV GIDWANI 3.5 W 51 MI 14476567 1552 1539 W
28 SOFIA ADINA STANESCU-BELLU 3.5 W 24 MI 14882954 1507 1513 W
29 CHIEDOZIE OKORIE 3.5 W 50 MI 15323285 1602 1508 B
30 GEORGE AVERY JONES 3.5 L 52 ON 12577178 1522 1444 W
31 RISHI SHETTY 3.5 L 58 MI 15131618 1494 1444 B
32 JOSHUA PHILIP MATHEWS 3.5 W 61 ON 14073750 1441 1433 W
33 JADE GE 3.5 W 60 MI 14691842 1449 1421 B
34 MICHAEL JEFFERY THOMAS 3.5 L 6 MI 15051807 1399 1400 B
35 JOSHUA DAVID LEE 3.5 L 46 MI 14601397 1438 1392 W
36 SIDDHARTH JHA 3.5 L 13 MI 14773163 1355 1367 W
37 AMIYATOSH PWNANANDAM 3.5 B MI 15489571 980 1077
38 BRIAN LIU 3.0 D 11 MI 15108523 1423 1439 W
39 JOEL R HENDON 3.0 L 1 MI 12923035 1436 1413 B
40 FOREST ZHANG 3.0 W 20 MI 14892710 1348 1346 B
41 KYLE WILLIAM MURPHY 3.0 W 59 MI 15761443 1403 1341 B
42 JARED GE 3.0 L 12 MI 14462326 1332 1256 B
43 ROBERT GLEN VASEY 3.0 L 21 MI 14101068 1283 1244 W
44 JUSTIN D SCHILLING 3.0 B MI 15323504 1199 1199
45 DEREK YAN 3.0 L 5 MI 15372807 1242 1191 W
46 JACOB ALEXANDER LAVALLEY 3.0 W 35 MI 15490981 377 1076 B
47 ERIC WRIGHT 2.5 L 18 MI 12533115 1362 1341 W
48 DANIEL KHAIN 2.5 L 17 MI 14369165 1382 1335 B
49 MICHAEL J MARTIN 2.5 L 26 MI 12531685 1291 1259 W
50 SHIVAM JHA 2.5 L 29 MI 14773178 1056 1111 W
51 TEJAS AYYAGARI 2.5 L 27 MI 15205474 1011 1097 B
52 ETHAN GUO 2.5 W 30 MI 14918803 935 1092 B
53 JOSE C YBARRA 2.0 H MI 12578849 1393 1359
54 LARRY HODGE 2.0 L 14 MI 12836773 1270 1200 B
55 ALEX KONG 2.0 L 62 MI 15412571 1186 1163 W
56 MARISA RICCI 2.0 H MI 14679887 1153 1140
57 MICHAEL LU 2.0 L 7 MI 15113330 1092 1079 B
58 VIRAJ MOHILE 2.0 W 31 MI 14700365 917 941 W
59 SEAN M MC CORMICK 2.0 L 41 MI 12841036 853 878 W
60 JULIA SHEN 1.5 L 33 MI 14579262 967 984 W
61 JEZZEL FARKAS 1.5 L 32 ON 15771592 955 979 B
62 ASHWIN BALAJI 1.0 W 55 MI 15219542 1530 1535 B
63 THOMAS JOSEPH HOSMER 1.0 L 2 MI 15057092 1175 1125 W
64 BEN LI 1.0 L 22 MI 15006561 1163 1112 B

Data Parsing: Part 2

# Melt tournament data to work on individual games data
games <- melt(tournament, 
              id=c("playerId", "name"), 
              measure=c("game1", "game2", "game3", "game4", "game5", "game6", "game7"))

# Split results into result value and opponent id
games <- cbind(games[, 1:3],data.frame(str_split_fixed(games[, 4], " +", n = Inf)))

# Assign friendly names
colnames(games) <- c("playerId", "name", "gameNo", "result", "opponentId")

# Clean up game number column
games$gameNo <- str_replace(games$gameNo, "game", "")
Table 2. Sample of individual games information.
Player ID Name Game Result Opponent ID
1 GARY HUA 1 W 39
2 DAKSHESH DARURI 1 W 63
3 ADITYA BAJAJ 1 L 8
4 PATRICK H SCHILLING 1 W 23
5 HANSHI ZUO 1 W 45
6 HANSEN SONG 1 W 34

Key to the result field:

  • W = Win
  • L = Loss
  • D = Draw
  • X = Win by forfeit (opponent failed to appear)
  • F = Loss by forfeit (player failed to appear)
  • U = Unplayed game
  • H = Half point bye
  • B = Full point bye

Generate Opponent’s Average Pre-Tournament Rating

# Merge tournament and games information
# The merge drops any games without a W/L/D decision (opponent ID is blank)
games_expand <- data.table(merge(games, tournament, by.x = "opponentId", by.y = "playerId"))

# Calculate average pre-tournament rating
avg_pre_rating <- games_expand[, mean(ratingPre), by = playerId]

# Data cleanup 
colnames(avg_pre_rating) <- c("playerId", "oppAverage")
avg_pre_rating$oppAverage <- round(avg_pre_rating$oppAverage)

# Add average to the tournament data
tournament <- merge(tournament, avg_pre_rating, by = "playerId")
Table 3. Target values.
ID Name Points State Pre-Tournament Rating Opp Avg Rating
1 GARY HUA 6.0 ON 1794 1605
2 DAKSHESH DARURI 6.0 MI 1553 1469
3 ADITYA BAJAJ 6.0 MI 1384 1564
4 PATRICK H SCHILLING 5.5 MI 1716 1574
5 HANSHI ZUO 5.5 MI 1655 1501
6 HANSEN SONG 5.0 OH 1686 1519
7 GARY DEE SWATHELL 5.0 MI 1649 1372
8 EZEKIEL HOUGHTON 5.0 MI 1641 1468
9 STEFANO LEE 5.0 ON 1411 1523
10 ANVIT RAO 5.0 MI 1365 1554
11 CAMERON WILLIAM MC LEMAN 4.5 MI 1712 1468
12 KENNETH J TACK 4.5 MI 1663 1506
13 TORRANCE HENRY JR 4.5 MI 1666 1498
14 BRADLEY SHAW 4.5 MI 1610 1515
15 ZACHARY JAMES HOUGHTON 4.5 MI 1220 1484
16 MIKE NIKITIN 4.0 MI 1604 1386
17 RONALD GRZEGORCZYK 4.0 MI 1629 1499
18 DAVID SUNDEEN 4.0 MI 1600 1480
19 DIPANKAR ROY 4.0 MI 1564 1426
20 JASON ZHENG 4.0 MI 1595 1411
21 DINH DANG BUI 4.0 ON 1563 1470
22 EUGENE L MCCLURE 4.0 MI 1555 1300
23 ALAN BUI 4.0 ON 1363 1214
24 MICHAEL R ALDRICH 4.0 MI 1229 1357
25 LOREN SCHWIEBERT 3.5 MI 1745 1363
26 MAX ZHU 3.5 ON 1579 1507
27 GAURAV GIDWANI 3.5 MI 1552 1222
28 SOFIA ADINA STANESCU-BELLU 3.5 MI 1507 1522
29 CHIEDOZIE OKORIE 3.5 MI 1602 1314
30 GEORGE AVERY JONES 3.5 ON 1522 1144
31 RISHI SHETTY 3.5 MI 1494 1260
32 JOSHUA PHILIP MATHEWS 3.5 ON 1441 1379
33 JADE GE 3.5 MI 1449 1277
34 MICHAEL JEFFERY THOMAS 3.5 MI 1399 1375
35 JOSHUA DAVID LEE 3.5 MI 1438 1150
36 SIDDHARTH JHA 3.5 MI 1355 1388
37 AMIYATOSH PWNANANDAM 3.5 MI 980 1385
38 BRIAN LIU 3.0 MI 1423 1539
39 JOEL R HENDON 3.0 MI 1436 1430
40 FOREST ZHANG 3.0 MI 1348 1391
41 KYLE WILLIAM MURPHY 3.0 MI 1403 1248
42 JARED GE 3.0 MI 1332 1150
43 ROBERT GLEN VASEY 3.0 MI 1283 1107
44 JUSTIN D SCHILLING 3.0 MI 1199 1327
45 DEREK YAN 3.0 MI 1242 1152
46 JACOB ALEXANDER LAVALLEY 3.0 MI 377 1358
47 ERIC WRIGHT 2.5 MI 1362 1392
48 DANIEL KHAIN 2.5 MI 1382 1356
49 MICHAEL J MARTIN 2.5 MI 1291 1286
50 SHIVAM JHA 2.5 MI 1056 1296
51 TEJAS AYYAGARI 2.5 MI 1011 1356
52 ETHAN GUO 2.5 MI 935 1495
53 JOSE C YBARRA 2.0 MI 1393 1345
54 LARRY HODGE 2.0 MI 1270 1206
55 ALEX KONG 2.0 MI 1186 1406
56 MARISA RICCI 2.0 MI 1153 1414
57 MICHAEL LU 2.0 MI 1092 1363
58 VIRAJ MOHILE 2.0 MI 917 1391
59 SEAN M MC CORMICK 2.0 MI 853 1319
60 JULIA SHEN 1.5 MI 967 1330
61 JEZZEL FARKAS 1.5 ON 955 1327
62 ASHWIN BALAJI 1.0 MI 1530 1186
63 THOMAS JOSEPH HOSMER 1.0 MI 1175 1350
64 BEN LI 1.0 MI 1163 1263

Data Output

write.table(tournament[, c(2, 3, 11, 13, 23)], file="tournament_summary.csv", sep = ",", 
            row.names = FALSE,
            col.names = c("Name", "Points", "State", "Pre-Tournament Rating", "Opp Avg Rating"))

Comments

This project relies on input data to be in a certain format which, given uncommon input format, may limit parsing code’s usefulness and make it unreliable in reproducibility. For example, the input text file must have a header block consisting of 3 lines and each player’s information must be split over 2 lines.