Overview

In this project, we are given a text file with chess tournament results where the information has some structure. Our job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:

Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605

1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.

# Libraries to be used
library(tidyverse)
library(data.table)

Read Chess Tournament Source Data

tourney_file_name <- "C:\\tmp\\tournamentinfo.txt"

# Read the data from the tournament file and store it in a data.table (skip the first 4 rows containing unnecessary header data)
mdt <- data.table(read_csv(tourney_file_name, col_types =  list(col_character()), col_names = c("data"),  skip = 4))

Let’s peak into the data read from the file.

# Take a peak at the data from the table
head(mdt,10)
##                                                                                          data
##  1:     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
##  2:    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
##  3: -----------------------------------------------------------------------------------------
##  4:     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
##  5:    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
##  6: -----------------------------------------------------------------------------------------
##  7:     3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|
##  8:    MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |
##  9: -----------------------------------------------------------------------------------------
## 10:     4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|

Parsing out the data

Extract the lines of text corresponding to the players and their ratings.

# From the table, extract the text rows that correspond to the players (skip the dashed lines)
players <- mdt[seq(1, nrow(mdt), 3)]

# From the table, extract the text rows that correspond to the players' ratings (skip the dashed lines)
player_ratings <- mdt[seq(2, nrow(mdt), 3)]

Show the results of the extracted lines of text.

# take a peak at the players data lines
head(players, 10)
##                                                                                       data
##  1:  1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
##  2:  2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
##  3:  3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|
##  4:  4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|
##  5:  5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|
##  6:  6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|
##  7:  7 | GARY DEE SWATHELL               |5.0  |W  57|W  46|W  13|W  11|L   1|W   9|L   2|
##  8:  8 | EZEKIEL HOUGHTON                |5.0  |W   3|W  32|L  14|L   9|W  47|W  28|W  19|
##  9:  9 | STEFANO LEE                     |5.0  |W  25|L  18|W  59|W   8|W  26|L   7|W  20|
## 10: 10 | ANVIT RAO                       |5.0  |D  16|L  19|W  55|W  31|D   6|W  25|W  18|
# take a peak at the players' ratings data lines
head(player_ratings, 10)
##                                                                                       data
##  1: ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
##  2: MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
##  3: MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |
##  4: MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |
##  5: MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |
##  6: OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |
##  7: MI | 11146376 / R: 1649   ->1673     |N:3  |W    |B    |W    |B    |B    |W    |W    |
##  8: MI | 15142253 / R: 1641P17->1657P24  |N:3  |B    |W    |B    |W    |B    |W    |W    |
##  9: ON | 14954524 / R: 1411   ->1564     |N:2  |W    |B    |W    |B    |W    |B    |B    |
## 10: MI | 14150362 / R: 1365   ->1544     |N:3  |W    |W    |B    |B    |W    |B    |W    |

Using regular expressions, parse out the values from each line of text.

# Extract the values from the players table
Pair_Num <- as.integer(str_extract(players$data, "\\d+"))                      # 1st sequence of numeric characters
Player_Name <- str_trim(str_extract(players$data, "(\\w+\\s){2,3}"))           # 1st instance of 2 or 3 words followed by white space
Total_Points <- as.numeric(str_extract(players$data, "\\d+\\.\\d+"))           # 2nd sequence of numeric characters
Opponents <- str_extract_all(str_extract_all(players$data, "\\d+\\|"), "\\d+") # All instances of numeric characters followed by a pipe |

# Extract the values from the player ratings table
Player_State <- str_extract(player_ratings$data, "\\w+")              # 1st word on the line
Pre_Rating <- as.integer(str_extract(str_extract(player_ratings$data, "[^\\d]\\d{3,4}[^\\d]"), "\\d+")) # 1st instance of a number of at least 3 but no more than 4 digits 
Post_Rating <- as.integer(str_extract(str_extract(player_ratings$data, "->\\s*\\d{3,4}"), "\\d+")) # 1st instance of a number of at least 3 but no more than 4 digits, which come after "-> ".

Calculate the Arpad Elo’s mean rating

Calculate the Arpad Elo’s mean rating for each player’s opponents.

For this, we traverse the vector of Pre-Ratings of each player and get the pre-ratings of their opponents. Because the opponents’ IDs (Pair number) are stored in a vector inside a vector, we need to flatten it before we can use the values from the inner vector. Using those IDs, we look up the pre-rating corresponding to each of those IDs (Pair number) and then calculate the mean of all of them. We round the mean to the nearest point (no decimal digits).

# Calculate the Arpad Elo's mean rating for each player's opponents
total_Players = nrow(players)

Avg_Opponent_Pre_Rating <- total_Players

for (i in 1:total_Players) { 
  # for each player
  Avg_Opponent_Pre_Rating[i] <- round(
    mean(
      Pre_Rating[as.numeric(
        flatten(
          Opponents[Pair_Num[i]]
        )
        )]
    ), digits = 0
  ) 
}

Build a table with all the final results

# build a table to be used as the source for the CSV file
DT = data.table(
  Player_Name,
  Player_State,
  Total_Points,
  Pre_Rating,
  Avg_Opponent_Pre_Rating
)
Final results table
Player_Name Player_State Total_Points Pre_Rating Avg_Opponent_Pre_Rating
GARY HUA ON 6.0 1794 1605
DAKSHESH DARURI MI 6.0 1553 1469
ADITYA BAJAJ MI 6.0 1384 1564
PATRICK H SCHILLING MI 5.5 1716 1574
HANSHI ZUO MI 5.5 1655 1501
HANSEN SONG OH 5.0 1686 1519
GARY DEE SWATHELL MI 5.0 1649 1372
EZEKIEL HOUGHTON MI 5.0 1641 1468
STEFANO LEE ON 5.0 1411 1523
ANVIT RAO MI 5.0 1365 1554

Export the results to a CSV file

# Write the data to CSV file
write_csv(
  x = DT,
  file = "c:\\tmp\\chess_tournament_player_output.csv",
  quote_escape = "double"
)

Player scored the most points relative expected result

# Which player scored the most points relative to his or her expected result?
Point_Diff <- Post_Rating - Pre_Rating

elo_rating = data.table (
  Player_Name,
  Pre_Rating,
  Post_Rating,
  Point_Diff,
  Avg_Opponent_Pre_Rating
)


elo_rating <- elo_rating[order(-Point_Diff)]
ELO ratings
Player_Name Pre_Rating Post_Rating Point_Diff Avg_Opponent_Pre_Rating
JACOB ALEXANDER LAVALLEY 377 1076 699 1358
ADITYA BAJAJ 1384 1640 256 1564
ZACHARY JAMES HOUGHTON 1220 1416 196 1484
ANVIT RAO 1365 1544 179 1554
ETHAN GUO 935 1092 157 1495
STEFANO LEE 1411 1564 153 1523
DAKSHESH DARURI 1553 1663 110 1469
AMIYATOSH PWNANANDAM 980 1077 97 1385
TEJAS AYYAGARI 1011 1097 86 1356
MICHAEL R ALDRICH 1229 1300 71 1357
SHIVAM JHA 1056 1111 55 1296
HANSHI ZUO 1655 1690 35 1501
PATRICK H SCHILLING 1716 1744 28 1574
SEAN M MC 853 878 25 1319
GARY DEE SWATHELL 1649 1673 24 1372
VIRAJ MOHILE 917 941 24 1391
JEZZEL FARKAS 955 979 24 1327
GARY HUA 1794 1817 23 1605
JULIA SHEN 967 984 17 1330
EZEKIEL HOUGHTON 1641 1657 16 1468
BRIAN LIU 1423 1439 16 1539
SIDDHARTH JHA 1355 1367 12 1388
MIKE NIKITIN 1604 1613 9 1386
BRADLEY SHAW 1610 1618 8 1515
ALAN BUI 1363 1371 8 1214
KENNETH J TACK 1663 1670 7 1506
DIPANKAR ROY 1564 1570 6 1426
SOFIA ADINA 1507 1513 6 1522
ASHWIN BALAJI 1530 1535 5 1186
HANSEN SONG 1686 1687 1 1519
MICHAEL JEFFERY THOMAS 1399 1400 1 1375
DAVID SUNDEEN 1600 1600 0 1480
JUSTIN D SCHILLING 1199 1199 0 1327
DINH DANG BUI 1563 1562 -1 1470
FOREST ZHANG 1348 1346 -2 1391
TORRANCE HENRY JR 1666 1662 -4 1498
JOSHUA PHILIP MATHEWS 1441 1433 -8 1379
GAURAV GIDWANI 1552 1539 -13 1222
MARISA RICCI 1153 1140 -13 1414
MICHAEL LU 1092 1079 -13 1363
MAX ZHU 1579 1564 -15 1507
CAMERON WILLIAM MC 1712 1696 -16 1468
RONALD GRZEGORCZYK 1629 1610 -19 1499
ERIC WRIGHT 1362 1341 -21 1392
JOEL R HENDON 1436 1413 -23 1430
ALEX KONG 1186 1163 -23 1406
JASON ZHENG 1595 1569 -26 1411
EUGENE L MCCLURE 1555 1529 -26 1300
JADE GE 1449 1421 -28 1277
MICHAEL J MARTIN 1291 1259 -32 1286
JOSE C YBARRA 1393 1359 -34 1345
ROBERT GLEN VASEY 1283 1244 -39 1107
JOSHUA DAVID LEE 1438 1392 -46 1150
DANIEL KHAIN 1382 1335 -47 1356
RISHI SHETTY 1494 1444 -50 1260
THOMAS JOSEPH HOSMER 1175 1125 -50 1350
DEREK YAN 1242 1191 -51 1152
BEN LI 1163 1112 -51 1263
KYLE WILLIAM MURPHY 1403 1341 -62 1248
LOREN SCHWIEBERT 1745 1681 -64 1363
LARRY HODGE 1270 1200 -70 1206
JARED GE 1332 1256 -76 1150
GEORGE AVERY JONES 1522 1444 -78 1144
CHIEDOZIE OKORIE 1602 1508 -94 1314

PLayer with the highest difference between his/her Pre-rating and Post-rating

ELO ratings
Player_Name Pre_Rating Post_Rating Point_Diff Avg_Opponent_Pre_Rating
JACOB ALEXANDER LAVALLEY 377 1076 699 1358