Project 01 - Data Analysis

Overview

In this project, we are given a text file with chess tournament results where the information has some structure. Our job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:

Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605

1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.

# Libraries to be used
library(tidyverse)
library(data.table)

Read Chess Tournament Source Data

tourney_file_name <- "C:\\tmp\\tournamentinfo.txt"

# Read the data from the tournament file and store it in a data.table (skip the first 4 rows containing unnecessary header data)
mdt <- data.table(read_csv(tourney_file_name, col_types =  list(col_character()), col_names = c("data"),  skip = 4))

Let’s peak into the data read from the file.

# Take a peak at the data from the table
head(mdt,10)

##                                                                                          data
##  1:     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
##  2:    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
##  3: -----------------------------------------------------------------------------------------
##  4:     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
##  5:    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
##  6: -----------------------------------------------------------------------------------------
##  7:     3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|
##  8:    MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |
##  9: -----------------------------------------------------------------------------------------
## 10:     4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|

Parsing out the data

Extract the lines of text corresponding to the players and their ratings.

# From the table, extract the text rows that correspond to the players (skip the dashed lines)
players <- mdt[seq(1, nrow(mdt), 3)]

# From the table, extract the text rows that correspond to the players' ratings (skip the dashed lines)
player_ratings <- mdt[seq(2, nrow(mdt), 3)]

Show the results of the extracted lines of text.

# take a peak at the players data lines
head(players, 10)

##                                                                                       data
##  1:  1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
##  2:  2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
##  3:  3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|
##  4:  4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|
##  5:  5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|
##  6:  6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|
##  7:  7 | GARY DEE SWATHELL               |5.0  |W  57|W  46|W  13|W  11|L   1|W   9|L   2|
##  8:  8 | EZEKIEL HOUGHTON                |5.0  |W   3|W  32|L  14|L   9|W  47|W  28|W  19|
##  9:  9 | STEFANO LEE                     |5.0  |W  25|L  18|W  59|W   8|W  26|L   7|W  20|
## 10: 10 | ANVIT RAO                       |5.0  |D  16|L  19|W  55|W  31|D   6|W  25|W  18|

# take a peak at the players' ratings data lines
head(player_ratings, 10)

##                                                                                       data
##  1: ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
##  2: MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
##  3: MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |
##  4: MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |
##  5: MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |
##  6: OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |
##  7: MI | 11146376 / R: 1649   ->1673     |N:3  |W    |B    |W    |B    |B    |W    |W    |
##  8: MI | 15142253 / R: 1641P17->1657P24  |N:3  |B    |W    |B    |W    |B    |W    |W    |
##  9: ON | 14954524 / R: 1411   ->1564     |N:2  |W    |B    |W    |B    |W    |B    |B    |
## 10: MI | 14150362 / R: 1365   ->1544     |N:3  |W    |W    |B    |B    |W    |B    |W    |

Using regular expressions, parse out the values from each line of text.

# Extract the values from the players table
Pair_Num <- as.integer(str_extract(players$data, "\\d+"))                      # 1st sequence of numeric characters
Player_Name <- str_trim(str_extract(players$data, "(\\w+\\s){2,3}"))           # 1st instance of 2 or 3 words followed by white space
Total_Points <- as.numeric(str_extract(players$data, "\\d+\\.\\d+"))           # 2nd sequence of numeric characters
Opponents <- str_extract_all(str_extract_all(players$data, "\\d+\\|"), "\\d+") # All instances of numeric characters followed by a pipe |

# Extract the values from the player ratings table
Player_State <- str_extract(player_ratings$data, "\\w+")              # 1st word on the line
Pre_Rating <- as.integer(str_extract(str_extract(player_ratings$data, "[^\\d]\\d{3,4}[^\\d]"), "\\d+")) # 1st instance of a number of at least 3 but no more than 4 digits 
Post_Rating <- as.integer(str_extract(str_extract(player_ratings$data, "->\\s*\\d{3,4}"), "\\d+")) # 1st instance of a number of at least 3 but no more than 4 digits, which come after "-> ".

Calculate the Arpad Elo’s mean rating

Calculate the Arpad Elo’s mean rating for each player’s opponents.

For this, we traverse the vector of Pre-Ratings of each player and get the pre-ratings of their opponents. Because the opponents’ IDs (Pair number) are stored in a vector inside a vector, we need to flatten it before we can use the values from the inner vector. Using those IDs, we look up the pre-rating corresponding to each of those IDs (Pair number) and then calculate the mean of all of them. We round the mean to the nearest point (no decimal digits).

# Calculate the Arpad Elo's mean rating for each player's opponents
total_Players = nrow(players)

Avg_Opponent_Pre_Rating <- total_Players

for (i in 1:total_Players) { 
  # for each player
  Avg_Opponent_Pre_Rating[i] <- round(
    mean(
      Pre_Rating[as.numeric(
        flatten(
          Opponents[Pair_Num[i]]
        )
        )]
    ), digits = 0
  ) 
}

Build a table with all the final results

# build a table to be used as the source for the CSV file
DT = data.table(
  Player_Name,
  Player_State,
  Total_Points,
  Pre_Rating,
  Avg_Opponent_Pre_Rating
)

Final results table
Player_Name	Player_State	Total_Points	Pre_Rating	Avg_Opponent_Pre_Rating
GARY HUA	ON	6.0	1794	1605
DAKSHESH DARURI	MI	6.0	1553	1469
ADITYA BAJAJ	MI	6.0	1384	1564
PATRICK H SCHILLING	MI	5.5	1716	1574
HANSHI ZUO	MI	5.5	1655	1501
HANSEN SONG	OH	5.0	1686	1519
GARY DEE SWATHELL	MI	5.0	1649	1372
EZEKIEL HOUGHTON	MI	5.0	1641	1468
STEFANO LEE	ON	5.0	1411	1523
ANVIT RAO	MI	5.0	1365	1554

Export the results to a CSV file

# Write the data to CSV file
write_csv(
  x = DT,
  file = "c:\\tmp\\chess_tournament_player_output.csv",
  quote_escape = "double"
)

Player scored the most points relative expected result

# Which player scored the most points relative to his or her expected result?
Point_Diff <- Post_Rating - Pre_Rating

elo_rating = data.table (
  Player_Name,
  Pre_Rating,
  Post_Rating,
  Point_Diff,
  Avg_Opponent_Pre_Rating
)


elo_rating <- elo_rating[order(-Point_Diff)]

ELO ratings
Player_Name	Pre_Rating	Post_Rating	Point_Diff	Avg_Opponent_Pre_Rating
JACOB ALEXANDER LAVALLEY	377	1076	699	1358
ADITYA BAJAJ	1384	1640	256	1564
ZACHARY JAMES HOUGHTON	1220	1416	196	1484
ANVIT RAO	1365	1544	179	1554
ETHAN GUO	935	1092	157	1495
STEFANO LEE	1411	1564	153	1523
DAKSHESH DARURI	1553	1663	110	1469
AMIYATOSH PWNANANDAM	980	1077	97	1385
TEJAS AYYAGARI	1011	1097	86	1356
MICHAEL R ALDRICH	1229	1300	71	1357
SHIVAM JHA	1056	1111	55	1296
HANSHI ZUO	1655	1690	35	1501
PATRICK H SCHILLING	1716	1744	28	1574
SEAN M MC	853	878	25	1319
GARY DEE SWATHELL	1649	1673	24	1372
VIRAJ MOHILE	917	941	24	1391
JEZZEL FARKAS	955	979	24	1327
GARY HUA	1794	1817	23	1605
JULIA SHEN	967	984	17	1330
EZEKIEL HOUGHTON	1641	1657	16	1468
BRIAN LIU	1423	1439	16	1539
SIDDHARTH JHA	1355	1367	12	1388
MIKE NIKITIN	1604	1613	9	1386
BRADLEY SHAW	1610	1618	8	1515
ALAN BUI	1363	1371	8	1214
KENNETH J TACK	1663	1670	7	1506
DIPANKAR ROY	1564	1570	6	1426
SOFIA ADINA	1507	1513	6	1522
ASHWIN BALAJI	1530	1535	5	1186
HANSEN SONG	1686	1687	1	1519
MICHAEL JEFFERY THOMAS	1399	1400	1	1375
DAVID SUNDEEN	1600	1600	0	1480
JUSTIN D SCHILLING	1199	1199	0	1327
DINH DANG BUI	1563	1562	-1	1470
FOREST ZHANG	1348	1346	-2	1391
TORRANCE HENRY JR	1666	1662	-4	1498
JOSHUA PHILIP MATHEWS	1441	1433	-8	1379
GAURAV GIDWANI	1552	1539	-13	1222
MARISA RICCI	1153	1140	-13	1414
MICHAEL LU	1092	1079	-13	1363
MAX ZHU	1579	1564	-15	1507
CAMERON WILLIAM MC	1712	1696	-16	1468
RONALD GRZEGORCZYK	1629	1610	-19	1499
ERIC WRIGHT	1362	1341	-21	1392
JOEL R HENDON	1436	1413	-23	1430
ALEX KONG	1186	1163	-23	1406
JASON ZHENG	1595	1569	-26	1411
EUGENE L MCCLURE	1555	1529	-26	1300
JADE GE	1449	1421	-28	1277
MICHAEL J MARTIN	1291	1259	-32	1286
JOSE C YBARRA	1393	1359	-34	1345
ROBERT GLEN VASEY	1283	1244	-39	1107
JOSHUA DAVID LEE	1438	1392	-46	1150
DANIEL KHAIN	1382	1335	-47	1356
RISHI SHETTY	1494	1444	-50	1260
THOMAS JOSEPH HOSMER	1175	1125	-50	1350
DEREK YAN	1242	1191	-51	1152
BEN LI	1163	1112	-51	1263
KYLE WILLIAM MURPHY	1403	1341	-62	1248
LOREN SCHWIEBERT	1745	1681	-64	1363
LARRY HODGE	1270	1200	-70	1206
JARED GE	1332	1256	-76	1150
GEORGE AVERY JONES	1522	1444	-78	1144
CHIEDOZIE OKORIE	1602	1508	-94	1314

PLayer with the highest difference between his/her Pre-rating and Post-rating

ELO ratings
Player_Name	Pre_Rating	Post_Rating	Point_Diff	Avg_Opponent_Pre_Rating
JACOB ALEXANDER LAVALLEY	377	1076	699	1358