Problem Statement

A document containing information from a chess tournament is provided as an unstructured text file. The code below attempts to tidy the raw string data into a normalized table that is exported as a comma-separated values file. The information to extract for each chess player includes name, state, pre-tournament rating, points accumulated during the tournament, and average rating of the seven opponents faced during the tournament. Tidyverse packages including dplyr and stringr are primarily used to tidy the dataset. The raw data can be located at the GitHub repository and was loaded using version control in an RStudio Project.

Load Packages

library(readr)
library(tidyverse)

Load tournament results text file

The file can be loaded with the read.table function.

# Read the tournament file from the project working directory, one raw text
# line per element. The file has no commas, so sep = "," keeps each line as a
# single field; quote = "" and comment.char = "" stop read.table from
# interpreting apostrophes, double quotes, or '#' inside a line.
# Only the separator rows (lines made up entirely of dashes) are blanked out,
# which keeps the three-lines-per-player layout intact while leaving hyphens
# inside player names (and the "->" rating arrow) untouched.
data.in <- read.table(
  'tournamentinfo.txt',
  header = FALSE,
  sep = ",",
  quote = "",
  comment.char = "",
  stringsAsFactors = FALSE
) %>%
  unlist(use.names = FALSE) %>%
  str_replace('^\\s*-+\\s*$', '')

# Discard the four leading header/separator elements, keeping only the
# per-player lines, then preview the first few of them.
raw.data <- tail(data.in, -4)
raw.data %>% head()
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   >1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
## [3] ""                                                                                         
## [4] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [5] "   MI | 14598900 / R: 1553   >1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [6] ""
# Compact summary of the object: its type, length, and leading elements.
raw.data %>% str()
##  chr [1:192] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" ...

Loop through the raw.data object

There are 192 rows, 2 strings for each of the 64 players plus a blank row in between. To store the information that is requested, a for loop will be applied to the raw data object. For each player the requested information will be extracted and stored in a blank table.

# Create an all-NA table with one row per player and the six requested
# columns. Each player occupies three consecutive elements of raw.data (two
# data rows plus a blank separator), so the row count is derived from the data
# instead of being hard-coded. Column names are attached to the matrix before
# conversion so as_tibble() does not have to repair missing names.
table.store <- matrix(
  NA,
  nrow = length(raw.data) %/% 3,
  ncol = 6,
  dimnames = list(
    NULL,
    c('PlayerID', 'PlayerName', 'State', 'Points', 'PreRating', 'OpponentRating')
  )
) %>%
  as_tibble()

# Fill one row of table.store per player. Player i is described by elements
# (3i - 2) and (3i - 1) of raw.data; element 3i is the blank separator.
for (i in seq_len(nrow(table.store))) {

  # Glue the player's two rows together and split on the pipe delimiter.
  # Resulting fields: 2 = name, 3 = total points, 4-10 = round results,
  # 11 = state, 12 = "USCF ID / R: pre-rating > post-rating".
  player.info <- str_c(raw.data[(i * 3) - 2], raw.data[(i * 3) - 1]) %>%
    str_split('\\|') %>%
    unlist()

  # Players are listed in pair-number order, so the loop index is the ID
  table.store[i, 'PlayerID'] <- i
  table.store[i, 'PlayerName'] <- str_trim(player.info[2], side = 'both')
  table.store[i, 'State'] <- str_trim(player.info[11], side = 'both')
  table.store[i, 'Points'] <- str_trim(player.info[3], side = 'both')

  # Pre-rating is the run of digits that directly follows "R:". Anchoring on
  # that label avoids depending on how many digit groups the USCF ID produces
  # and ignores any provisional suffix such as "P17".
  table.store[i, 'PreRating'] <-
    str_match(player.info[12], 'R:\\s*(\\d+)')[, 2]
}
# examine the results
table.store
## # A tibble: 64 x 6
##    PlayerID PlayerName          State Points PreRating OpponentRating
##       <int> <chr>               <chr> <chr>  <chr>     <lgl>         
##  1        1 GARY HUA            ON    6.0    1794      NA            
##  2        2 DAKSHESH DARURI     MI    6.0    1553      NA            
##  3        3 ADITYA BAJAJ        MI    6.0    1384      NA            
##  4        4 PATRICK H SCHILLING MI    5.5    1716      NA            
##  5        5 HANSHI ZUO          MI    5.5    1655      NA            
##  6        6 HANSEN SONG         OH    5.0    1686      NA            
##  7        7 GARY DEE SWATHELL   MI    5.0    1649      NA            
##  8        8 EZEKIEL HOUGHTON    MI    5.0    1641      NA            
##  9        9 STEFANO LEE         ON    5.0    1411      NA            
## 10       10 ANVIT RAO           MI    5.0    1365      NA            
## # ... with 54 more rows

One more loop

The only requested information that is missing is the average rating of each player's opponents. To calculate this average, each player's opponent IDs are looked up against the main table to retrieve the opponents' pre-tournament ratings.

# Average pre-tournament rating of the opponents each player actually faced.
# PreRating is stored as text, so convert it once up front.
pre.rating <- as.numeric(table.store$PreRating)

table.store$OpponentRating <- vapply(
  seq_len(nrow(table.store)),
  function(i) {
    # First row of the player's record; fields 4-10 hold the round results
    # in the form "<result> <opponent id>"
    round.cells <- raw.data[(i * 3) - 2] %>%
      str_split('\\|') %>%
      unlist()

    # One opponent ID per round; rounds with no digits (no opponent) give NA
    # and are dropped. '\\d+' keeps IDs of any width in one piece.
    opp.id <- round.cells[4:10] %>%
      str_extract('\\d+') %>%
      as.numeric()
    opp.id <- opp.id[!is.na(opp.id)]

    # Look up each opponent's pre-rating by PlayerID and average them
    opp.rating <- pre.rating[match(opp.id, table.store$PlayerID)]
    round(mean(opp.rating), 0)
  },
  numeric(1)
)
table.store %>% print(n = nrow(table.store))
## # A tibble: 64 x 6
##    PlayerID PlayerName                State Points PreRating OpponentRating
##       <int> <chr>                     <chr> <chr>  <chr>              <dbl>
##  1        1 GARY HUA                  ON    6.0    1794                1605
##  2        2 DAKSHESH DARURI           MI    6.0    1553                1469
##  3        3 ADITYA BAJAJ              MI    6.0    1384                1564
##  4        4 PATRICK H SCHILLING       MI    5.5    1716                1574
##  5        5 HANSHI ZUO                MI    5.5    1655                1501
##  6        6 HANSEN SONG               OH    5.0    1686                1519
##  7        7 GARY DEE SWATHELL         MI    5.0    1649                1372
##  8        8 EZEKIEL HOUGHTON          MI    5.0    1641                1468
##  9        9 STEFANO LEE               ON    5.0    1411                1523
## 10       10 ANVIT RAO                 MI    5.0    1365                1554
## 11       11 CAMERON WILLIAM MC LEMAN  MI    4.5    1712                1468
## 12       12 KENNETH J TACK            MI    4.5    1663                1506
## 13       13 TORRANCE HENRY JR         MI    4.5    1666                1498
## 14       14 BRADLEY SHAW              MI    4.5    1610                1515
## 15       15 ZACHARY JAMES HOUGHTON    MI    4.5    1220                1484
## 16       16 MIKE NIKITIN              MI    4.0    1604                1386
## 17       17 RONALD GRZEGORCZYK        MI    4.0    1629                1499
## 18       18 DAVID SUNDEEN             MI    4.0    1600                1480
## 19       19 DIPANKAR ROY              MI    4.0    1564                1426
## 20       20 JASON ZHENG               MI    4.0    1595                1411
## 21       21 DINH DANG BUI             ON    4.0    1563                1470
## 22       22 EUGENE L MCCLURE          MI    4.0    1555                1300
## 23       23 ALAN BUI                  ON    4.0    1363                1214
## 24       24 MICHAEL R ALDRICH         MI    4.0    1229                1357
## 25       25 LOREN SCHWIEBERT          MI    3.5    1745                1363
## 26       26 MAX ZHU                   ON    3.5    1579                1507
## 27       27 GAURAV GIDWANI            MI    3.5    1552                1222
## 28       28 SOFIA ADINA STANESCUBELLU MI    3.5    1507                1522
## 29       29 CHIEDOZIE OKORIE          MI    3.5    1602                1314
## 30       30 GEORGE AVERY JONES        ON    3.5    1522                1144
## 31       31 RISHI SHETTY              MI    3.5    1494                1260
## 32       32 JOSHUA PHILIP MATHEWS     ON    3.5    1441                1379
## 33       33 JADE GE                   MI    3.5    1449                1277
## 34       34 MICHAEL JEFFERY THOMAS    MI    3.5    1399                1375
## 35       35 JOSHUA DAVID LEE          MI    3.5    1438                1150
## 36       36 SIDDHARTH JHA             MI    3.5    1355                1388
## 37       37 AMIYATOSH PWNANANDAM      MI    3.5    980                 1385
## 38       38 BRIAN LIU                 MI    3.0    1423                1539
## 39       39 JOEL R HENDON             MI    3.0    1436                1430
## 40       40 FOREST ZHANG              MI    3.0    1348                1391
## 41       41 KYLE WILLIAM MURPHY       MI    3.0    1403                1248
## 42       42 JARED GE                  MI    3.0    1332                1150
## 43       43 ROBERT GLEN VASEY         MI    3.0    1283                1107
## 44       44 JUSTIN D SCHILLING        MI    3.0    1199                1327
## 45       45 DEREK YAN                 MI    3.0    1242                1152
## 46       46 JACOB ALEXANDER LAVALLEY  MI    3.0    377                 1358
## 47       47 ERIC WRIGHT               MI    2.5    1362                1392
## 48       48 DANIEL KHAIN              MI    2.5    1382                1356
## 49       49 MICHAEL J MARTIN          MI    2.5    1291                1286
## 50       50 SHIVAM JHA                MI    2.5    1056                1296
## 51       51 TEJAS AYYAGARI            MI    2.5    1011                1356
## 52       52 ETHAN GUO                 MI    2.5    935                 1495
## 53       53 JOSE C YBARRA             MI    2.0    1393                1345
## 54       54 LARRY HODGE               MI    2.0    1270                1206
## 55       55 ALEX KONG                 MI    2.0    1186                1406
## 56       56 MARISA RICCI              MI    2.0    1153                1414
## 57       57 MICHAEL LU                MI    2.0    1092                1363
## 58       58 VIRAJ MOHILE              MI    2.0    917                 1391
## 59       59 SEAN M MC CORMICK         MI    2.0    853                 1319
## 60       60 JULIA SHEN                MI    1.5    967                 1330
## 61       61 JEZZEL FARKAS             ON    1.5    955                 1327
## 62       62 ASHWIN BALAJI             MI    1.0    1530                1186
## 63       63 THOMAS JOSEPH HOSMER      MI    1.0    1175                1350
## 64       64 BEN LI                    MI    1.0    1163                1263

Export as a csv

The table is now normalized and can be written to GitHub as a .csv

# Export the tidy table. row.names = FALSE keeps the automatic row index out
# of the file; it would otherwise appear as an extra unnamed first column
# duplicating PlayerID.
write.csv(table.store, "TournamentResults.csv", row.names = FALSE)

Best Player of the Tournament

Based on the results, which player had the strongest tournament? This should be considered the player that tallied the most points while facing the strongest competition.

# Strongest tournament: first keep the player(s) with the highest point total,
# then break ties by the toughest average opposition. Filtering in two steps
# means a result is returned even when the top scorer did not also face the
# single highest-rated field. Points is stored as text, so it is compared
# numerically rather than alphabetically.
table.store %>%
  filter(as.numeric(Points) == max(as.numeric(Points))) %>%
  filter(OpponentRating == max(OpponentRating))
## # A tibble: 1 x 6
##   PlayerID PlayerName State Points PreRating OpponentRating
##      <int> <chr>      <chr> <chr>  <chr>              <dbl>
## 1        1 GARY HUA   ON    6.0    1794                1605