Project 1 - Chess Tournament Results

The assignment is to take a text file containing chess tournament results and generate a .csv file containing the following information for every player in the file: the player’s name, the player’s state, the player’s total number of points, the player’s pre-tournament rating (pre-rating), and the average pre-rating of all of the player’s opponents.

I started by reading the text file into R as a character vector:

# Read every line of the tournament file into the raw_string character vector.
# Passing the path lets readLines() open and close the file itself, so no
# connection is left behind if the read fails. warn = FALSE silences the
# "incomplete final line" warning, which is raised only because the file has
# no trailing newline.
raw_string <- readLines("tournamentinfo.txt", warn = FALSE)

Looking at the first nine rows of the file, the challenges of extracting the desired data become apparent: the file has header records, separator rows, and each player’s information spans two rows.

# Preview the first nine raw lines of the file
head(raw_string, n = 9L)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round|"
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  |"
## [4] "-----------------------------------------------------------------------------------------"
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"

I decided to extract the data using the following methodology:

  1. Strip header rows and separator lines from the data
  2. Subset the odd rows from the remaining data
  3. Subset the even rows from the remaining data
  4. Combine the odd and even rows into a two-column data frame
  5. Parse the desired data from the two-column data frame, computing values where necessary
  6. Add columns to compute the average pre-rating of each player’s opponents
  7. Finalize data frame by selecting five desired columns
# Put the raw lines in a one-column data frame. The column is named explicitly
# here; as.data.frame() on a character vector does not use `col.names`, so the
# name previously came from the object name only.
raw_string_df <- data.frame(raw_string = raw_string, stringsAsFactors = FALSE)
# Drop the two header rows, the dashed separator rows and any blank lines
line_prefix <- substr(raw_string_df$raw_string, 1, 6)
is_noise <- line_prefix %in% c("------", " Pair ", " Num  ") |
  !nzchar(trimws(raw_string_df$raw_string))
raw_string_df_ss <- raw_string_df[!is_noise, , drop = FALSE]
# Every player occupies exactly two consecutive lines. An odd count means the
# file is malformed, so stop instead of pairing lines through recycling.
stopifnot(nrow(raw_string_df_ss) %% 2 == 0)
# seq_len() stays correct even when no player lines are left
is_first_line <- seq_len(nrow(raw_string_df_ss)) %% 2 == 1
# Vector of odd rows (pair number, name, points, round results)
c1 <- raw_string_df_ss$raw_string[is_first_line]
# Vector of even rows (state, USCF id, ratings)
c2 <- raw_string_df_ss$raw_string[!is_first_line]
# Combine c1 and c2 into a two-column data frame
raw_string_final <- data.frame(c1 = c1, c2 = c2, stringsAsFactors = FALSE)
# Now there is one row per player and the data is ready to parse
head(raw_string_final)
##                                                                                          c1
## 1     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 2     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## 3     3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|
## 4     4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|
## 5     5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|
## 6     6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|
##                                                                                          c2
## 1    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 2    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## 3    MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 4    MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |
## 5    MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## 6    OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |

Next, I loaded the ‘stringr’ package so I could use its functions to parse the data.

# Install stringr only when it is missing, so re-running the script does not
# reinstall the package each time. The CRAN cloud redirector is used over
# HTTPS instead of a single plain-HTTP mirror.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr", repos = "https://cloud.r-project.org")
}
library(stringr)

# The file is fixed-width, so every field is read by character position.
# Line 1 of each player (c1): pair number, name, points, round results.
# Line 2 of each player (c2): state, USCF id, pre- and post-rating.
line1 <- raw_string_final$c1
line2 <- raw_string_final$c2

# Extract player_id
player_id <- as.integer(substr(line1, 1, 5))
# Extract player name
player_name <- str_trim(substr(line1, 9, 40))
# Extract state
state <- str_trim(substr(line2, 1, 5))
# Extract points
pts <- as.double(substr(line1, 42, 46))
# Extract pre-rating (the digits that follow "R:")
pre_rating <- as.integer(substr(line2, 22, 26))
# Number of games played: count the W, L and D result codes in the portion of
# the string containing rounds 1-7; any other code is not counted
num_games <- str_count(substr(line1, 47, 89), "[WLD]")
# Extract Round 1 thru Round 7 opponents; each round cell is six characters
# wide, and a cell without an opponent number yields NA
rd1_opp <- as.integer(substr(line1, 49, 52))
rd2_opp <- as.integer(substr(line1, 55, 58))
rd3_opp <- as.integer(substr(line1, 61, 64))
rd4_opp <- as.integer(substr(line1, 67, 70))
rd5_opp <- as.integer(substr(line1, 73, 76))
rd6_opp <- as.integer(substr(line1, 79, 82))
rd7_opp <- as.integer(substr(line1, 85, 88))

At this point, all of the information needed to build the final data frame has been extracted. The next steps are to combine the extracted vectors into a data frame, calculate the average pre-rating of each player’s opponents, subset the final data frame, and write the final data frame to a .csv file.

# Combine the extracted vectors into a data frame. data.frame() keeps each
# column's own type; cbind() would first build a character matrix and turn
# points, ratings and opponent ids into strings.
chess_df_pre <- data.frame(
  player_id, player_name, state, pts, pre_rating, num_games,
  rd1_opp, rd2_opp, rd3_opp, rd4_opp, rd5_opp, rd6_opp, rd7_opp,
  stringsAsFactors = FALSE
)
# Opponent ids as a matrix: one row per player, one column per round
# (NA where the round had no opponent)
opp_cols <- paste0("rd", 1:7, "_opp")
opp_ids <- as.matrix(chess_df_pre[, opp_cols])
# Look up every opponent's pre-rating by player id in one vectorized step.
# match() resolves each cell separately, so a repeated opponent is counted
# once per game rather than once overall.
opp_ratings <- chess_df_pre$pre_rating[match(opp_ids, chess_df_pre$player_id)]
dim(opp_ratings) <- dim(opp_ids)
# Rounds without an opponent contribute nothing to the total; an opponent
# whose rating cannot be found stays NA so the problem remains visible
opp_ratings[is.na(opp_ids)] <- 0
# Create a new column populated with total pre-rating of opponents
chess_df_pre$total_opponents_pre_rating <- rowSums(opp_ratings)
# Create a new column populated with the average pre-rating of each player's
# opponents (total divided by games played, rounded to a whole number)
chess_df_pre$opp_ave_pre_rating <- round(
  chess_df_pre$total_opponents_pre_rating / chess_df_pre$num_games,
  digits = 0
)
# Subset desired columns
chess_df <- chess_df_pre[, c("player_name", "state", "pts", "pre_rating", "opp_ave_pre_rating")]
# Write result to .csv
write.csv(chess_df, file = "chess_df.csv", row.names = FALSE)
head(chess_df)
##           player_name state pts pre_rating opp_ave_pre_rating
## 1            GARY HUA    ON 6.0       1794               1605
## 2     DAKSHESH DARURI    MI 6.0       1553               1469
## 3        ADITYA BAJAJ    MI 6.0       1384               1564
## 4 PATRICK H SCHILLING    MI 5.5       1716               1574
## 5          HANSHI ZUO    MI 5.5       1655               1501
## 6         HANSEN SONG    OH 5.0       1686               1519