The assignment is to take a text file containing chess tournament results and generate a .csv containing the following information for all of the players in the file: the player’s name, the player’s state, their total number of points, the player’s pre-rating, and the average pre-chess rating of all their opponents.
I started by reading the text file into R as a character vector:
# Read the tournament file into raw_string, one element per line of text.
# Passing the path (rather than a hand-managed connection) lets readLines()
# open and close the file itself, so nothing is left allocated if the read
# fails. warn = FALSE silences the harmless "incomplete final line" warning
# that is raised because the file does not end with a newline.
raw_string <- readLines("tournamentinfo.txt", warn = FALSE)
Looking at the first nine rows of the file, the challenges of extracting the desired data become apparent: the file has header records, separator rows, and each player’s information spans two rows.
# Preview the first nine raw lines: headers, separators, and two player records
head(raw_string, n = 9)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|"
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |"
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
I decided to extract the data using the following methodology:
# Convert character vector to a one-column data frame
raw_string_df <- data.frame(raw_string = raw_string, stringsAsFactors = FALSE)
# Subset to exclude separator rows, header rows and blank lines.
# Headers are recognised by their first word instead of a fixed-width prefix:
# comparing a 6-character substr() against a literal of a different length
# (such as " Num ") can never match and would let that header row through,
# which in turn shifts the odd/even pairing below.
is_separator <- startsWith(raw_string_df$raw_string, "------")
is_header <- grepl("^[[:space:]]*(Pair|Num)[[:space:]|]", raw_string_df$raw_string)
is_blank <- !nzchar(trimws(raw_string_df$raw_string))
raw_string_df_ss <- raw_string_df[!(is_separator | is_header | is_blank), , drop = FALSE]
# Every player must contribute exactly two rows; stop early if the layout is
# not what the pairing below expects rather than silently recycling values.
n_rows <- nrow(raw_string_df_ss)
stopifnot(n_rows %% 2 == 0)
row_index <- seq_len(n_rows)
# Create vector of odd rows (pair number, name, points, round results)
c1 <- raw_string_df_ss$raw_string[row_index %% 2 == 1]
# Create vector of even rows (state, USCF ID, ratings)
c2 <- raw_string_df_ss$raw_string[row_index %% 2 == 0]
# Combine c1 and c2 into two-column data frame
raw_string_final <- data.frame(c1 = c1, c2 = c2, stringsAsFactors = FALSE)
# Now there is one row per player and the data is ready to parse
head(raw_string_final)
## c1
## 1 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 2 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## 3 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|
## 4 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|
## 5 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|
## 6 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|
## c2
## 1 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## 2 MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## 3 MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |
## 4 MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |
## 5 MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |
## 6 OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |
Next, I loaded the ‘stringr’ package so I could use its functions to parse the data.
# Install stringr only when it is missing, instead of re-downloading it on
# every run, and fetch it over HTTPS from the CRAN cloud mirror rather than a
# hard-coded plain-HTTP mirror.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr", repos = "https://cloud.r-project.org")
}
library(stringr)
# The two physical lines of each player's record, pulled out once so the
# fixed-width slices below read more easily
name_line <- raw_string_final$c1
info_line <- raw_string_final$c2

# Pair number (player id): columns 1-5 of the first line
player_id <- as.integer(substr(name_line, 1, 5))
# Player name: columns 9-40 of the first line, surrounding blanks removed
player_name <- str_trim(substr(name_line, 9, 40))
# State: columns 1-5 of the second line, surrounding blanks removed
state <- str_trim(substr(info_line, 1, 5))
# Total points: columns 42-46 of the first line
pts <- as.numeric(substr(name_line, 42, 46))
# Pre-rating: columns 22-26 of the second line
pre_rating <- as.integer(substr(info_line, 22, 26))

# Number of games played = how many W, L or D result codes appear in the
# portion of the first line that holds rounds 1-7 (columns 47-89)
round_cells <- substr(name_line, 47, 89)
num_games <- str_count(round_cells, "[WLD]")

# Opponent pair number for each of rounds 1-7 (NA where the slice is blank)
rd1_opp <- as.integer(substr(name_line, 49, 52))
rd2_opp <- as.integer(substr(name_line, 55, 58))
rd3_opp <- as.integer(substr(name_line, 61, 64))
rd4_opp <- as.integer(substr(name_line, 67, 70))
rd5_opp <- as.integer(substr(name_line, 73, 76))
rd6_opp <- as.integer(substr(name_line, 79, 82))
rd7_opp <- as.integer(substr(name_line, 85, 88))
At this point, all of the information needed to build the final data frame has been extracted. The next steps are to combine the extracted vectors into a data frame, calculate the average pre-chess rating of each player's opponents, subset the final data frame, and write the final data frame to a .csv file.
# Combine the extracted vectors into a data frame. data.frame() keeps each
# column's own type; going through cbind() would first build a character
# matrix and turn every number into a string (and into a quoted value in the
# .csv).
chess_df_pre <- data.frame(
  player_id, player_name, state, pts, pre_rating, num_games,
  rd1_opp, rd2_opp, rd3_opp, rd4_opp, rd5_opp, rd6_opp, rd7_opp,
  stringsAsFactors = FALSE
)
# Look up every opponent's pre-rating in one vectorised step.
# opp_ids is a players x 7 matrix of opponent ids (NA where no opponent was
# recorded for that round). match() maps each id to the row of that opponent,
# so an opponent met more than once is counted once per game, consistent with
# num_games. incomparables = NA keeps a missing opponent from matching anything.
opp_ids <- as.matrix(chess_df_pre[, 7:13])
opp_pre_ratings <- matrix(
  chess_df_pre$pre_rating[
    match(opp_ids, chess_df_pre$player_id, incomparables = NA)
  ],
  nrow = nrow(opp_ids),
  ncol = ncol(opp_ids)
)
# Total pre-rating of each player's opponents (rounds without an opponent
# contribute nothing)
ttl_vect <- rowSums(opp_pre_ratings, na.rm = TRUE)
# Create a new column populated with total pre-rating of opponents
chess_df_pre$total_opponents_pre_rating <- ttl_vect
# Create a new column populated with the average pre-chess rating of each
# player's opponents, rounded to the nearest whole number
chess_df_pre$opp_ave_pre_rating <- round(
  chess_df_pre$total_opponents_pre_rating / chess_df_pre$num_games,
  digits = 0
)
# Subset desired columns
chess_df <- chess_df_pre[
  , c("player_name", "state", "pts", "pre_rating", "opp_ave_pre_rating")
]
# Write result to .csv
write.csv(chess_df, file = "chess_df.csv", row.names = FALSE)
head(chess_df)
## player_name state pts pre_rating opp_ave_pre_rating
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519