Goal of this project:

Given a text file with chess tournament results where the information has some structure, create an R Markdown file that generate a .csv file (that could for example be imported into s SQL database) with the following information for all of the players:

1) Read data

chess <- readLines("https://raw.githubusercontent.com/miachen410/DATA607/master/tournamentinfo.txt") #read .txt file from github using readLines function in base R

head(chess, 10)
##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------"
tail(chess, 10)
##  [1] "   62 | ASHWIN BALAJI                   |1.0  |W  55|U    |U    |U    |U    |U    |U    |"
##  [2] "   MI | 15219542 / R: 1530   ->1535     |     |B    |     |     |     |     |     |     |"
##  [3] "-----------------------------------------------------------------------------------------"
##  [4] "   63 | THOMAS JOSEPH HOSMER            |1.0  |L   2|L  48|D  49|L  43|L  45|H    |U    |"
##  [5] "   MI | 15057092 / R: 1175   ->1125     |     |W    |B    |W    |B    |B    |     |     |"
##  [6] "-----------------------------------------------------------------------------------------"
##  [7] "   64 | BEN LI                          |1.0  |L  22|D  30|L  31|D  49|L  46|L  42|L  54|"
##  [8] "   MI | 15006561 / R: 1163   ->1112     |     |B    |W    |W    |B    |W    |B    |B    |"
##  [9] "-----------------------------------------------------------------------------------------"
## [10] ""

2) Data Wrangling

As we can see, the data structure is somewhat complex. We will use the “stringr” and “rebus” packages to manipulate the strings.

library(stringr)
library(rebus)

Our target information are located in different rows, so we are going to assign data into two separate vectors for convinience.

part1 <- chess[seq(from = 5, to = length(chess)-1, by = 3)] #ignore first 4 rows, extract data from row 5 to second last row, by every 3 rows
part1 <- str_split(part1, pattern = "\\|", simplify = TRUE) #split strings, set simplify = TRUE to obtain a matrix format that's easier to work with
head(part1)
##      [,1]     [,2]                                [,3]    [,4]    [,5]   
## [1,] "    1 " " GARY HUA                        " "6.0  " "W  39" "W  21"
## [2,] "    2 " " DAKSHESH DARURI                 " "6.0  " "W  63" "W  58"
## [3,] "    3 " " ADITYA BAJAJ                    " "6.0  " "L   8" "W  61"
## [4,] "    4 " " PATRICK H SCHILLING             " "5.5  " "W  23" "D  28"
## [5,] "    5 " " HANSHI ZUO                      " "5.5  " "W  45" "W  37"
## [6,] "    6 " " HANSEN SONG                     " "5.0  " "W  34" "D  29"
##      [,6]    [,7]    [,8]    [,9]    [,10]   [,11]
## [1,] "W  18" "W  14" "W   7" "D  12" "D   4" ""   
## [2,] "L   4" "W  17" "W  16" "W  20" "W   7" ""   
## [3,] "W  25" "W  21" "W  11" "W  13" "W  12" ""   
## [4,] "W   2" "W  26" "D   5" "W  19" "D   1" ""   
## [5,] "D  12" "D  13" "D   4" "W  14" "W  17" ""   
## [6,] "L  11" "W  35" "D  10" "W  27" "W  21" ""
part2 <- chess[seq(from = 6, to = length(chess), by = 3)] #ignore first 5 rows, extract data from row 6 to last row, by every 3 rows
part2 <- str_split(part2, pattern = "\\|", simplify = TRUE) #split strings, set simplify = TRUE to obtain a matrix that's easier to work with
head(part2)
##      [,1]     [,2]                                [,3]    [,4]    [,5]   
## [1,] "   ON " " 15445895 / R: 1794   ->1817     " "N:2  " "W    " "B    "
## [2,] "   MI " " 14598900 / R: 1553   ->1663     " "N:2  " "B    " "W    "
## [3,] "   MI " " 14959604 / R: 1384   ->1640     " "N:2  " "W    " "B    "
## [4,] "   MI " " 12616049 / R: 1716   ->1744     " "N:2  " "W    " "B    "
## [5,] "   MI " " 14601533 / R: 1655   ->1690     " "N:2  " "B    " "W    "
## [6,] "   OH " " 15055204 / R: 1686   ->1687     " "N:3  " "W    " "B    "
##      [,6]    [,7]    [,8]    [,9]    [,10]   [,11]
## [1,] "W    " "B    " "W    " "B    " "W    " ""   
## [2,] "B    " "W    " "B    " "W    " "B    " ""   
## [3,] "W    " "B    " "W    " "B    " "W    " ""   
## [4,] "W    " "B    " "W    " "B    " "B    " ""   
## [5,] "B    " "W    " "B    " "W    " "B    " ""   
## [6,] "W    " "B    " "B    " "W    " "B    " ""

3) Create Vectors

name <- str_trim(c(part1[ ,2])) #assign all values from second column in matrix 'part1' to vector 'name', and trim white spaces
head(name)
## [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
## [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"
total <- str_trim(c(part1[ ,3])) #assign all values from third column in matrix 'part1' to vector 'total', and trim white spaces
head(total)
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0"
state <- str_trim(c(part2[ ,1])) #assign all values from first column in matrix 'part2' to vector 'state', and trim white spaces
head(state)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
rating <- "R:" %R% one_or_more(SPACE) %R% one_or_more(DIGIT) %R% optional("P") #pattern of pre-rating
pre_rating <- c(str_extract(str_extract(part2[, 2], pattern = rating), pattern = "\\d+")) #extract pre-rating values from matrix 'part 2', digits(one or more) only
pre_rating <- as.numeric(pre_rating) #change from character to numbers
head(pre_rating)
## [1] 1794 1553 1384 1716 1655 1686

4) Combining Vectors

Create a data.frame “tournament” to store the vectors “name, total, state, pre_rating” into columns “PlayerName, PlayerState, TotalPoints, PreRating”; Insert an empty column named “AvgOppPreRating” (Average Opponent Pre-Rating)

tournament <- data.frame("PlayerName" = name, "PlayerState" = state, "TotalPoints" = total, "PreRating" = pre_rating)
tournament$AvgOppPreRating <- NA #assign NA so there is no value in the column
head(tournament)
##            PlayerName PlayerState TotalPoints PreRating AvgOppPreRating
## 1            GARY HUA          ON         6.0      1794              NA
## 2     DAKSHESH DARURI          MI         6.0      1553              NA
## 3        ADITYA BAJAJ          MI         6.0      1384              NA
## 4 PATRICK H SCHILLING          MI         5.5      1716              NA
## 5          HANSHI ZUO          MI         5.5      1655              NA
## 6         HANSEN SONG          OH         5.0      1686              NA

5) Create a matrix to store opponents’ IDs for each player

opponents <- matrix(str_extract_all(part1[,4:10], "\\d+", simplify = TRUE), ncol = 7) #extract all digits from 4th to 10th columns in matrix 'part1', assign into a matrix with 7 columns- one for each opponent number
head(opponents)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] "39" "21" "18" "14" "7"  "12" "4" 
## [2,] "63" "58" "4"  "17" "16" "20" "7" 
## [3,] "8"  "61" "25" "21" "11" "13" "12"
## [4,] "23" "28" "2"  "26" "5"  "19" "1" 
## [5,] "45" "37" "12" "13" "4"  "14" "17"
## [6,] "34" "29" "11" "35" "10" "27" "21"

6) Calculate Average by a For Loop

for(i in 1:nrow(tournament)){
  tournament$AvgOppPreRating[i] <- round(mean(pre_rating[as.numeric(opponents[i,])], na.rm = TRUE))
}
#calculate the mean from each row in opponents corresponding to each player, round the mean and assign each mean to the corresponding column 'AvgOppPreRating' in data.frame 'tournament'

7) Sample Output in Table

load “knitr”" package and make a table of tournament using function “kable”

library(knitr) 
kable(head(tournament)) 
PlayerName PlayerState TotalPoints PreRating AvgOppPreRating
GARY HUA ON 6.0 1794 1605
DAKSHESH DARURI MI 6.0 1553 1469
ADITYA BAJAJ MI 6.0 1384 1564
PATRICK H SCHILLING MI 5.5 1716 1574
HANSHI ZUO MI 5.5 1655 1501
HANSEN SONG OH 5.0 1686 1519

8) Generate a .CSV File

write.csv(tournament, file = "tournamentinfo.csv")