Given a text file with chess tournament results where the information has some structure, create an R Markdown file that generate a .csv file (that could for example be imported into s SQL database) with the following information for all of the players:
chess <- readLines("https://raw.githubusercontent.com/miachen410/DATA607/master/tournamentinfo.txt") #read .txt file from github using readLines function in base R
head(chess, 10)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
tail(chess, 10)
## [1] " 62 | ASHWIN BALAJI |1.0 |W 55|U |U |U |U |U |U |"
## [2] " MI | 15219542 / R: 1530 ->1535 | |B | | | | | | |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 63 | THOMAS JOSEPH HOSMER |1.0 |L 2|L 48|D 49|L 43|L 45|H |U |"
## [5] " MI | 15057092 / R: 1175 ->1125 | |W |B |W |B |B | | |"
## [6] "-----------------------------------------------------------------------------------------"
## [7] " 64 | BEN LI |1.0 |L 22|D 30|L 31|D 49|L 46|L 42|L 54|"
## [8] " MI | 15006561 / R: 1163 ->1112 | |B |W |W |B |W |B |B |"
## [9] "-----------------------------------------------------------------------------------------"
## [10] ""
As we can see, the data structure is somewhat complex. We will use the “stringr” and “rebus” packages to manipulate the strings.
library(stringr)
library(rebus)
Our target information are located in different rows, so we are going to assign data into two separate vectors for convinience.
part1 <- chess[seq(from = 5, to = length(chess)-1, by = 3)] #ignore first 4 rows, extract data from row 5 to second last row, by every 3 rows
part1 <- str_split(part1, pattern = "\\|", simplify = TRUE) #split strings, set simplify = TRUE to obtain a matrix format that's easier to work with
head(part1)
## [,1] [,2] [,3] [,4] [,5]
## [1,] " 1 " " GARY HUA " "6.0 " "W 39" "W 21"
## [2,] " 2 " " DAKSHESH DARURI " "6.0 " "W 63" "W 58"
## [3,] " 3 " " ADITYA BAJAJ " "6.0 " "L 8" "W 61"
## [4,] " 4 " " PATRICK H SCHILLING " "5.5 " "W 23" "D 28"
## [5,] " 5 " " HANSHI ZUO " "5.5 " "W 45" "W 37"
## [6,] " 6 " " HANSEN SONG " "5.0 " "W 34" "D 29"
## [,6] [,7] [,8] [,9] [,10] [,11]
## [1,] "W 18" "W 14" "W 7" "D 12" "D 4" ""
## [2,] "L 4" "W 17" "W 16" "W 20" "W 7" ""
## [3,] "W 25" "W 21" "W 11" "W 13" "W 12" ""
## [4,] "W 2" "W 26" "D 5" "W 19" "D 1" ""
## [5,] "D 12" "D 13" "D 4" "W 14" "W 17" ""
## [6,] "L 11" "W 35" "D 10" "W 27" "W 21" ""
part2 <- chess[seq(from = 6, to = length(chess), by = 3)] #ignore first 5 rows, extract data from row 6 to last row, by every 3 rows
part2 <- str_split(part2, pattern = "\\|", simplify = TRUE) #split strings, set simplify = TRUE to obtain a matrix that's easier to work with
head(part2)
## [,1] [,2] [,3] [,4] [,5]
## [1,] " ON " " 15445895 / R: 1794 ->1817 " "N:2 " "W " "B "
## [2,] " MI " " 14598900 / R: 1553 ->1663 " "N:2 " "B " "W "
## [3,] " MI " " 14959604 / R: 1384 ->1640 " "N:2 " "W " "B "
## [4,] " MI " " 12616049 / R: 1716 ->1744 " "N:2 " "W " "B "
## [5,] " MI " " 14601533 / R: 1655 ->1690 " "N:2 " "B " "W "
## [6,] " OH " " 15055204 / R: 1686 ->1687 " "N:3 " "W " "B "
## [,6] [,7] [,8] [,9] [,10] [,11]
## [1,] "W " "B " "W " "B " "W " ""
## [2,] "B " "W " "B " "W " "B " ""
## [3,] "W " "B " "W " "B " "W " ""
## [4,] "W " "B " "W " "B " "B " ""
## [5,] "B " "W " "B " "W " "B " ""
## [6,] "W " "B " "B " "W " "B " ""
name <- str_trim(c(part1[ ,2])) #assign all values from second column in matrix 'part1' to vector 'name', and trim white spaces
head(name)
## [1] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ"
## [4] "PATRICK H SCHILLING" "HANSHI ZUO" "HANSEN SONG"
total <- str_trim(c(part1[ ,3])) #assign all values from third column in matrix 'part1' to vector 'total', and trim white spaces
head(total)
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0"
state <- str_trim(c(part2[ ,1])) #assign all values from first column in matrix 'part2' to vector 'state', and trim white spaces
head(state)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
rating <- "R:" %R% one_or_more(SPACE) %R% one_or_more(DIGIT) %R% optional("P") #pattern of pre-rating
pre_rating <- c(str_extract(str_extract(part2[, 2], pattern = rating), pattern = "\\d+")) #extract pre-rating values from matrix 'part 2', digits(one or more) only
pre_rating <- as.numeric(pre_rating) #change from character to numbers
head(pre_rating)
## [1] 1794 1553 1384 1716 1655 1686
Create a data.frame “tournament” to store the vectors “name, total, state, pre_rating” into columns “PlayerName, PlayerState, TotalPoints, PreRating”; Insert an empty column named “AvgOppPreRating” (Average Opponent Pre-Rating)
tournament <- data.frame("PlayerName" = name, "PlayerState" = state, "TotalPoints" = total, "PreRating" = pre_rating)
tournament$AvgOppPreRating <- NA #assign NA so there is no value in the column
head(tournament)
## PlayerName PlayerState TotalPoints PreRating AvgOppPreRating
## 1 GARY HUA ON 6.0 1794 NA
## 2 DAKSHESH DARURI MI 6.0 1553 NA
## 3 ADITYA BAJAJ MI 6.0 1384 NA
## 4 PATRICK H SCHILLING MI 5.5 1716 NA
## 5 HANSHI ZUO MI 5.5 1655 NA
## 6 HANSEN SONG OH 5.0 1686 NA
opponents <- matrix(str_extract_all(part1[,4:10], "\\d+", simplify = TRUE), ncol = 7) #extract all digits from 4th to 10th columns in matrix 'part1', assign into a matrix with 7 columns- one for each opponent number
head(opponents)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] "39" "21" "18" "14" "7" "12" "4"
## [2,] "63" "58" "4" "17" "16" "20" "7"
## [3,] "8" "61" "25" "21" "11" "13" "12"
## [4,] "23" "28" "2" "26" "5" "19" "1"
## [5,] "45" "37" "12" "13" "4" "14" "17"
## [6,] "34" "29" "11" "35" "10" "27" "21"
for(i in 1:nrow(tournament)){
tournament$AvgOppPreRating[i] <- round(mean(pre_rating[as.numeric(opponents[i,])], na.rm = TRUE))
}
#calculate the mean from each row in opponents corresponding to each player, round the mean and assign each mean to the corresponding column 'AvgOppPreRating' in data.frame 'tournament'
load “knitr”" package and make a table of tournament using function “kable”
library(knitr)
kable(head(tournament))
| PlayerName | PlayerState | TotalPoints | PreRating | AvgOppPreRating |
|---|---|---|---|---|
| GARY HUA | ON | 6.0 | 1794 | 1605 |
| DAKSHESH DARURI | MI | 6.0 | 1553 | 1469 |
| ADITYA BAJAJ | MI | 6.0 | 1384 | 1564 |
| PATRICK H SCHILLING | MI | 5.5 | 1716 | 1574 |
| HANSHI ZUO | MI | 5.5 | 1655 | 1501 |
| HANSEN SONG | OH | 5.0 | 1686 | 1519 |
write.csv(tournament, file = "tournamentinfo.csv")