library(stringr)
library(sqldf)
library(datasets)
#install.packages('DT')
library(DT)
In this project, a text (tournamentinfo.txt) file was given with chess tournament results where the information has some structure.
Below is an example of the chess tournament table structure. It is a .txt file with PIPE Delimited character
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
## [11] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [12] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [13] "-----------------------------------------------------------------------------------------"
My bellow process create a R Markdown file that generates a .CSV file which could for example be imported into a SQL database, with the following information for all of the players: (Total # of Players are - 64 )
For the first player, the information would be:
a | b | c | d | e | f | g |
---|---|---|---|---|---|---|
ID | Name | State | Total Pts | Pre Rating | Post Rating | Opp Avg |
1 | GARY HUA | ON | 6 | 1794 | 1817 | 1605 |
1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.
The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments.
~ Connect and load the tournamentinfo.txt
~ Analysis of the data structure
~ Stage and Separate data rows using Regular Expression
url <- "https://raw.githubusercontent.com/kfolsom98/DATA607/master/tournamentinfo.txt"
#url <- "https://raw.githubusercontent.com/mathsanu/CUNY_MSDA/master/DATA607/project1/tournamentinfo.txt"
download_file <- "tournamentinfo.txt" # name of the file on the local machine after download
# the file will be downloaded to the working directory
downloader::download(url, download_file)
# read the file into a vector
conn <- file(download_file, open="r")
tournamentinfo <- readLines(conn, warn = FALSE)
#tournamentinfo
#tournamentinfo[1:13]
close(conn)
length(tournamentinfo)
## [1] 196
head(tournamentinfo)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
As seen above, we have to remove the header i.e row 1 to row 4.Without removing header and subsequent code will retrieve Player Info and Rating Info.
# Remove lines with "-"
toinfo.stg <- str_replace_all(str_replace_all(tournamentinfo, "->", ">>"), "-{3,}", "")
toinfo.stg <- tournamentinfo[toinfo.stg != ""]
toinfo.stg<- toinfo.stg[-(1:2)]
# determine the format by looking at the contents of the first field in the record
toinfo.players <- toinfo.stg[str_detect(str_sub(toinfo.stg, 1, 6), "[0-9]")]
toinfo.ratings <- toinfo.stg[str_detect(str_sub(toinfo.stg, 1, 6), "[A-Z]{2,2}")]
head(toinfo.players, 5)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
head(toinfo.ratings, 5)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
pairNo <- as.integer(str_extract(toinfo.players, "\\d+"))
Name <- str_trim(str_extract(toinfo.players, "(\\w+\\s){2,3}"))
Region <- str_extract(toinfo.ratings, "\\w+")
Points <- as.numeric(str_extract(toinfo.players, "\\d+\\.\\d+"))
Rating <- as.integer(str_extract(str_extract(toinfo.ratings, "[^\\d]\\d{3,4}[^\\d]"), "\\d+"))
Opponents <- str_extract_all(str_extract_all(toinfo.players, "\\d+\\|"), "\\d+")
Won <- str_count(toinfo.players, "\\Q|W \\E")
Loose <- str_count(toinfo.players, "\\Q|L \\E")
Draw <- str_count(toinfo.players, "\\Q|D \\E")
# # of of Players
length(toinfo.players)
## [1] 64
# # of of ratings
length(toinfo.ratings)
## [1] 64
mRating <- length(toinfo.players)
for (i in 1:length(toinfo.players)) {
mRating[i] <- round(mean(Rating[as.numeric(unlist(Opponents[pairNo[i]]))]), digits = 0)
}
opData <- data.frame(Name, Region, Points, Rating, mRating, Won, Loose, Draw);
head(opData)
## Name Region Points Rating mRating Won Loose Draw
## 1 GARY HUA ON 6.0 1794 1605 5 0 2
## 2 DAKSHESH DARURI MI 6.0 1553 1469 6 1 0
## 3 ADITYA BAJAJ MI 6.0 1384 1564 6 1 0
## 4 PATRICK H SCHILLING MI 5.5 1716 1574 4 0 3
## 5 HANSHI ZUO MI 5.5 1655 1501 4 0 3
## 6 HANSEN SONG OH 5.0 1686 1519 4 1 2
# Display All the Pivoted Columns in a Table
colnames(opData) <- c("Player's Name", "Player's State", "Total Number of Points", "Player's Pre-Rating", " Average Pre Chess Rating of Opponents", "Won", "Lost", "Draw")
datatable(opData)
# Select only the specified columns to export
Final.df <- data.frame(Name , Region, Points, Rating, mRating);
colnames(Final.df ) <- c("Player's Name", "Player's State", "Total Number of Points", "Player's Pre-Rating", " Average Pre Chess Rating of Opponents")
head(Final.df)
## Player's Name Player's State Total Number of Points
## 1 GARY HUA ON 6.0
## 2 DAKSHESH DARURI MI 6.0
## 3 ADITYA BAJAJ MI 6.0
## 4 PATRICK H SCHILLING MI 5.5
## 5 HANSHI ZUO MI 5.5
## 6 HANSEN SONG OH 5.0
## Player's Pre-Rating Average Pre Chess Rating of Opponents
## 1 1794 1605
## 2 1553 1469
## 3 1384 1564
## 4 1716 1574
## 5 1655 1501
## 6 1686 1519
write.csv(Final.df, file = "stgChessInfo.csv")