The objective of this project is to process a text file containing chess tournament results, extract the needed information from it, and create an R script that generates a .CSV file with that information.
The provided chess tournament results text file was uploaded to Github, and it will be loaded from there.
Let's load the data from the text file as a character vector:
# Read the tournament results straight from GitHub; each line of the file
# becomes one element of the character vector.
tournamentdata <- readLines(
  con = "https://raw.githubusercontent.com/aaitelmouden/DATA607S2020/master/project1/tournamentinfo.txt"
)
Let's check the head of the data:
# Preview the first 16 lines: the four header lines plus the first four players.
head(x = tournamentdata, n = 16)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
## [11] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [12] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [13] "-----------------------------------------------------------------------------------------"
## [14] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [15] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [16] "-----------------------------------------------------------------------------------------"
##Clean up the data##
The stringr package provides a cohesive set of functions designed to make working with strings as easy as possible.
library(stringr)
# Blank out every line that consists solely of dashes (the separator rows);
# all other lines pass through unchanged. stringr's str_replace_all() could be
# used for the same job.
nodash <- sub(pattern = "^-+$", replacement = "", x = tournamentdata)
tail(nodash)
## [1] " 63 | THOMAS JOSEPH HOSMER |1.0 |L 2|L 48|D 49|L 43|L 45|H |U |"
## [2] " MI | 15057092 / R: 1175 ->1125 | |W |B |W |B |B | | |"
## [3] ""
## [4] " 64 | BEN LI |1.0 |L 22|D 30|L 31|D 49|L 46|L 42|L 54|"
## [5] " MI | 15006561 / R: 1163 ->1112 | |B |W |W |B |W |B |B |"
## [6] ""
# Drop the lines that were emptied when the separators were blanked out.
# nchar() is vectorised, so it can be applied to the whole vector directly.
noblanklines <- nodash[nchar(nodash) > 0]
head(noblanklines)
## [1] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [2] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [3] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [4] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [5] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [6] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
Players Names Table
# Each player occupies two consecutive lines; the odd-numbered lines carry the
# pair number, name, total points and round results.
# A recycled logical mask picks every odd line for any file length, instead of
# assuming the cleaned file has exactly 130 lines.
PlayerName <- noblanklines[c(TRUE, FALSE)]
head(PlayerName)
## [1] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [2] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [3] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [4] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [5] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [6] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
Players ID’s Table
(The same approach is used to extract the rows that contain the player IDs.)
# The even-numbered lines carry the state, USCF ID and ratings for each player.
# A recycled logical mask picks every even line for any file length, instead of
# assuming the cleaned file has exactly 130 lines.
PlayerID <- noblanklines[c(FALSE, TRUE)]
head(PlayerID)
## [1] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [4] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [5] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [6] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
Remove header rows from PlayerName and PlayerID
# The first element is the column-header line, not a player; discard it.
PlayerNameNoHeader <- PlayerName[-1L]
head(PlayerNameNoHeader)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
# The first element is the second column-header line, not a player; discard it.
PlayerIDNoHeader <- PlayerID[-1L]
head(PlayerIDNoHeader)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
Combine all the data about each player into a single row
# Join each player's two rows into one record. paste0() is already vectorised,
# so no mapply() wrapper is needed; mapply() also attached meaningless names
# ("" followed by NA) because `sep = ""` was the first `...` argument it saw.
Combinedata <- paste0(PlayerNameNoHeader, PlayerIDNoHeader)
head(Combinedata)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4| ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7| MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12| MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1| MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17| MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21| OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
Extract USCFID
# Take the first run of eight consecutive digits in each combined record as
# the player's USCF ID.
USCFID <- str_extract(Combinedata, "[0-9]{8}")
USCFID
## [1] "15445895" "14598900" "14959604" "12616049" "14601533" "15055204"
## [7] "11146376" "15142253" "14954524" "14150362" "12581589" "12681257"
## [13] "15082995" "10131499" "15619130" "10295068" "10297702" "11342094"
## [19] "14862333" "14529060" "15495066" "12405534" "15030142" "13469010"
## [25] "12486656" "15131520" "14476567" "14882954" "15323285" "12577178"
## [31] "15131618" "14073750" "14691842" "15051807" "14601397" "14773163"
## [37] "15489571" "15108523" "12923035" "14892710" "15761443" "14462326"
## [43] "14101068" "15323504" "15372807" "15490981" "12533115" "14369165"
## [49] "12531685" "14773178" "15205474" "14918803" "12578849" "12836773"
## [55] "15412571" "14679887" "15113330" "14700365" "12841036" "14579262"
## [61] "15771592" "15219542" "15057092" "15006561"
Extract Player ID
# The pair number is the first token of every record, so anchor the match to
# the start of the string and accept any number of digits.
# The previous pattern "[\\s{3}]\\d{1,2}[\\s\\|]" was fragile: "[\\s{3}]" is a
# character class (whitespace, "{", "3" or "}"), not "three spaces", and with a
# three-digit pair number the unanchored match could fall through to a round
# cell such as " 39|".
PlayerID <- str_trim(str_extract(Combinedata, "^\\s*\\d+"))
head(PlayerID)
## [1] "1" "2" "3" "4" "5" "6"
Extract player Names
# The player's name is the whole first field enclosed by pipes, i.e. the text
# between the first and second "|" of the record.
# Taking the field as a unit keeps names intact when they contain hyphens,
# apostrophes or periods, and also handles single-word names; the previous
# pattern "(\\w+\\s){2,5}" required two to five plain words and would cut such
# names short or miss them.
Name <- str_trim(str_extract(Combinedata, "(?<=\\|)[^|]+(?=\\|)"))
head(Name)
## [1] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ"
## [4] "PATRICK H SCHILLING" "HANSHI ZUO" "HANSEN SONG"
Extract Player States
# The state code is the first pair of letters on each player's second row.
State <- str_extract(PlayerIDNoHeader, "[[:alpha:]]{2}")
head(State)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
Extract the Total Number of Points
# Total points is the first "digit.digit" token in the record (e.g. "6.0").
# The value is kept as a character string.
NbrPoints <- str_extract(Combinedata, "[0-9]\\.[0-9]")
head(NbrPoints)
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0"
Extract the Player’s Pre-Rating
# Two-stage extraction of the pre-tournament rating:
#   1. find the first whitespace-led run of 3-4 digits followed by a non-digit;
#   2. keep only the digits, dropping the leading whitespace and the trailing
#      character (for example a "P" suffix on the rating), then convert to
#      integer.
PreRating <- as.integer(
  str_extract(
    str_extract(Combinedata, "\\s\\d{3,4}[^\\d]"),
    "\\d+"
  )
)
head(PreRating)
## [1] 1794 1553 1384 1716 1655 1686
In this step we’ll extract the Opponents data by Player ID
# Pull the run of round-result cells (e.g. "|W 39|W 21|...|") out of each
# player's first row.
# With simplify = TRUE, str_extract_all() already returns a character matrix
# with one row per input string, so wrapping it in unlist() did nothing.
# The placeholder `opponents <- matrix(ncol = 7)` that used to sit here was
# removed: `opponents` is assigned from scratch in the following step before
# it is ever read.
opponentData <- str_extract_all(
  PlayerNameNoHeader,
  "([\\|][A-Z]([[:space:]]+)\\d*[\\|])([A-Z]([[:space:]]+)\\d*[\\|])*",
  simplify = TRUE
)
head(opponentData)
## [,1]
## [1,] "|W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2,] "|W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3,] "|L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4,] "|W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5,] "|W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6,] "|W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
Get the individual Opponent Indexes into a matrix of 7 columns
# Split every player's round string into its opponent pair numbers.
# simplify = TRUE yields a character matrix with one row per player; rows with
# fewer numbers than the widest row are padded with "".
opponents <- str_extract_all(opponentData[, ], "\\d+", simplify = TRUE)
head(opponents)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] "39" "21" "18" "14" "7" "12" "4"
## [2,] "63" "58" "4" "17" "16" "20" "7"
## [3,] "8" "61" "25" "21" "11" "13" "12"
## [4,] "23" "28" "2" "26" "5" "19" "1"
## [5,] "45" "37" "12" "13" "4" "14" "17"
## [6,] "34" "29" "11" "35" "10" "27" "21"
In this step, for each row of opponent indexes, we match each index with the corresponding pre-rating and compute the average opponent rating for that row.
# Average pre-tournament rating of each player's opponents.
#
# Inputs : `opponents` - character matrix of opponent pair numbers, one row per
#                        player, "" where no opponent was recorded.
#          `PreRating` - pre-ratings, indexed by pair number.
# Result : `RatingAverages` - one-column data frame (`avgerage`, the column
#          name the original loop produced) with the rounded mean opponent
#          pre-rating per player; NaN for a player with no opponents.
#
# The work is done on the whole matrix at once rather than in a nested loop
# that grew a data frame with rbind() and shadowed base::sum(). local() keeps
# the helper variables out of the global environment.
RatingAverages <- local({
  opp_index <- as.integer(opponents)  # "" becomes NA; numbers parsed as decimal
  played <- !is.na(opp_index)

  opp_rating <- PreRating[opp_index]  # look up each opponent's pre-rating
  opp_rating[!played] <- 0            # empty cells add nothing to the sum

  # Indexing dropped the matrix shape; restore it so rows are players again.
  dim(opp_rating) <- dim(opponents)
  dim(played) <- dim(opponents)

  data.frame(
    avgerage = round(rowSums(opp_rating) / rowSums(played), digits = 0)
  )
})
head(RatingAverages)
# Assemble one row per player from the extracted pieces. RatingAverages is
# passed last and contributes the opponents' average rating column.
ChessResults <- data.frame(
  PlayerID, Name, State, NbrPoints, PreRating,
  RatingAverages
)
head(ChessResults, n = 5)
Change the column names
# Give the columns human-readable headings for the report.
colnames(ChessResults) <- c(
  "PlayerID", "Player's Name", "State", "Total Number of Points",
  "Pre-Rating", "Average Rating of Opponents"
)
head(ChessResults, 5)
# row.names = FALSE: otherwise write.csv() emits the automatic 1..n row names
# as an extra, unnamed first column in the CSV.
write.csv(ChessResults, file = "ChessResults.csv", row.names = FALSE)