The purpose of this project is to create an R Markdown file that parses a chess tournament cross table to extract specific columns, creates a column containing a calculated average measure, and finally generates a CSV file containing this aggregated information.
After review of the text file provided, the project presents the following main challenges:
Player information to be extracted is spread over two rows: the first row of each player record contains the player’s number, name and total number of points, and the second row contains the player’s state and pre-rating. Each record is followed by a separator row of dashes.
The input text file is formatted such that the data contains special characters (- and |) as well as extra spaces (both leading and trailing).
The average measure to be calculated depends on the ratings of a player’s opponents and is non-linear in nature.
library(stringr)
library(DT)
# Load the raw cross table as a character vector, one element per line.
tinfo <- readLines("tournamentinfo.txt")
## Warning in readLines("tournamentinfo.txt"): incomplete final line found on
## 'tournamentinfo.txt'
# Inspect the first four lines, which hold the table header.
head(tinfo, n = 4)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
# Blank out the dashed separator rows; every other row is left untouched.
# The pattern is anchored at both ends so that only rows consisting solely
# of dashes match (the previous pattern carried a stray trailing "|" that
# additionally matched the empty string at every position).
df <- str_replace_all(tinfo, "^-+$", "")
# Drop the four header rows (separator, two title rows, separator) so that
# the vector starts with the first player record.
df <- df[-(1:4)]
# Each record now occupies three elements: two data rows and an empty string.
head(df, 3)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] ""
# Records repeat every three elements of df, so step through it in threes:
# positions 1, 4, 7, ... and positions 2, 5, 8, ...
indexo <- seq(from = 1, to = length(df), by = 3)
indexe <- seq(from = 2, to = length(df), by = 3)
head(indexo, n = 3)
## [1] 1 4 7
head(indexe, n = 3)
## [1] 2 5 8
# Split df into the first row of every record (name) and the second row of
# every record (other_info).
name <- df[indexo]
other_info <- df[indexe]
head(name, n = 2)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
head(other_info, n = 2)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
# First row of each record: player number, player name and total points.
# Number and name are still raw strings at this point (surrounding spaces
# and the trailing "|" are included); total points is converted to numeric.
player_no. <- str_extract(
  string = name,
  pattern = "^\\s+(\\d+)\\s+\\|"
)
player_name <- str_extract(
  string = name,
  pattern = "\\s+([[:alpha:]- ]+)\\b\\s*\\|"
)
total_points <- as.numeric(
  str_extract(string = name, pattern = "[[:digit:]]+\\.[[:digit:]]")
)
# Second row of each record: the first two consecutive letters are taken as
# the state, and the pre-rating is taken together with its "R: " prefix.
state <- str_extract(
  string = other_info,
  pattern = "[[:alpha:]]{2}"
)
pre_rating <- str_extract(
  string = other_info,
  pattern = ".\\: \\s?[[:digit:]]{3,4}"
)
# Inspect the first two values of every extracted field.
head(player_no., n = 2)
## [1] " 1 |" " 2 |"
head(player_name, n = 2)
## [1] " GARY HUA |"
## [2] " DAKSHESH DARURI |"
head(total_points, n = 2)
## [1] 6 6
head(state, n = 2)
## [1] "ON" "MI"
head(pre_rating, n = 2)
## [1] "R: 1794" "R: 1553"
Substep 5.2 - Upon review we note that some of the fields are not formatted appropriately and might contain special characters or extra spaces. These need to be removed or replaced.
# Strip the "|" delimiter and the surrounding whitespace from the number and
# name fields; strip the "R: " prefix from the pre-rating and make it numeric.
player_no. <- trimws(
  str_remove(string = player_no., pattern = "[|]"),
  which = "both"
)
player_name <- trimws(
  str_remove(string = player_name, pattern = "[|]"),
  which = "both"
)
pre_rating <- as.numeric(
  trimws(str_remove(string = pre_rating, pattern = "R: "), which = "both")
)
# Collect the opponent pairing numbers from each player's result row: every
# run of one or two digits that sits directly in front of a "|". The
# look-ahead (?=\\|) requires the "|" without including it in the match, so
# the digits come back clean and no second extraction pass over the list
# (which relied on coercing the list to a string and raised a warning) is
# needed.
opponents <- str_extract_all(name, "[[:digit:]]{1,2}(?=\\|)")
# str_extract_all() returns a list with one character vector per player;
# convert each vector to numeric so it can be used to index pre_rating.
opponents <- lapply(opponents, as.numeric)
# Average pre-rating of each player's opponents. Some players have fewer
# than seven opponents, so the mean is taken over whichever opponents are
# listed for that player. Iterating over the list itself (rather than over
# 1:length(opponents)) avoids growing Avg inside a loop and yields an empty
# list, not a spurious NaN entry, when there are no players.
Avg <- lapply(
  opponents,
  # pre_rating is indexed by opponent number; round up to the next integer
  function(opp) ceiling(mean(pre_rating[opp]))
)
Average_oppnt_prerating <- lapply(Avg, as.numeric)
# Assemble the per-player results into one data frame; the list of averages
# is flattened into a plain vector for the fifth column.
finaldf <- data.frame(
  player_name,
  state,
  total_points,
  pre_rating,
  unlist(Average_oppnt_prerating)
)
# Give the fifth column the short name AOPR (average opponent pre-rating).
colnames(finaldf)[5] <- "AOPR"
# Inspect the first rows of the result.
head(finaldf)
## player_name state total_points pre_rating AOPR
## 1 GARY HUA ON 6.0 1794 1606
## 2 DAKSHESH DARURI MI 6.0 1553 1470
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519
# Export the table to a CSV file in the working directory.
write.csv(finaldf, file = "ChessPlayerData.csv")