# Attach every package the script needs, installing any that are missing.
# Checking with requireNamespace() and then always calling library() ensures a
# freshly installed package is actually attached; the previous
# `if (!require(...)) install.packages(...)` form installed the package but
# left it unloaded for the rest of the run.
for (pkg in c('DT', 'stringr', 'ggplot2')) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
# Remote location of the tournament cross-table.
sourcefile <- 'https://raw.githubusercontent.com/ahussan/DATA_607_CUNY_SPS/master/Project%201/tournamentinfo.txt'
# Prefer a local copy in the working directory when there is one; otherwise
# read straight from `sourcefile` so the script still runs without a download.
localfile <- 'tournamentinfo.txt'
raw <- readLines(if (file.exists(localfile)) localfile else sourcefile)
head(raw)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
Our data is laid out in a table, so it will be helpful to narrow down the areas our regular expressions need to search.
We may be able to extract the necessary fields by matching patterns within certain sections of each line.
# Find the table breaks
# Line 5 is the first player row; the start positions of its first three '|'
# characters are used as column boundaries for every row.
separators <- str_locate_all(pattern = '\\|', raw[5])[[1]]
b0 <- 0
b1 <- unname(separators[1, "start"])
b2 <- unname(separators[2, "start"])
b3 <- unname(separators[3, "start"])
# The right-hand edge is the width of the longest line.
b4 <- max(nchar(raw))
In the initial data we can find required data in the following string positions in each line:
Now that we have located the column breaks, we will divide the rows into two groups so that each kind of row can be searched separately.
# Both groups step through the file three lines at a time, up to line 196.
# Group1 = Num, Name, Points, Opponent IDs (starts at line 5)
g1row <- seq(from = 5, to = 196, by = 3)
# Group2 = State, Rating (starts at line 6)
g2row <- seq(from = 6, to = 196, by = 3)
# Pull out each group's lines so later extraction works on whole vectors
group2 <- raw[g2row]
group1 <- raw[g1row]
Now we need to extract our desired fields.
# Player name: the text between the first and second column breaks of each
# group-1 row, stripped of padding and converted to title case.
namesub <- str_trim(substr(group1, start = b1 + 1, stop = b2 - 2))
PlayerName <- str_to_title(namesub)
head(PlayerName)
## [1] "Gary Hua" "Dakshesh Daruri" "Aditya Bajaj"
## [4] "Patrick H Schilling" "Hanshi Zuo" "Hansen Song"
# State: everything before the first column break of each group-2 row,
# with the surrounding padding removed.
statesub <- substr(group2, start = b0, stop = b1 - 1)
State <- str_trim(string = statesub)
head(State)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
# Start the result table with one row per player (name and state)
chess <- data.frame(PlayerName = PlayerName, State = State)
# Raw total-points text: between the second and third column breaks
pointsub <- substr(group1, start = b2 + 1, stop = b3 - 1)
head(pointsub)
## [1] "6.0 " "6.0 " "6.0 " "5.5 " "5.5 " "5.0 "
# Total points are stored as text formatted to one decimal place
chess[["TotalPoints"]] <- sprintf(fmt = "%.1f", as.numeric(pointsub))
# The pre-rating follows a colon in the second column of each group-2 row;
# keep the colon for now so the rating is not confused with other digit runs.
presub <- str_extract(substr(group2, start = b1 + 1, stop = b2 - 1), ': *\\d{2,}')
head(presub)
## [1] ": 1794" ": 1553" ": 1384" ": 1716" ": 1655" ": 1686"
# Drop the leading colon and keep the digits as an integer rating
chess[["PreRating"]] <- as.integer(str_extract(presub, '\\d{2,}'))
# Opponent pair numbers: every digit run to the right of the third column
# break of each group-1 row, held as a one-column list matrix (one row per
# player) so it can be walked row by row.
oppsub <- as.matrix(str_extract_all(substr(group1, start = b3 + 1, stop = b4), '\\b\\d{1,}'))
# Average pre-rating of a player's opponents, rounded to a whole number.
#
# Arguments:
#   l        Opponent pair numbers for one player: a character/numeric vector,
#            or a list of such vectors (the shape apply() passes for one row
#            of a list matrix).
#   p        Optional index into `l`. When omitted, all of `l` is used.
#   ratings  Pre-ratings indexed by pair number. Defaults to chess$PreRating,
#            which is looked up only when the function is called.
#
# Returns a single numeric value: the rounded mean rating of all selected
# opponents, or NA when there are no opponents to average (instead of
# dividing zero by zero). An opponent number with no matching rating makes
# the result NA.
calculation <- function(l, p, ratings = chess$PreRating) {
  # Handle the omitted-index case explicitly rather than relying on how `[`
  # treats a missing argument.
  selected <- if (missing(p)) l else l[p]
  # Flatten so every selected opponent contributes to the average, whether
  # `l` arrived as a list or as a plain vector.
  opponent_ids <- as.integer(unlist(selected, use.names = FALSE))
  if (length(opponent_ids) == 0) {
    return(NA_real_)
  }
  round(sum(ratings[opponent_ids]) / length(opponent_ids))
}
# One average opponent pre-rating per player (one row of oppsub each)
chess[["AvgOppPreRating"]] <- apply(X = oppsub, MARGIN = 1, FUN = calculation)
# Interactive view of the finished table
datatable(chess)
# export
write.csv(chess, file = "chessData.csv", row.names = FALSE)
# Scatter plot of each player's pre-rating against the average pre-rating of
# their opponents, coloured by total points
x <- ggplot(data = chess, mapping = aes(x = PreRating, y = AvgOppPreRating)) +
  geom_point(mapping = aes(color = TotalPoints)) +
  ggtitle("Pre-Rating VS Avg Opponent Pre-Rating by Total Points Gained")
x