#Loading packages
library(stringr)
library(knitr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
# Load the raw tournament cross-table from a local (machine-specific) path.
# read.csv() treats the first line of the file as the header, which is why
# the result prints as a single oddly named column.
chessData <- read.csv(file = "/Users/aaliyahmjh/Downloads/tournamentinfo.txt")
head(chessData)
## X.........................................................................................
## 1 Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
## 2 Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
## 3 -----------------------------------------------------------------------------------------
## 4 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 5 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## 6 -----------------------------------------------------------------------------------------
# Drop the three leading rows (two header lines plus the dashed separator).
# Row-indexing the single-column data frame returns a plain character vector.
tidyData <- chessData[-seq_len(3), ]
head(tidyData)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [5] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [6] "-----------------------------------------------------------------------------------------"
# Each player occupies a group of three lines. Keep the first line of every
# group (positions 1, 4, 7, ...): pair number, name, total points and the
# per-round results. The rating line and the dashed separator are skipped.
playerGameInfo <- tidyData[seq(from = 1, to = length(tidyData), by = 3)]
head(playerGameInfo, n = 10)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
## [7] " 7 | GARY DEE SWATHELL |5.0 |W 57|W 46|W 13|W 11|L 1|W 9|L 2|"
## [8] " 8 | EZEKIEL HOUGHTON |5.0 |W 3|W 32|L 14|L 9|W 47|W 28|W 19|"
## [9] " 9 | STEFANO LEE |5.0 |W 25|L 18|W 59|W 8|W 26|L 7|W 20|"
## [10] " 10 | ANVIT RAO |5.0 |D 16|L 19|W 55|W 31|D 6|W 25|W 18|"
# Keep the second line of every three-line group (positions 2, 5, 8, ...):
# state, USCF ID and the pre/post ratings.
stateAndRatings <- tidyData[seq(from = 2, to = length(tidyData), by = 3)]
head(stateAndRatings, n = 10)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
## [7] " MI | 11146376 / R: 1649 ->1673 |N:3 |W |B |W |B |B |W |W |"
## [8] " MI | 15142253 / R: 1641P17->1657P24 |N:3 |B |W |B |W |B |W |W |"
## [9] " ON | 14954524 / R: 1411 ->1564 |N:2 |W |B |W |B |W |B |B |"
## [10] " MI | 14150362 / R: 1365 ->1544 |N:3 |W |W |B |B |W |B |W |"
# Sequential ids, one per player record, derived from the data instead of a
# hard-coded 1:64 so the script still works if the player count changes.
id <- seq_along(playerGameInfo)
# The player name is the second "|"-delimited field of each record. Taking the
# whole field (rather than matching two or three space-terminated words) keeps
# names with hyphens or more than three words intact.
name <- str_trim(str_split_fixed(playerGameInfo, "\\|", 3)[, 2])
glimpse(name)
## chr [1:64] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" ...
# Total points: the first "digits, literal period, digit" token on the record
# (e.g. 6.0, 5.5). The period is escaped on purpose: an unescaped "." matches
# any character, so a three-digit pair number would be picked up instead.
points <- as.numeric(str_extract(playerGameInfo, "\\d+\\.\\d"))
glimpse(points)
## num [1:64] 6 6 6 5.5 5.5 5 5 5 5 5 ...
# Pre-tournament rating: the digits that directly follow the "R:" marker.
# Anchoring on "R:" avoids relying on the USCF ID being longer than four
# digits, accepts ratings of any length, and stops before a provisional
# suffix such as "P17".
preRating <- as.integer(
  str_extract(str_extract(stateAndRatings, "R:\\s*\\d+"), "\\d+")
)
glimpse(preRating)
## int [1:64] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 ...
# State/province code: the first two consecutive word characters on the
# rating line (e.g. ON, MI).
state <- stateAndRatings %>%
  str_extract("\\w\\w")
glimpse(state)
## chr [1:64] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" ...
# Round results that name an opponent end in "<digits>|" (e.g. "39|").
# Collect those tokens for every player; the result is a list with one
# character vector per player.
temp <- str_extract_all(playerGameInfo, "\\d+\\|")
# Strip the trailing "|" so that only the opponent numbers remain (still as
# strings). Each player ends up with a list holding one entry per token.
opponentId <- lapply(temp, function(tokens) str_extract_all(tokens, "\\d+"))
# For every player, look up the pre-rating of each opponent (opponent numbers
# are used as positions in preRating) and average them, rounded to a whole
# number. vapply() builds the result vector directly, so there is no
# accumulator seeded with a count, no 1:length() loop that misbehaves on empty
# input, and no dependency on the separately defined id vector.
avgOppRatings <- vapply(
  opponentId,
  function(opponents) {
    round(mean(preRating[as.numeric(unlist(opponents))]), digits = 0)
  },
  numeric(1)
)
head(avgOppRatings)
## [1] 1605 1469 1564 1574 1501 1519
# Assemble the final table, assigning the report column names at construction
# time instead of renaming afterwards.
chessDf <- tibble(
  Name = name,
  State = state,
  Points = points,
  PreRating = preRating,
  AverageOpponentRating = avgOppRatings
)
head(chessDf)
## # A tibble: 6 × 5
## Name State Points PreRating AverageOpponentRating
## <chr> <chr> <dbl> <int> <dbl>
## 1 GARY HUA ON 6 1794 1605
## 2 DAKSHESH DARURI MI 6 1553 1469
## 3 ADITYA BAJAJ MI 6 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5 1686 1519
# Export the summary table to the working directory, without row names.
write.csv(x = chessDf, file = "tournamentinfo.csv", row.names = FALSE)
How do the players’ pre-ratings correlate with their points?
# Top ten players by total points. The column is named "Points" (capitalised);
# sorting on lower-case `points` only worked because a global vector of that
# name happened to exist in the same row order as the table.
chessDf %>%
  arrange(desc(Points)) %>%
  head(10)
## # A tibble: 10 × 5
## Name State Points PreRating AverageOpponentRating
## <chr> <chr> <dbl> <int> <dbl>
## 1 GARY HUA ON 6 1794 1605
## 2 DAKSHESH DARURI MI 6 1553 1469
## 3 ADITYA BAJAJ MI 6 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5 1686 1519
## 7 GARY DEE SWATHELL MI 5 1649 1372
## 8 EZEKIEL HOUGHTON MI 5 1641 1468
## 9 STEFANO LEE ON 5 1411 1523
## 10 ANVIT RAO MI 5 1365 1554
Michigan was overwhelmingly represented at this tournament compared with the other states, with most of its players rated just over 1500 (on the stronger side).
# Count the number of players per state/province.
table(chessDf[["State"]])
##
## MI OH ON
## 55 1 8
library(ggplot2)
# Histogram of pre-tournament ratings (15 bins), coloured by state and drawn
# in a separate panel for each state.
ggplot(data = chessDf, mapping = aes(x = PreRating, fill = State)) +
  geom_histogram(bins = 15) +
  facet_wrap(vars(State)) +
  labs(title = "Pre-Rating Distribution by State")
I wanted to check whether the top 10 positions change when players are
also ranked by their average opponent rating (that is, rewarding high
points earned against stronger opponents).
# Re-rank the table: highest points first, ties broken by the higher average
# opponent rating. The sorted table replaces chessDf.
chessDf <- arrange(chessDf, desc(Points), desc(AverageOpponentRating))
head(chessDf, n = 10)
## # A tibble: 10 × 5
## Name State Points PreRating AverageOpponentRating
## <chr> <chr> <dbl> <int> <dbl>
## 1 GARY HUA ON 6 1794 1605
## 2 ADITYA BAJAJ MI 6 1384 1564
## 3 DAKSHESH DARURI MI 6 1553 1469
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 ANVIT RAO MI 5 1365 1554
## 7 STEFANO LEE ON 5 1411 1523
## 8 HANSEN SONG OH 5 1686 1519
## 9 EZEKIEL HOUGHTON MI 5 1641 1468
## 10 GARY DEE SWATHELL MI 5 1649 1372
# Scatterplot of total points against pre-tournament rating, overlaid with
# the least-squares line from regressing Points on PreRating (in red).
with(
  chessDf,
  plot(
    x = PreRating,
    y = Points,
    xlab = "Pre-Tournament Rating",
    ylab = "Total Points",
    main = "Scatterplot of Rating vs. Points"
  )
)
abline(reg = lm(Points ~ PreRating, data = chessDf), col = "red")