#Loading packages
library(stringr)
library(knitr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readr)

Import the file

# Read the raw tournament text file from the local machine. read.csv() takes
# the first line of the file (a row of dashes) as the header, so the result is
# a one-column data frame holding one row per remaining line of text.
chessData <- read.csv("/Users/aaliyahmjh/Downloads/tournamentinfo.txt")
head(chessData)

##   X.........................................................................................
## 1  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 2  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 3  -----------------------------------------------------------------------------------------
## 4      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 5     ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 6  -----------------------------------------------------------------------------------------

Tidying the data

# Drop the first three rows (the two column-header lines and the dashed
# separator). Subsetting the single-column data frame this way returns a plain
# character vector with one element per remaining line.
tidyData <- chessData[-seq_len(3), ]
head(tidyData)

## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [5] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "-----------------------------------------------------------------------------------------"

# Each player occupies three consecutive lines: a game line (pair number, name,
# total points, per-round results), a state/ratings line, and a dashed
# separator. Keep the game line of every record, i.e. lines 1, 4, 7, ...
playerGameInfo <- tidyData[seq(from = 1, to = length(tidyData), by = 3)]
head(playerGameInfo, n = 10)

##  [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
##  [2] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
##  [3] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
##  [4] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
##  [5] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"
##  [6] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"
##  [7] "    7 | GARY DEE SWATHELL               |5.0  |W  57|W  46|W  13|W  11|L   1|W   9|L   2|"
##  [8] "    8 | EZEKIEL HOUGHTON                |5.0  |W   3|W  32|L  14|L   9|W  47|W  28|W  19|"
##  [9] "    9 | STEFANO LEE                     |5.0  |W  25|L  18|W  59|W   8|W  26|L   7|W  20|"
## [10] "   10 | ANVIT RAO                       |5.0  |D  16|L  19|W  55|W  31|D   6|W  25|W  18|"

# Keep the second line of every three-line player record (lines 2, 5, 8, ...),
# which carries the state, USCF ID and pre/post ratings.
stateAndRatings <- tidyData[seq(from = 2, to = length(tidyData), by = 3)]
head(stateAndRatings, n = 10)

##  [1] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
##  [2] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
##  [3] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
##  [4] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
##  [5] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
##  [6] "   OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |"
##  [7] "   MI | 11146376 / R: 1649   ->1673     |N:3  |W    |B    |W    |B    |B    |W    |W    |"
##  [8] "   MI | 15142253 / R: 1641P17->1657P24  |N:3  |B    |W    |B    |W    |B    |W    |W    |"
##  [9] "   ON | 14954524 / R: 1411   ->1564     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
## [10] "   MI | 14150362 / R: 1365   ->1544     |N:3  |W    |W    |B    |B    |W    |B    |W    |"

Building the dataframe

# Player ids are the positions of the game lines, derived from the data rather
# than hard-coding the number of players.
# NOTE(review): assumes pair numbers are consecutive from 1 in file order, as
# in the listing of playerGameInfo - confirm for the full file.
id <- seq_along(playerGameInfo)
# The player name is the second pipe-delimited field of each game line. Taking
# the whole field (instead of matching two or three "word" tokens) keeps names
# with four or more parts, hyphens or apostrophes intact.
name <- str_trim(str_split_fixed(playerGameInfo, "\\|", n = 3)[, 2])
glimpse(name)

##  chr [1:64] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" ...

# Total points: the first number written with a decimal point on each game
# line (e.g. 6.0, 5.5). The period is escaped so it matches a literal "."
# rather than any character.
points <- as.numeric(str_extract(playerGameInfo, "\\d+\\.\\d"))
glimpse(points)

##  num [1:64] 6 6 6 5.5 5.5 5 5 5 5 5 ...

# Pre-tournament rating: the integer that follows the "R:" marker on the
# state/ratings line. Anchoring on "R:" avoids relying on the USCF ID being too
# long to be mistaken for a rating, and a provisional suffix such as "P17" is
# excluded because only the digits are captured.
preRating <- as.integer(str_match(stateAndRatings, "R:\\s*(\\d+)")[, 2])
glimpse(preRating)

##  int [1:64] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 ...

# State/province code: the first run of two word characters on each
# state/ratings line (e.g. "ON", "MI").
state <- stateAndRatings %>% str_extract("\\w\\w")
glimpse(state)

##  chr [1:64] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" ...

# Collect every "<digits>|" token on each game line, i.e. the opponent pair
# numbers that close a round cell. The result is a list with one character
# vector per player.
temp <- str_extract_all(playerGameInfo, "\\d+\\|")

# Strip the trailing "|" from each token so only the opponent pair numbers
# (still as strings) remain for every player.
opponentId <- lapply(temp, str_extract_all, pattern = "\\d+")

Calculate average opponent rating

# For every player, look up the pre-rating of each opponent (opponent pair
# numbers index directly into preRating) and take the mean, rounded to a whole
# number. vapply() returns a numeric vector with one value per player, so no
# manually grown accumulator or index loop is needed. A player with no recorded
# opponents yields NaN.
avgOppRatings <- vapply(
  opponentId,
  function(opponents) {
    round(mean(preRating[as.numeric(unlist(opponents))]), digits = 0)
  },
  numeric(1)
)
head(avgOppRatings)

## [1] 1605 1469 1564 1574 1501 1519

# Assemble the final table, naming the columns as they are created.
chessDf <- tibble(
  Name = name,
  State = state,
  Points = points,
  PreRating = preRating,
  AverageOpponentRating = avgOppRatings
)
head(chessDf)

## # A tibble: 6 × 5
##   Name                State Points PreRating AverageOpponentRating
##   <chr>               <chr>  <dbl>     <int>                 <dbl>
## 1 GARY HUA            ON       6        1794                  1605
## 2 DAKSHESH DARURI     MI       6        1553                  1469
## 3 ADITYA BAJAJ        MI       6        1384                  1564
## 4 PATRICK H SCHILLING MI       5.5      1716                  1574
## 5 HANSHI ZUO          MI       5.5      1655                  1501
## 6 HANSEN SONG         OH       5        1686                  1519

# Save the table as CSV in the working directory, without a row-name column.
write.csv(x = chessDf, file = "tournamentinfo.csv", row.names = FALSE)

EDA

How do the players’ pre-ratings correlate with their points?

Top 10 performers based on points

# Rank by the tibble's own Points column. Lower-case `points` is not a column
# of chessDf, so it would be looked up outside the data frame (the standalone
# vector built earlier) and would only sort correctly while that vector still
# lines up with the rows.
chessDf %>%
  arrange(desc(Points)) %>%
  head(10)

## # A tibble: 10 × 5
##    Name                State Points PreRating AverageOpponentRating
##    <chr>               <chr>  <dbl>     <int>                 <dbl>
##  1 GARY HUA            ON       6        1794                  1605
##  2 DAKSHESH DARURI     MI       6        1553                  1469
##  3 ADITYA BAJAJ        MI       6        1384                  1564
##  4 PATRICK H SCHILLING MI       5.5      1716                  1574
##  5 HANSHI ZUO          MI       5.5      1655                  1501
##  6 HANSEN SONG         OH       5        1686                  1519
##  7 GARY DEE SWATHELL   MI       5        1649                  1372
##  8 EZEKIEL HOUGHTON    MI       5        1641                  1468
##  9 STEFANO LEE         ON       5        1411                  1523
## 10 ANVIT RAO           MI       5        1365                  1554

Michigan had overwhelmingly more representation at this tournament than the other states, with most of its players rated just over 1500 (on the stronger side).

# Number of players per state.
table(chessDf[["State"]])

## 
## MI OH ON 
## 55  1  8

# Histogram of pre-tournament ratings (15 bins), drawn as one panel per state.
library(ggplot2)
chessDf %>%
  ggplot(aes(x = PreRating, fill = State)) +
  geom_histogram(bins = 15) +
  facet_wrap(~ State) +
  labs(title = "Pre-Rating Distribution by State")

I wanted to check whether the top 10 positions changed when players were also ranked by their average opponent rating (rewarding high points earned against stronger opponents).

# Re-rank the table: highest points first, with ties broken in favour of the
# higher average opponent rating.
chessDf <- arrange(chessDf, desc(Points), desc(AverageOpponentRating))

head(chessDf, n = 10)

## # A tibble: 10 × 5
##    Name                State Points PreRating AverageOpponentRating
##    <chr>               <chr>  <dbl>     <int>                 <dbl>
##  1 GARY HUA            ON       6        1794                  1605
##  2 ADITYA BAJAJ        MI       6        1384                  1564
##  3 DAKSHESH DARURI     MI       6        1553                  1469
##  4 PATRICK H SCHILLING MI       5.5      1716                  1574
##  5 HANSHI ZUO          MI       5.5      1655                  1501
##  6 ANVIT RAO           MI       5        1365                  1554
##  7 STEFANO LEE         ON       5        1411                  1523
##  8 HANSEN SONG         OH       5        1686                  1519
##  9 EZEKIEL HOUGHTON    MI       5        1641                  1468
## 10 GARY DEE SWATHELL   MI       5        1649                  1372

# Scatter pre-tournament rating against total points, then overlay the
# least-squares line of Points on PreRating in red.
with(
  chessDf,
  plot(PreRating, Points,
       xlab = "Pre-Tournament Rating",
       ylab = "Total Points",
       main = "Scatterplot of Rating vs. Points")
)
abline(reg = lm(Points ~ PreRating, data = chessDf), col = "red")

Many of the top 10 positions were swapped when players were also ordered by the strength of their opponents - this suggests that some players may have had an easier route to their points.
When examining the correlation between the pre-ratings and the points, I noticed that the players’ pre-tournament ratings usually lined up with their scores (higher ratings, higher scores; average ratings, average scores). Of course, there were some outliers, which could be chalked up to a higher-rated player having a bad day or missing some games, or a lower-rated player having a great day or easier opponents.
As I mentioned before, MI dominated in participation, which makes me believe that there might be more active chess clubs/programs in that state, while OH had only 1 participant, which could mean the opposite. It is, however, admirable that the representative from OH was able to finish in the top 10.

Project1_EDA_Data607

Aaliyah John-Harry

2025-02-27

Import the file

Tidying the data

Building the dataframe

Calculate average opponent rating

EDA

Top 10 performers based on points