Overview

This project analyzes the results of a chess tournament with 64 players by transforming raw data extracted from a text file into a structured dataset, with the goal of understanding the relationship between players’ pre-tournament ratings and their actual tournament performance. The raw data was spread across multiple lines per player, so I had to parse, clean, and restructure it before the analysis. The goal was to create a dataset with each player’s name, state, total points, pre-tournament rating, and the average pre-tournament rating of their opponents. I also created a lookup table to calculate the opponents’ statistics. To compare every player’s performance with their pre-tournament rating, I also created a scatter plot highlighting the top 5 performers.

Load libraries

library(readr)
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)

Load the data from my GitHub repository

# Location of the raw tournament cross table (plain text) on GitHub.
raw_url <- "https://raw.githubusercontent.com/JDO-MSDS/DATA-607/refs/heads/main/Project%201/chess.txt"

# Read the file as a character vector with one element per line, then
# preview the first 15 lines to inspect the layout.
chess <- raw_url |>
  readr::read_lines()
chess |>
  head(n = 15)
##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------" 
## [11] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|" 
## [12] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
## [13] "-----------------------------------------------------------------------------------------" 
## [14] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|" 
## [15] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"

Extract the information

# ---- Parse the tournament cross table into one row per player ----
#
# After a 4-line header, every player occupies two data lines followed by a
# dashed separator line:
#   line i     : pair number | name | total points | round 1 | ... | round 7 |
#   line i + 1 : state | USCF ID / R: pre-rating -> post-rating | ...

# strsplit() drops the empty string after the trailing "|", so a player line
# splits into 10 fields: pair number, name, total points, then rounds 1-7.
# The round results are therefore fields 4 to 10 (starting at 5 would skip
# round 1 and bias every opponent average).
round_fields <- 4:10

# Turn one two-line player record into a list holding the pair number, name,
# state, total points, pre-tournament rating and the opponents' pair numbers.
parse_player <- function(player_line, state_line) {
  player_fields <- trimws(strsplit(player_line, "\\|")[[1]])
  state_fields <- trimws(strsplit(state_line, "\\|")[[1]])

  # Only entries matching W, L or D followed by a number are treated as
  # played games; anything else (or a missing field) gives NA and is dropped.
  opponents <- as.numeric(
    stringr::str_match(player_fields[round_fields], "[WLD]\\s*(\\d+)")[, 2]
  )

  # Keep only the digits right after "R:", i.e. the pre-tournament rating.
  pre_rating <- as.numeric(
    stringr::str_match(state_fields[2], "R:\\s*(\\d+)")[, 2]
  )

  list(
    player_num = as.numeric(player_fields[1]),
    name = player_fields[2],
    state = state_fields[1],
    points = as.numeric(player_fields[3]),
    pre_rating = pre_rating,
    opponents = opponents[!is.na(opponents)]
  )
}

# Mean pre-tournament rating (rounded to a whole number) of the given
# opponents. `rating_by_player` is a numeric vector named by pair number.
# Opponents without an entry are ignored; NA is returned when none remain.
average_opponent_rating <- function(opponents, rating_by_player) {
  keys <- as.character(opponents)
  keys <- keys[keys %in% names(rating_by_player)]
  if (length(keys) == 0) {
    return(NA_real_)
  }
  round(mean(rating_by_player[keys]), 0)
}

# Extract one field from every player record as a plain (unnamed) vector.
pluck_field <- function(players, field, template) {
  vapply(players, function(p) p[[field]], template, USE.NAMES = FALSE)
}

# Player records start on line 5 and repeat every 3 lines; a record is only
# usable when its second (state/rating) line exists as well. Built by
# filtering so that a file shorter than 5 lines yields no records instead
# of an error.
line_idx <- seq_along(chess)
record_starts <- line_idx[
  line_idx >= 5 & (line_idx - 5) %% 3 == 0 & line_idx < length(chess)
]

# Collect all players' data, keyed by pair number so opponents can be
# looked up directly.
players <- lapply(
  record_starts,
  function(i) parse_player(chess[i], chess[i + 1])
)
player_data <- setNames(
  players,
  vapply(players, function(p) as.character(p[["player_num"]]), character(1))
)

# Lookup table: pair number -> pre-tournament rating.
rating_by_player <- vapply(
  player_data,
  function(p) p[["pre_rating"]],
  numeric(1)
)

# Calculate avg opponent ratings
avg_opponent_ratings <- vapply(
  player_data,
  function(p) average_opponent_rating(p[["opponents"]], rating_by_player),
  numeric(1),
  USE.NAMES = FALSE
)

# Create the data frame
chess_data <- data.frame(
  Name = pluck_field(player_data, "name", character(1)),
  State = pluck_field(player_data, "state", character(1)),
  Points = pluck_field(player_data, "points", numeric(1)),
  Pre_Rating = pluck_field(player_data, "pre_rating", numeric(1)),
  Avg_Opponent_Ratings = avg_opponent_ratings
)

# Check
print(head(chess_data, 30))
##                          Name State Points Pre_Rating Avg_Opponent_Ratings
## 1                    GARY HUA    ON    6.0       1794                 1634
## 2             DAKSHESH DARURI    MI    6.0       1553                 1518
## 3                ADITYA BAJAJ    MI    6.0       1384                 1551
## 4         PATRICK H SCHILLING    MI    5.5       1716                 1609
## 5                  HANSHI ZUO    MI    5.5       1655                 1544
## 6                 HANSEN SONG    OH    5.0       1686                 1539
## 7           GARY DEE SWATHELL    MI    5.0       1649                 1419
## 8            EZEKIEL HOUGHTON    MI    5.0       1641                 1482
## 9                 STEFANO LEE    ON    5.0       1411                 1486
## 10                  ANVIT RAO    MI    5.0       1365                 1546
## 11   CAMERON WILLIAM MC LEMAN    MI    4.5       1712                 1475
## 12             KENNETH J TACK    MI    4.5       1663                 1541
## 13          TORRANCE HENRY JR    MI    4.5       1666                 1522
## 14               BRADLEY SHAW    MI    4.5       1610                 1556
## 15     ZACHARY JAMES HOUGHTON    MI    4.5       1220                 1470
## 16               MIKE NIKITIN    MI    4.0       1604                 1391
## 17         RONALD GRZEGORCZYK    MI    4.0       1629                 1518
## 18              DAVID SUNDEEN    MI    4.0       1600                 1500
## 19               DIPANKAR ROY    MI    4.0       1564                 1461
## 20                JASON ZHENG    MI    4.0       1595                 1421
## 21              DINH DANG BUI    ON    4.0       1563                 1502
## 22           EUGENE L MCCLURE    MI    4.0       1555                 1328
## 23                   ALAN BUI    ON    4.0       1363                 1130
## 24          MICHAEL R ALDRICH    MI    4.0       1229                 1332
## 25           LOREN SCHWIEBERT    MI    3.5       1745                 1355
## 26                    MAX ZHU    ON    3.5       1579                 1543
## 27             GAURAV GIDWANI    MI    3.5       1552                 1264
## 28 SOFIA ADINA STANESCU-BELLU    MI    3.5       1507                 1571
## 29           CHIEDOZIE OKORIE    MI    3.5       1602                 1365
## 30         GEORGE AVERY JONES    ON    3.5       1522                 1179

Scatter plot

# Rank players by total points (highest first) and keep the first five rows.
points_rank <- order(chess_data$Points, decreasing = TRUE)
top5 <- chess_data[points_rank[1:5], ]

# Label each player on a copy of the data so chess_data itself (which is
# written to CSV later) keeps its original columns.
plot_data <- chess_data
plot_data$Category <- ifelse(plot_data$Name %in% top5$Name, "Top 5", "Other")

# Scatter plot of pre-tournament rating against points scored, with the
# top 5 scorers drawn in red and everyone else in dark green.
ggplot(plot_data, aes(x = Pre_Rating, y = Points)) +
  geom_point(aes(color = Category), size = 3, alpha = 0.7) +
  scale_color_manual(values = c("Top 5" = "red", "Other" = "darkgreen")) +
  labs(
    title = "Chess Tournament: Pre-Rating vs Points",
    subtitle = "Reds = Top 5 players",
    x = "Pre-Tournament Rating",
    y = "Total Points Scored",
    color = "Player Category"
  )

Save to csv file

# Write the per-player summary to a CSV file in the working directory
# (no row-name column) and print a short confirmation message.
output_file <- "chess_tournament_results.csv"
write.csv(chess_data, file = output_file, row.names = FALSE)
cat(paste(output_file, "was saved"))
## chess_tournament_results.csv was saved

Conclusion

This analysis extracted and processed chess tournament data from a structured text file, creating a comprehensive CSV dataset with 64 players. The visualization shows the relationship between pre-tournament ratings and actual performance, with the top 5 performers highlighted in red. For example, we can see that one of the top 5 players in this tournament had a pre-tournament rating below 1500. At the same time, the player with the highest pre-tournament rating was among the top 5 players (in fact in the top 3, since only 3 players scored 6 points).