This project analyzes the results of a chess tournament with 64 players by transforming raw data extracted from a text file into a structured dataset, with the goal of understanding the relationship between players’ pre-tournament ratings and their actual tournament performance. The raw data was spread across multiple lines per player, so I had to parse, clean, and restructure it before the analysis. The goal was to create a dataset with each player’s name, state, total points, pre-tournament rating, and the average pre-tournament rating of their opponents. I also created a lookup table to calculate each player’s opponent statistics. To compare every player’s tournament performance with their pre-tournament rating, I also created a scatter plot highlighting the top 5 performers.
library(readr)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
# Raw tournament cross-table, hosted in the project's GitHub repository
raw_url <- "https://raw.githubusercontent.com/JDO-MSDS/DATA-607/refs/heads/main/Project%201/chess.txt"
# Load the file as a character vector with one element per text line
chess <- read_lines(file = raw_url)
# Preview the first 15 lines to inspect the layout of the table
chess |> head(15)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
## [11] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [12] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [13] "-----------------------------------------------------------------------------------------"
## [14] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [15] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
# ---- Parse the cross-table into one record per player ----

# Split one "|"-delimited table row into whitespace-trimmed fields.
# strsplit() drops the empty piece after the final "|", so a results row
# yields 10 fields: pair number, name, total points, then rounds 1-7.
split_fields <- function(line) {
  trimws(strsplit(line, "\\|")[[1]])
}

# Pre-tournament rating from a "USCF ID / R: pre ->post" field. Only the
# leading run of digits after "R:" is captured; anything following it is
# ignored. Returns NA when the field contains no "R:" rating.
extract_pre_rating <- function(rating_field) {
  as.numeric(stringr::str_match(rating_field, "R:\\s*(\\d+)")[, 2])
}

# Opponent pair numbers from a player's round-result fields. Only cells of
# the form W/L/D followed by a number yield an opponent; all other cells
# are skipped.
extract_opponents <- function(round_fields) {
  opponent_ids <- stringr::str_match(round_fields, "[WLD]\\s*(\\d+)")[, 2]
  as.numeric(opponent_ids[!is.na(opponent_ids)])
}

# Mean pre-rating of the given opponents, rounded to a whole number.
# `ratings` is a numeric vector named by pair number. Opponents that are
# missing from `ratings` are ignored; the result is NA when none are known.
average_opponent_rating <- function(opponents, ratings) {
  keys <- as.character(opponents)
  keys <- keys[keys %in% names(ratings)]
  if (length(keys) == 0) {
    return(NA_real_)
  }
  round(mean(ratings[keys]), 0)
}

# Player records start on line 5 (after the 4-line header) and repeat every
# three lines: results row, state/rating row, dashed separator. A results
# row is only used when the state/rating row after it exists.
line_index <- seq_along(chess)
player_rows <- line_index[
  line_index >= 5 & (line_index - 5) %% 3 == 0 & line_index < length(chess)
]

# Collect all players' data, keyed by pair number (as a string)
player_data <- list()
for (i in player_rows) {
  result_fields <- split_fields(chess[i])
  rating_fields <- split_fields(chess[i + 1])
  pair_num <- as.numeric(result_fields[1])

  player_data[[as.character(pair_num)]] <- list(
    name = result_fields[2],
    state = rating_fields[1],
    points = as.numeric(result_fields[3]),
    pre_rating = extract_pre_rating(rating_fields[2]),
    # Round results begin at field 4. Starting at field 5 would silently
    # drop every player's round-1 opponent from the average.
    opponents = extract_opponents(result_fields[-(1:3)])
  )
}

# ---- Calculate avg opponent ratings ----

# Lookup table: pair number -> pre-tournament rating
pre_rating_lookup <- vapply(
  player_data,
  function(p) p$pre_rating,
  numeric(1)
)

# Column vectors, one entry per player in pair-number order. The variable
# names are unchanged from the earlier version of this chunk so that any
# code relying on them keeps working.
names <- vapply(
  player_data, function(p) p$name, character(1),
  USE.NAMES = FALSE
)
states <- vapply(
  player_data, function(p) p$state, character(1),
  USE.NAMES = FALSE
)
points <- vapply(
  player_data, function(p) p$points, numeric(1),
  USE.NAMES = FALSE
)
pre_ratings <- unname(pre_rating_lookup)
avg_opponent_rantings <- vapply(
  player_data,
  function(p) average_opponent_rating(p$opponents, pre_rating_lookup),
  numeric(1),
  USE.NAMES = FALSE
)

# ---- Create the data frame ----
chess_data <- data.frame(
  Name = names,
  State = states,
  Points = points,
  Pre_Rating = pre_ratings,
  Avg_Opponent_Ratings = avg_opponent_rantings
)

# Check
print(head(chess_data, 30))
## Name State Points Pre_Rating Avg_Opponent_Ratings
## 1 GARY HUA ON 6.0 1794 1634
## 2 DAKSHESH DARURI MI 6.0 1553 1518
## 3 ADITYA BAJAJ MI 6.0 1384 1551
## 4 PATRICK H SCHILLING MI 5.5 1716 1609
## 5 HANSHI ZUO MI 5.5 1655 1544
## 6 HANSEN SONG OH 5.0 1686 1539
## 7 GARY DEE SWATHELL MI 5.0 1649 1419
## 8 EZEKIEL HOUGHTON MI 5.0 1641 1482
## 9 STEFANO LEE ON 5.0 1411 1486
## 10 ANVIT RAO MI 5.0 1365 1546
## 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712 1475
## 12 KENNETH J TACK MI 4.5 1663 1541
## 13 TORRANCE HENRY JR MI 4.5 1666 1522
## 14 BRADLEY SHAW MI 4.5 1610 1556
## 15 ZACHARY JAMES HOUGHTON MI 4.5 1220 1470
## 16 MIKE NIKITIN MI 4.0 1604 1391
## 17 RONALD GRZEGORCZYK MI 4.0 1629 1518
## 18 DAVID SUNDEEN MI 4.0 1600 1500
## 19 DIPANKAR ROY MI 4.0 1564 1461
## 20 JASON ZHENG MI 4.0 1595 1421
## 21 DINH DANG BUI ON 4.0 1563 1502
## 22 EUGENE L MCCLURE MI 4.0 1555 1328
## 23 ALAN BUI ON 4.0 1363 1130
## 24 MICHAEL R ALDRICH MI 4.0 1229 1332
## 25 LOREN SCHWIEBERT MI 3.5 1745 1355
## 26 MAX ZHU ON 3.5 1579 1543
## 27 GAURAV GIDWANI MI 3.5 1552 1264
## 28 SOFIA ADINA STANESCU-BELLU MI 3.5 1507 1571
## 29 CHIEDOZIE OKORIE MI 3.5 1602 1365
## 30 GEORGE AVERY JONES ON 3.5 1522 1179
# Rank players by total points (highest first) and keep the first five
points_rank <- order(chess_data$Points, decreasing = TRUE)
top5 <- chess_data[points_rank[1:5], ]

# Label each player so the plot can colour the top five differently
plot_data <- chess_data
plot_data$Category <- ifelse(plot_data$Name %in% top5$Name, "Top 5", "Other")
category_colors <- c("Top 5" = "red", "Other" = "darkgreen")

# Scatter plot of pre-tournament rating against points scored
ggplot(plot_data, aes(x = Pre_Rating, y = Points, color = Category)) +
  geom_point(size = 3, alpha = 0.7) +
  scale_color_manual(values = category_colors) +
  labs(
    title = "Chess Tournament: Pre-Rating vs Points",
    subtitle = "Reds = Top 5 players",
    x = "Pre-Tournament Rating",
    y = "Total Points Scored",
    color = "Player Category"
  )
# Export the cleaned dataset to the working directory, without row names
output_file <- "chess_tournament_results.csv"
write.csv(x = chess_data, file = output_file, row.names = FALSE)
# Confirm the export in the rendered report
cat("chess_tournament_results.csv was saved")
## chess_tournament_results.csv was saved
This analysis extracted and processed chess tournament data from a structured text file, creating a comprehensive CSV dataset with 64 players. The visualization shows the relationship between pre-tournament ratings and actual performance, with the top 5 performers highlighted in red. For example, we can see that one player with a pre-tournament rating below 1500 finished among the top 5 players in this tournament. At the same time, the player with the highest pre-tournament rating was also among the top 5 (in fact the top 3, since only 3 players scored 6 points).