#Approach
The approach for this project begins with carefully examining and understanding the structure of the raw tournament data. After reviewing the file format, the tournamentinfo.txt file is read from the working directory when a local copy exists, and otherwise directly from the raw GitHub link, so that the analysis stays reproducible on any machine.
The next step involves data cleaning. This includes removing separator lines, header information, and any content that is not relevant to the required analysis. Since each player’s information is distributed across two consecutive rows, the data is then structured by grouping every two rows as a single player record.
From the first row, the following fields are extracted:
Player number
Player name
Total points
Opponent identification numbers
From the second row, the following fields are extracted:
Player’s state
Player’s pre-tournament rating
String manipulation functions from the stringr package, such as str_extract(), str_extract_all(), and str_trim(), are used to isolate and retrieve the relevant components from each row.
After extracting the necessary information, a structured data frame is constructed containing all required variables. Finally, the average pre-tournament rating of each player’s opponents is calculated by matching opponent identification numbers to their corresponding ratings. The completed dataset is then exported as a .csv file for further use or potential database integration.
The data is read from a local copy when one is available; otherwise it is read from the raw GitHub link below.
# Two possible sources for the tournament file: a copy in the working
# directory, or the raw file hosted on GitHub.
file_local <- "tournamentinfo.txt"
file_url <- "https://raw.githubusercontent.com/japhet125/Project1-Data-Science/refs/heads/main/tournamentinfo.txt"
# Read the local copy when it exists; otherwise read straight from the URL.
info <- if (file.exists(file_local)) {
  readLines(file_local)
} else {
  readLines(file_url)
}
## Warning in readLines(file_local): incomplete final line found on
## 'tournamentinfo.txt'
#info
head(info)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
# Drop the dashed separator lines and any empty lines in a single pass.
is_separator <- grepl("^-", info)
clean_lines <- info[!is_separator & info != ""]
head(clean_lines)
## [1] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [2] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [3] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [4] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [5] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [6] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
#clean_lines
# Remove the two column-header rows: the one that starts with "Pair" and the
# one that starts with "Num" (leading whitespace allowed).
starts_with_pair <- grepl("^\\s*Pair", clean_lines)
starts_with_num <- grepl("^\\s*Num", clean_lines)
clean_lines <- clean_lines[!(starts_with_pair | starts_with_num)]
head(clean_lines, 4)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [4] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
# Pair up consecutive lines: lines 1-2 form block 1, lines 3-4 block 2, ...
block_id <- (seq_along(clean_lines) + 1) %/% 2
player_blocks <- split(clean_lines, block_id)
player_blocks[[1]]
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
#Extracting fields from players
# Exploratory pass: each regular expression is tried on the first player
# block only, and the raw (still untrimmed, still character) match is printed
# so the pattern can be checked by eye.
library(stringr)
block <- player_blocks[[1]]
# row1 is the first line of the block, row2 the second.
row1 <- block[1]
row2 <- block[2]
#Extracting player number
# Leading whitespace followed by the first run of digits at the start of row1;
# the whitespace is part of the match.
player_num <- str_extract(row1, "^\\s*\\d+")
player_num
## [1] " 1"
#Extracting player name
# Lazily matches the text after the first "| " up to the whitespace that
# precedes the next "|", then trims surrounding whitespace.
name <- str_trim(str_extract(row1, "(?<=\\| ).*?(?=\\s+\\|)"))
name
## [1] "GARY HUA"
#Extracting total Points
# First occurrence in row1 of digits, a dot, and one more digit.
points <- str_extract(row1, "\\d+\\.\\d")
points
## [1] "6.0"
#Extracting state
# Everything in row2 before the first "|"; surrounding spaces are kept here.
state <- str_extract(row2, "^[^|]+")
state
## [1] " ON "
#Extracting pre-rating
# "R:" plus optional whitespace plus the digits that follow; the "R:" prefix
# is still part of the match at this stage.
pre_rating <- str_extract(row2, "R:\\s*\\d+")
pre_rating
## [1] "R: 1794"
#building the data frame
# Every element of player_blocks holds the two text rows of one player:
# element 1 is the results row (pair number, name, total points, one cell per
# round) and element 2 is the rating row (state, USCF ID / rating).
# The stringr functions are vectorised, so all players are parsed in one pass
# and the data frame is built once instead of being grown with rbind() inside
# a loop.
results_rows <- vapply(
  player_blocks, function(block) block[1], character(1),
  USE.NAMES = FALSE
)
rating_rows <- vapply(
  player_blocks, function(block) block[2], character(1),
  USE.NAMES = FALSE
)

# Opponent numbers are the digit runs that follow whitespace and sit directly
# before a "|". `\\d+` (rather than `\\d{1,2}`) keeps opponent numbers of 100
# or more from being silently dropped in larger tournaments.
opponent_ids <- lapply(
  str_extract_all(results_rows, "(?<=\\s)\\d+(?=\\|)"),
  as.numeric
)

# "R: 1794" -> "1794": keep only the digits that follow the "R:" marker.
pre_rating_raw <- str_extract(rating_rows, "R:\\s*\\d+")

players_df <- data.frame(
  player_num = as.numeric(str_extract(results_rows, "^\\s*\\d+")),
  name = str_trim(str_extract(results_rows, "(?<=\\| ).*?(?=\\s+\\|)")),
  state = str_trim(str_extract(rating_rows, "^[^|]+")),
  points = as.numeric(str_extract(results_rows, "\\d+\\.\\d")),
  pre_rating = as.numeric(str_extract(pre_rating_raw, "\\d+")),
  # I() keeps the per-player opponent vectors as a list-column.
  opponents = I(opponent_ids)
)
head(players_df)
## player_num name state points pre_rating opponents
## 1 1 GARY HUA ON 6.0 1794 39, 21, ....
## 2 2 DAKSHESH DARURI MI 6.0 1553 63, 58, ....
## 3 3 ADITYA BAJAJ MI 6.0 1384 8, 61, 2....
## 4 4 PATRICK H SCHILLING MI 5.5 1716 23, 28, ....
## 5 5 HANSHI ZUO MI 5.5 1655 45, 37, ....
## 6 6 HANSEN SONG OH 5.0 1686 34, 29, ....
# Average pre-tournament rating of each player's opponents, rounded to the
# nearest whole number.
# - match() looks up every opponent id individually, so a repeated opponent
#   contributes one rating per game instead of being collapsed by %in%.
# - Ids with no matching player yield NA and are dropped by na.rm = TRUE.
# - vapply() always returns a numeric vector, even when players_df has no
#   rows (sapply() would return an empty list there and break round()).
# A player with no opponents at all gets NaN.
opponent_means <- vapply(
  players_df$opponents,
  function(opp_ids) {
    opp_rows <- match(opp_ids, players_df$player_num)
    mean(players_df$pre_rating[opp_rows], na.rm = TRUE)
  },
  numeric(1)
)
players_df$avg_opponent_rating <- round(opponent_means, 0)
players_df[1, ]
## player_num name state points pre_rating opponents avg_opponent_rating
## 1 1 GARY HUA ON 6 1794 39, 21, .... 1605
# Export the summary columns to CSV; the opponents list-column is left out.
export_cols <- c("name", "state", "points", "pre_rating", "avg_opponent_rating")
write.csv(
  players_df[, export_cols],
  file = "tournamentinfo_results.csv",
  row.names = FALSE
)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggpubr)
# Scatter plot of each player's pre-rating against the average pre-rating of
# their opponents. Colour is mapped to state in the global aesthetics, so the
# linear fit and the Pearson correlation label are computed per state.
players_df %>%
  ggplot(aes(x = avg_opponent_rating, y = pre_rating, color = state)) +
  geom_point(size = 3, alpha = 0.3) +
  geom_smooth(method = "lm") +
  # after_stat() replaces the `..r.label..` dot-dot notation, which has been
  # deprecated since ggplot2 3.4.0.
  stat_cor(
    aes(label = after_stat(r.label)),
    method = "pearson",
    # Anchor the label at the top-left corner of the data range.
    label.x = min(players_df$avg_opponent_rating, na.rm = TRUE),
    label.y = max(players_df$pre_rating, na.rm = TRUE)
  ) +
  labs(
    title = "Player Pre-Rating vs Average Opponent Rating",
    x = "Average Opponent Pre-Rating",
    y = "Player Pre-Rating"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Top 10 players ranked by total points, with pre-rating as the tie-breaker
# (both descending). Only the reporting columns are kept.
top10 <- players_df %>%
  select(name, points, pre_rating, avg_opponent_rating) %>%
  arrange(desc(points), desc(pre_rating)) %>%
  head(n = 10)
top10
## name points pre_rating avg_opponent_rating
## 1 GARY HUA 6.0 1794 1605
## 2 DAKSHESH DARURI 6.0 1553 1469
## 3 ADITYA BAJAJ 6.0 1384 1564
## 4 PATRICK H SCHILLING 5.5 1716 1574
## 5 HANSHI ZUO 5.5 1655 1501
## 6 HANSEN SONG 5.0 1686 1519
## 7 GARY DEE SWATHELL 5.0 1649 1372
## 8 EZEKIEL HOUGHTON 5.0 1641 1468
## 9 STEFANO LEE 5.0 1411 1523
## 10 ANVIT RAO 5.0 1365 1554
# Mean pre-rating per state (missing ratings ignored), highest average first.
state_means <- players_df %>%
  group_by(state) %>%
  summarise(avg_rating = mean(pre_rating, na.rm = TRUE))
avg_by_state <- arrange(state_means, desc(avg_rating))
avg_by_state
## # A tibble: 3 × 2
## state avg_rating
## <chr> <dbl>
## 1 OH 1686
## 2 ON 1454.
## 3 MI 1362
#visualize and analyse
# Bar chart of the average pre-rating per state, bars ordered by that average.
avg_by_state %>%
  ggplot(aes(x = reorder(state, avg_rating), y = avg_rating, fill = state)) +
  # geom_col() draws bars whose height is the y value itself; it is the
  # idiomatic form of geom_bar(stat = "identity").
  geom_col() +
  # The bars are vertical because coord_flip() is disabled below, so the value
  # labels are lifted above each bar with vjust. The previous hjust = -0.2 only
  # made sense for flipped (horizontal) bars and pushed the labels sideways.
  geom_text(aes(label = round(avg_rating, 0)), vjust = -0.5, size = 4) +
  #coord_flip()+
  labs(
    title = "Average Pre-Rating by State",
    x = "State",
    y = "Average Pre-Rating"
  ) +
  theme_minimal()
#loading the required packages (nothing is installed here)
library(ggpubr)
library(ggplot2)
library(dplyr)
library(DBI)
library(RSQLite)
## Warning: package 'RSQLite' was built under R version 4.5.2
Based on the bar chart above, we can conclude that OH has the highest average pre-tournament rating of the three states.
Project 1 provided a hands-on experience that closely mirrors the day-to-day tasks of a data scientist. I enjoyed working through the challenge of cleaning and structuring raw, messy data. The project required careful attention to detail, including removing unnecessary lines and whitespace, extracting relevant information from multi-line records, and building a clean, structured data frame.
In addition, calculating derived metrics such as the average opponent ratings helped me practice applying analytical logic and working with relational data. Exporting the data to a CSV and visualizing it reinforced the importance of reproducible workflows and clear presentation of results.
Overall, this project strengthened my skills in data cleaning, string manipulation, data wrangling, and visualization, and gave me a realistic sense of the problem-solving process involved in real-world data science projects. It was challenging, but also highly rewarding, and it increased my confidence in handling complex datasets.