#Approach In this assignment, I will extend the analysis conducted in Project 1 by leveraging the structured dataset previously created from the tournament results. Using the cleaned and organized player data, I will calculate each player’s expected score based on the Elo rating system.
The expected score will be computed using the standard Elo formula, which estimates the probability of a player scoring a point against each opponent based on the difference in their pre-tournament ratings. For each player, I will:
1 - Retrieve the ratings of all opponents faced during the tournament.
2 - Apply the Elo expected score formula to calculate the expected outcome for each individual match.
3 - Sum the expected values across all rounds to obtain the player’s total expected score.
4 - Compare the expected score to the player’s actual tournament points.
The difference between actual performance and expected performance will be used to determine whether a player overperformed (actual score greater than expected) or underperformed (actual score lower than expected).
Finally, I will identify and report:
The five players who most significantly overperformed relative to expectations.
The five players who most significantly underperformed relative to expectations.
raw link from github
file_local <- "tournamentinfo.txt"
file_url <- "https://raw.githubusercontent.com/japhet125/Project1-Data-Science/refs/heads/main/tournamentinfo.txt"
if (file.exists(file_local)){
info <- readLines(file_local)
} else {
info <- readLines(file_url)
}
## Warning in readLines(file_url): incomplete final line found on
## 'https://raw.githubusercontent.com/japhet125/Project1-Data-Science/refs/heads/main/tournamentinfo.txt'
#info
head(info)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
clean_lines <- info[!grepl("^-", info)]
clean_lines <- clean_lines[clean_lines != ""]
head(clean_lines)
## [1] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [2] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [3] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [4] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [5] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [6] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
#clean_lines
#cleaning data removing header
clean_lines <- clean_lines[!grepl("^\\s*Pair", clean_lines)]
clean_lines <- clean_lines[!grepl("^\\s*Num", clean_lines)]
head(clean_lines, 4)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [4] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
#group every two rows
player_blocks <- split(clean_lines, ceiling(seq_along(clean_lines)/2))
player_blocks[[1]]
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
#Extracting fields from players
library(stringr)
block <- player_blocks[[1]]
row1 <- block[1]
row2 <- block[2]
#Extracting player number
player_num <- str_extract(row1, "^\\s*\\d+")
player_num
## [1] " 1"
#Extracting player name
name <- str_trim(str_extract(row1, "(?<=\\| ).*?(?=\\s+\\|)"))
name
## [1] "GARY HUA"
#Extracting total Points
points <- str_extract(row1, "\\d+\\.\\d")
points
## [1] "6.0"
#Extracting state
state <- str_extract(row2, "^[^|]+")
state
## [1] " ON "
#Extracting pre-rating
pre_rating <- str_extract(row2, "R:\\s*\\d+")
pre_rating
## [1] "R: 1794"
#building the data frame
players_df <- data.frame()
for (block in player_blocks) {
row1 <- block[1]
row2 <- block[2]
player_num <- as.numeric(str_extract(row1, "^\\s*\\d+"))
name <- str_trim(str_extract(row1, "(?<=\\| ).*?(?=\\s+\\|)"))
points <- as.numeric(str_extract(row1, "\\d+\\.\\d"))
state <- str_trim(str_extract(row2, "^[^|]+"))
pre_rating <- str_extract(row2, "R:\\s*\\d+")
pre_rating <- as.numeric(str_extract(pre_rating, "\\d+"))
opponents <- str_extract_all(row1, "(?<=\\s)\\d{1,2}(?=\\|)")[[1]]
opponents <- as.numeric(opponents)
players_df <- rbind(players_df, data.frame(
player_num,
name,
state,
points,
pre_rating,
opponents = I(list(opponents))
))
}
head(players_df)
## player_num name state points pre_rating opponents
## 1 1 GARY HUA ON 6.0 1794 39, 21, ....
## 2 2 DAKSHESH DARURI MI 6.0 1553 63, 58, ....
## 3 3 ADITYA BAJAJ MI 6.0 1384 8, 61, 2....
## 4 4 PATRICK H SCHILLING MI 5.5 1716 23, 28, ....
## 5 5 HANSHI ZUO MI 5.5 1655 45, 37, ....
## 6 6 HANSEN SONG OH 5.0 1686 34, 29, ....
Look at their list of opponent IDs
Find those opponents in the dataframe
Get their ratings
Compute the average
Store it as a new column
players_df$avg_opponent_rating <- sapply(players_df$opponents, function(opp_ids) {
opp_ratings <- players_df$pre_rating[
players_df$player_num %in% opp_ids
]
mean(opp_ratings)
})
players_df$avg_opponent_rating <- round(players_df$avg_opponent_rating, 0)
players_df[1, ]
## player_num name state points pre_rating opponents avg_opponent_rating
## 1 1 GARY HUA ON 6 1794 39, 21, .... 1605
#Exporting it to a .csv file
write.csv(
players_df[, c("name", "state", "points", "pre_rating", "avg_opponent_rating")],
"tournamentinfo_results.csv",
row.names = FALSE
)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggpubr)
players_df %>%
ggplot(aes(x = avg_opponent_rating, y = pre_rating, color = state)) +
geom_point(size = 3, alpha = 0.3) +
geom_smooth(method = "lm") +
stat_cor(aes(label = ..r.label..), method = "pearson",
label.x = min(players_df$avg_opponent_rating),
label.y = max(players_df$pre_rating)) +
labs(
title = "Player Pre-Rating vs Average Opponent Rating",
x = "Average Opponent Pre-Rating",
y = "Player Pre-Rating"
)+
theme_minimal()
## Warning: The dot-dot notation (`..r.label..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(r.label)` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
#getting the top 10 players by total point
top10 <- players_df %>%
arrange(desc(points), desc(pre_rating)) %>%
select(name, points, pre_rating, avg_opponent_rating) %>%
head(10)
top10
## name points pre_rating avg_opponent_rating
## 1 GARY HUA 6.0 1794 1605
## 2 DAKSHESH DARURI 6.0 1553 1469
## 3 ADITYA BAJAJ 6.0 1384 1564
## 4 PATRICK H SCHILLING 5.5 1716 1574
## 5 HANSHI ZUO 5.5 1655 1501
## 6 HANSEN SONG 5.0 1686 1519
## 7 GARY DEE SWATHELL 5.0 1649 1372
## 8 EZEKIEL HOUGHTON 5.0 1641 1468
## 9 STEFANO LEE 5.0 1411 1523
## 10 ANVIT RAO 5.0 1365 1554
#Calculating the average rating by state
avg_by_state <- players_df %>%
group_by(state) %>%
summarise(avg_rating = mean(pre_rating, na.rm = TRUE)) %>%
arrange(desc(avg_rating))
avg_by_state
## # A tibble: 3 × 2
## state avg_rating
## <chr> <dbl>
## 1 OH 1686
## 2 ON 1454.
## 3 MI 1362
#visualize and analyse
avg_by_state %>%
ggplot(aes(x = reorder(state, avg_rating), y = avg_rating, fill = state)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(avg_rating, 0)), hjust = -0.2, size = 4) +
#coord_flip()+
labs(
title = "Average Pre-Rating by State",
x = "State",
y = "Average Pre-Rating"
)+
theme_minimal()
#installing packages
library(ggpubr)
library(ggplot2)
library(dplyr)
library(DBI)
library(RSQLite)
## Warning: package 'RSQLite' was built under R version 4.5.2
#Assignement 5B begin here
players_df$expected_score <- sapply(1:nrow(players_df), function(i){
player_rating <- players_df$pre_rating[i]
opponent_ids <- players_df$opponents[[i]]
opponent_ratings <- players_df$pre_rating[
players_df$player_num %in% opponent_ids
]
expected_each <- 1 / (1 + 10^((opponent_ratings - player_rating)/400))
sum(expected_each)
})
#computing over/under performance of each player
players_df$performance_diff <- players_df$points - players_df$expected_score
#selecting the 5 top performers
top_over <- players_df %>%
arrange(desc(performance_diff)) %>%
select(name, points, expected_score, performance_diff) %>%
head(5)
top_over
## name points expected_score performance_diff
## 1 ADITYA BAJAJ 6.0 1.94508791 4.054912
## 2 ZACHARY JAMES HOUGHTON 4.5 1.37330887 3.126691
## 3 ANVIT RAO 5.0 1.94485405 3.055146
## 4 JACOB ALEXANDER LAVALLEY 3.0 0.04324981 2.956750
## 5 AMIYATOSH PWNANANDAM 3.5 0.77345290 2.726547
#top 5 underperformes
top_under <- players_df %>%
arrange(performance_diff) %>%
select(name, points, expected_score, performance_diff) %>%
head(5)
top_under
## name points expected_score performance_diff
## 1 LOREN SCHWIEBERT 3.5 6.275650 -2.775650
## 2 GEORGE AVERY JONES 3.5 6.018220 -2.518220
## 3 JARED GE 3.0 5.010416 -2.010416
## 4 RISHI SHETTY 3.5 5.092465 -1.592465
## 5 JOSHUA DAVID LEE 3.5 4.957890 -1.457890
#visualizing performance over plot The analysis reveals that some lower-rated players significantly outperformed expectations, suggesting either strong tournament form or underestimation by their pre-tournament rating. Conversely, several higher-rated players underperformed relative to their expected scores, indicating variance inherent in competitive tournaments.
players_df %>%
ggplot(aes(x = expected_score, y = points)) + geom_point() +
geom_abline(color = "blue") + labs(
title = "Actual Score vs Expected Score",
x = "Expected Score",
y = "Actual Score"
) +
theme_minimal()
#Conclusion
The expected score for each player was calculated using the standard Elo rating formula as defined by Arpad Elo and adopted by FIDE. The model estimates the probability of a player scoring a point against each opponent based on rating differences. Summing these probabilities across all rounds produces the total expected score. Comparing expected scores to actual performance allows identification of players who significantly overperformed or underperformed relative to their rating.