I used some of the code from project 1 to load and clean the data frame:
# read in the raw cross table
raw <- readLines(url("https://raw.githubusercontent.com/ShanaFarber/cuny-sps/master/DATA_607/project1/player_stats.txt"))
# player names (skipping the four header lines at the top of the file)
player_names <- unlist(str_extract_all(raw[-c(1, 2, 3, 4)], "([A-Z])+\\s([A-Z](\\s)?)*([A-Z])+"))
# total points for each player
totals <- unlist(str_extract_all(raw, "\\d\\.\\d"))
# pre-ratings, with provisional markers and the trailing arrow stripped
pre_ratings <- unlist(str_extract_all(raw[-c(1, 2, 3, 4)], "\\d+(P\\d+)?(\\s+)?\\->"))
pre_ratings <- str_remove(pre_ratings, "(P\\d+)?(\\s+)?->")
# include player ID for future reference
chess_stats <- data.frame(player_id = c(1:64), player_name = player_names, total = totals,
player_start_rating = pre_ratings)
chess_stats$total <- as.numeric(chess_stats$total)
chess_stats$player_start_rating <- as.numeric(chess_stats$player_start_rating)
# extract each player's result letter and opponent number for every round
rounds <- unlist(str_extract_all(unlist(str_extract_all(str_remove(raw, "^\\s+[A-Z].+"),
"\\|[WLDXUHB]\\s+(\\d+)?")), "[WLDXUHB]\\s+(\\d+)?"))
num_players <- length(player_names)
num_rounds <- length(rounds)/num_players
# create a seven column matrix from rounds to have a data frame of the results
# of each round for each player
rounds <- matrix(rounds, byrow = T, ncol = num_rounds)
full_stats <- chess_stats %>%
cbind(rounds) %>%
pivot_longer(cols = c("1", "2", "3", "4", "5", "6", "7"), names_to = "round",
values_to = "results")
full_stats <- full_stats %>%
mutate(opp_player_id = as.integer(str_extract(results, "\\d+")))
The formula for calculating the expected score is as follows:
\[ E_{A} = \frac{1}{1 + 10^{(R_{B} - R_{A})/400}} \]
The formula for calculating a player’s new rating based on performance (using a K factor of 32) is:
\[ \text{new rating} = \text{rating} + 32(\text{score} - \text{expected score}) \]
Formulas found here.
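For example, with hypothetical ratings of 1700 for player A and 1500 for player B (illustrative values, not ratings taken from the cross table):
\[ E_{A} = \frac{1}{1 + 10^{(1500 - 1700)/400}} = \frac{1}{1 + 10^{-0.5}} \approx 0.76 \]
If player A then wins (a score of 1), the new rating is \( 1700 + 32(1 - 0.76) \approx 1708 \).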
Using this formula, I created a function to calculate the expected score for a player:
expectedScore <- function(a_rating, b_rating) {
expected_score <- 1/(1 + 10^((b_rating - a_rating)/400))
return(expected_score)
}
I created another function to calculate the new rating of each player after each round:
newRating <- function(rating, score, score_expected) {
new_rating <- rating + 32 * (score - score_expected)
return(round(new_rating, 0))
}
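As a quick sanity check, the two helpers reproduce the hypothetical example above (again using the made-up ratings of 1700 and 1500):
expectedScore(1700, 1500)  # 0.7597469
newRating(1700, 1, expectedScore(1700, 1500))  # 1708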
In order to perform the calculations, the following information is required:

- Player name
- Player’s pre-rating
- Player’s opponents
- The results of each round
After each round, the ratings of each player and their opponent must be adjusted based on the results of the previous round.
The scores for each round are awarded as follows:

- Win (W): 1 point
- Loss (L): 0 points
- Draw (D): 0.5 points

I created a data frame with these values:
# score values
result_score <- data.frame(results = c("W", "L", "D"), actual_score = c(1, 0, 0.5))
# combining with original data frame to have actual result of each round
full_stats <- full_stats %>%
mutate(results = str_extract(results, "[WLD]")) %>%
left_join(result_score, by = "results")
In order to do the overall calculation, I first made a separate data frame for each round to calculate the new player ratings after that round. For each round, I applied expectedScore() to get the expected score and then newRating() to get the new rating for each player. I then updated the values in opp_ratings to reflect the changes from the previous round. (A condensed single-loop version of the same update is sketched after the round-by-round code below.)
# initial opponent ratings
opp_ratings <- data.frame(opp_player_id = c(1:64), opp_start_rating = chess_stats$player_start_rating)
# round one expected result and new player ratings
round_one_results <- full_stats %>%
filter(round == 1) %>%
left_join(opp_ratings, by = "opp_player_id") %>%
mutate(expected_score = expectedScore(player_start_rating, opp_start_rating),
player_new_rating = newRating(player_start_rating, actual_score, expected_score))
# adjust opponent player rating for next round
for (i in 1:64) {
if (!is.na(round_one_results$player_new_rating)[i]) {
opp_ratings$opp_start_rating[i] <- round_one_results$player_new_rating[i]
} else {
opp_ratings$opp_start_rating[i] <- opp_ratings$opp_start_rating[i]
}
}
# round two expected result and new player ratings
round_two_results <- full_stats %>%
filter(round == 2) %>%
left_join(opp_ratings, by = "opp_player_id") %>%
mutate(player_start_rating = round_one_results$player_new_rating, expected_score = expectedScore(player_start_rating,
opp_start_rating), player_new_rating = newRating(player_start_rating, actual_score,
expected_score))
# adjust opponent player rating for next round
for (i in 1:64) {
if (!is.na(round_two_results$player_new_rating)[i]) {
opp_ratings$opp_start_rating[i] <- round_two_results$player_new_rating[i]
} else {
opp_ratings$opp_start_rating[i] <- opp_ratings$opp_start_rating[i]
}
}
# round three expected result and new player ratings
round_three_results <- full_stats %>%
filter(round == 3) %>%
left_join(opp_ratings, by = "opp_player_id") %>%
mutate(player_start_rating = round_two_results$player_new_rating, expected_score = expectedScore(player_start_rating,
opp_start_rating), player_new_rating = newRating(player_start_rating, actual_score,
expected_score))
# adjust opponent player rating for next round
for (i in 1:64) {
if (!is.na(round_three_results$player_new_rating)[i]) {
opp_ratings$opp_start_rating[i] <- round_three_results$player_new_rating[i]
} else {
opp_ratings$opp_start_rating[i] <- opp_ratings$opp_start_rating[i]
}
}
# round four expected result and new player ratings
round_four_results <- full_stats %>%
filter(round == 4) %>%
left_join(opp_ratings, by = "opp_player_id") %>%
mutate(player_start_rating = round_three_results$player_new_rating, expected_score = expectedScore(player_start_rating,
opp_start_rating), player_new_rating = newRating(player_start_rating, actual_score,
expected_score))
# adjust opponent player rating for next round
for (i in 1:64) {
if (!is.na(round_four_results$player_new_rating)[i]) {
opp_ratings$opp_start_rating[i] <- round_four_results$player_new_rating[i]
} else {
opp_ratings$opp_start_rating[i] <- opp_ratings$opp_start_rating[i]
}
}
# round five expected result and new player ratings
round_five_results <- full_stats %>%
filter(round == 5) %>%
left_join(opp_ratings, by = "opp_player_id") %>%
mutate(player_start_rating = round_four_results$player_new_rating, expected_score = expectedScore(player_start_rating,
opp_start_rating), player_new_rating = newRating(player_start_rating, actual_score,
expected_score))
# adjust opponent player rating for next round
for (i in 1:64) {
if (!is.na(round_five_results$player_new_rating)[i]) {
opp_ratings$opp_start_rating[i] <- round_five_results$player_new_rating[i]
} else {
opp_ratings$opp_start_rating[i] <- opp_ratings$opp_start_rating[i]
}
}
# round six expected result and new player ratings
round_six_results <- full_stats %>%
filter(round == 6) %>%
left_join(opp_ratings, by = "opp_player_id") %>%
mutate(player_start_rating = round_five_results$player_new_rating, expected_score = expectedScore(player_start_rating,
opp_start_rating), player_new_rating = newRating(player_start_rating, actual_score,
expected_score))
# adjust opponent player rating for next round
for (i in 1:64) {
if (!is.na(round_six_results$player_new_rating)[i]) {
opp_ratings$opp_start_rating[i] <- round_six_results$player_new_rating[i]
} else {
opp_ratings$opp_start_rating[i] <- opp_ratings$opp_start_rating[i]
}
}
# round seven expected result and new player ratings
round_seven_results <- full_stats %>%
filter(round == 7) %>%
left_join(opp_ratings, by = "opp_player_id") %>%
mutate(player_start_rating = round_six_results$player_new_rating, expected_score = expectedScore(player_start_rating,
opp_start_rating), player_new_rating = newRating(player_start_rating, actual_score,
expected_score))
# adjust opponent player rating for next round
for (i in 1:64) {
if (!is.na(round_seven_results$player_new_rating)[i]) {
opp_ratings$opp_start_rating[i] <- round_seven_results$player_new_rating[i]
} else {
opp_ratings$opp_start_rating[i] <- opp_ratings$opp_start_rating[i]
}
}
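For reference, the seven blocks above all perform the same per-round update, so they could also be written as a single loop. A minimal sketch, assuming each round's rows stay ordered by player_id (as they do in full_stats); note that, unlike the step-by-step version above, this sketch also carries a player's previous rating forward into player_start_rating after an unplayed round:
# sketch: the same per-round update, written as one loop over the seven rounds
ratings <- chess_stats$player_start_rating
round_results <- list()
for (r in 1:7) {
    round_results[[r]] <- full_stats %>%
        filter(round == r) %>%
        mutate(player_start_rating = ratings,
            opp_start_rating = ratings[opp_player_id],
            expected_score = expectedScore(player_start_rating, opp_start_rating),
            player_new_rating = newRating(player_start_rating, actual_score, expected_score))
    # keep the previous rating where no game was played this round
    ratings <- ifelse(is.na(round_results[[r]]$player_new_rating),
        ratings, round_results[[r]]$player_new_rating)
}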
Using the post-rating from the initial raw data, I checked the results of my calculations by comparing it to my calculated player_new_rating after round 7:
post_ratings <- unlist(str_extract_all(raw[-c(1, 2, 3, 4)], "->(\\s+)?\\d+(P\\d+)?"))
post_ratings <- str_remove(post_ratings, "->(\\s+)?")
post_ratings <- str_remove(post_ratings, "(P\\d+)")
compare_ratings <- data.frame(player_id = c(1:64), pre_rating = pre_ratings, post_seven_rating = opp_ratings$opp_start_rating,
actual_post_rating = post_ratings)
knitr::kable(head(compare_ratings))
| player_id | pre_rating | post_seven_rating | actual_post_rating |
|---|---|---|---|
| 1 | 1794 | 1820 | 1817 |
| 2 | 1553 | 1621 | 1663 |
| 3 | 1384 | 1507 | 1640 |
| 4 | 1716 | 1742 | 1744 |
| 5 | 1655 | 1691 | 1690 |
| 6 | 1686 | 1688 | 1687 |
I notice that my new rating for each player does not exactly match the post-rating from the cross table. For example, looking at player 3’s pre- and post-ratings together with the results of each match, he won five of his six games and lost one, yet his rating increased by 256 points. According to the explanation in the video and the K value of 32 chosen for calculating the new rating, the rating could only increase by a maximum of 32(5) = 160 points for the games won. I am therefore unsure how the actual post-rating was calculated and whether a different K factor was used for different players.
Based on this Wikipedia article, the K factor changes with the number of games a player has already played, so it is possible that the K factors used in the actual cross table differ for each player based on the number of games they have played.
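If different K factors were indeed applied, the rating update could be generalized by exposing K as an argument. A minimal sketch (newRatingK() and the K values below are only illustrative, not taken from the cross table):
# same update rule as newRating(), but with the K factor as a parameter
newRatingK <- function(rating, score, score_expected, k = 32) {
    round(rating + k * (score - score_expected), 0)
}
newRatingK(1384, 1, 0.5)          # 1400: a gain of 16 with the default k = 32
newRatingK(1384, 1, 0.5, k = 40)  # 1404: a gain of 20 with a larger, illustrative k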
Moving on, I bound these per-round tables together into a single data frame of the results of all rounds:
all_rounds <- rbind(round_one_results, round_two_results, round_three_results, round_four_results,
round_five_results, round_six_results, round_seven_results)
I used summarize() to calculate the overall expected
score for each player:
# table of expected scores
expected_scores <- all_rounds %>%
filter(!is.na(results)) %>%
group_by(player_name) %>%
summarize(expected_score = sum(expected_score))
I then joined this to a table of the players’ actual scores:
# create table of expected scores
expected_scores <- all_rounds %>%
mutate(expected_scores = expectedScore(player_start_rating, opp_start_rating)) %>%
group_by(player_name) %>%
summarize(expected_score = sum(expected_scores))
# create table of actual scores
actual_scores <- read.csv(url("https://raw.githubusercontent.com/ShanaFarber/cuny-sps/master/DATA_607/project1/player_stats.csv")) %>%
transmute(player_name, actual_score = total)
compare_scores <- actual_scores %>%
left_join(expected_scores, by = "player_name") %>%
mutate(score_diff = actual_score - expected_score)
Who were the five players who most over-performed relative to their expected score?
overperformed <- compare_scores %>%
arrange(desc(score_diff))
knitr::kable(head(overperformed, n = 5))
| player_name | actual_score | expected_score | score_diff |
|---|---|---|---|
| ADITYA BAJAJ | 6.0 | 2.1595274 | 3.840473 |
| JACOB ALEXANDER LAVALLEY | 3.0 | 0.0521352 | 2.947865 |
| ZACHARY JAMES HOUGHTON | 4.5 | 1.6542121 | 2.845788 |
| ANVIT RAO | 5.0 | 2.1878462 | 2.812154 |
| STEFANO LEE | 5.0 | 2.4909780 | 2.509022 |
Who were the five players who most under-performed relative to their expected score?
underperformed <- compare_scores %>%
arrange(score_diff)
knitr::kable(head(underperformed, n = 5))
| player_name | actual_score | expected_score | score_diff |
|---|---|---|---|
| LOREN SCHWIEBERT | 3.5 | 6.089633 | -2.589633 |
| GEORGE AVERY JONES | 3.5 | 5.827445 | -2.327445 |
| JARED GE | 3.0 | 4.746449 | -1.746449 |
| BEN LI | 1.0 | 2.425461 | -1.425461 |
| RISHI SHETTY | 3.5 | 4.907391 | -1.407391 |
Initially, I had done the problem the following way. However, this approach does not account for the change in player ratings based on the results of each round.
opp_ratings <- data.frame(opp_player_id = c(1:64), opp_start_rating = chess_stats$player_start_rating)
expected_scores <- full_stats %>%
filter(!is.na(actual_score)) %>%
left_join(opp_ratings, by = "opp_player_id") %>%
mutate(expected_score = expectedScore(player_start_rating, opp_start_rating)) %>%
group_by(player_name) %>%
summarize(expected_score = sum(expected_score))
compare_scores <- actual_scores %>%
left_join(expected_scores, by = "player_name") %>%
mutate(score_diff = actual_score - expected_score)
# 5 most over-performed
overperformed <- compare_scores %>%
arrange(desc(score_diff))
knitr::kable(head(overperformed, n = 5))
| player_name | actual_score | expected_score | score_diff |
|---|---|---|---|
| ADITYA BAJAJ | 6.0 | 1.9450879 | 4.054912 |
| ZACHARY JAMES HOUGHTON | 4.5 | 1.3733089 | 3.126691 |
| ANVIT RAO | 5.0 | 1.9448541 | 3.055146 |
| JACOB ALEXANDER LAVALLEY | 3.0 | 0.0432498 | 2.956750 |
| AMIYATOSH PWNANANDAM | 3.5 | 0.7734529 | 2.726547 |
# 5 most under-performed
underperformed <- compare_scores %>%
arrange(score_diff)
knitr::kable(head(underperformed, n = 5))
| player_name | actual_score | expected_score | score_diff |
|---|---|---|---|
| LOREN SCHWIEBERT | 3.5 | 6.275650 | -2.775650 |
| GEORGE AVERY JONES | 3.5 | 6.018220 | -2.518220 |
| JARED GE | 3.0 | 5.010416 | -2.010416 |
| RISHI SHETTY | 3.5 | 5.092465 | -1.592465 |
| JOSHUA DAVID LEE | 3.5 | 4.957890 | -1.457890 |