library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
url='https://raw.githubusercontent.com/mehreengillani/DATA607/refs/heads/main/player_stats_with_rounds_and_opponents.csv'
data = read.csv(url)
data
## Player State PreRating R1_Result R1_Opponent R2_Result
## 1 Gary Hua ON 1794 W Alice Smith W
## 2 Alice Smith CA 1650 L Gary Hua W
## 3 Bob Johnson TX 1550 D Charlie Brown L
## 4 Charlie Brown NY 1700 W Bob Johnson D
## 5 Diana Davis FL 1600 L Harry Kane W
## 6 Harry Kane NY 1350 W Gary Hua D
## 7 Jordan Henderson FL 1460 L Alice Smith L
## 8 Raheem Sterling CA 1975 D Alice Smith W
## 9 Jack Grealish IL 1190 W Charlie Brown D
## 10 Phil Foden IL 1450 L Diana Davis L
## R2_Opponent R3_Result R3_Opponent R4_Result R4_Opponent
## 1 Bob Johnson D Charlie Brown W Diana Davis
## 2 Charlie Brown W Diana Davis D Bob Johnson
## 3 Gary Hua W Alice Smith L Alice Smith
## 4 Alice Smith L Gary Hua W Diana Davis
## 5 Jordan Henderson L Bob Johnson D Gary Hua
## 6 Jordan Henderson W Jordan Henderson W Jack Grealish
## 7 Raheem Sterling L Raheem Sterling D Phil Foden
## 8 Jack Grealish D Jack Grealish L Gary Hua
## 9 Phil Foden W Jordan Henderson W Alice Smith
## 10 Alice Smith D Gary Hua D Bob Johnson
## R5_Result R5_Opponent R6_Result R6_Opponent R7_Result R7_Opponent
## 1 W Charlie Brown W Bob Johnson L Alice Smith
## 2 W Diana Davis W Charlie Brown L Bob Johnson
## 3 D Phil Foden L Diana Davis W Charlie Brown
## 4 L Alice Smith L Harry Kane D Diana Davis
## 5 W Charlie Brown W Jordan Henderson W Alice Smith
## 6 W Gary Hua L Raheem Sterling D Gary Hua
## 7 W Alice Smith D Alice Smith L Charlie Brown
## 8 D Harry Kane W Phil Foden W Bob Johnson
## 9 L Charlie Brown D Jordan Henderson D Diana Davis
## 10 W Diana Davis L Raheem Sterling W Gary Hua
# Gather round results and opponents
df_long <- data %>%
pivot_longer(
cols = starts_with("R"),
names_to = c("Round", ".value"),
names_pattern = "(R\\d+)_(.*)"
)
# Check the new structure
head(df_long)
## # A tibble: 6 × 6
## Player State PreRating Round Result Opponent
## <chr> <chr> <int> <chr> <chr> <chr>
## 1 Gary Hua ON 1794 R1 W Alice Smith
## 2 Gary Hua ON 1794 R2 W Bob Johnson
## 3 Gary Hua ON 1794 R3 D Charlie Brown
## 4 Gary Hua ON 1794 R4 W Diana Davis
## 5 Gary Hua ON 1794 R5 W Charlie Brown
## 6 Gary Hua ON 1794 R6 W Bob Johnson
# For each round, get the opponent's PreRating
df_long <- df_long %>%
left_join(data %>% select(Player, PreRating), by = c("Opponent" = "Player")) %>%
rename(OpponentRating = PreRating.y, PlayerRating = PreRating.x)
head(df_long)
## # A tibble: 6 × 7
## Player State PlayerRating Round Result Opponent OpponentRating
## <chr> <chr> <int> <chr> <chr> <chr> <int>
## 1 Gary Hua ON 1794 R1 W Alice Smith 1650
## 2 Gary Hua ON 1794 R2 W Bob Johnson 1550
## 3 Gary Hua ON 1794 R3 D Charlie Brown 1700
## 4 Gary Hua ON 1794 R4 W Diana Davis 1600
## 5 Gary Hua ON 1794 R5 W Charlie Brown 1700
## 6 Gary Hua ON 1794 R6 W Bob Johnson 1550
The expected score in the Elo system is calculated as \(E_A = \frac{1}{1 + 10^{(R_B - R_A)/400}}\) for player A against player B.
expected_score <- function(R_a, R_b) {
1 / (1 + 10^((R_b - R_a)/400))
}
df_long <- df_long %>%
mutate(Expected = expected_score(PlayerRating, OpponentRating))
head(df_long)
## # A tibble: 6 × 8
## Player State PlayerRating Round Result Opponent OpponentRating Expected
## <chr> <chr> <int> <chr> <chr> <chr> <int> <dbl>
## 1 Gary Hua ON 1794 R1 W Alice Smith 1650 0.696
## 2 Gary Hua ON 1794 R2 W Bob Johnson 1550 0.803
## 3 Gary Hua ON 1794 R3 D Charlie Brown 1700 0.632
## 4 Gary Hua ON 1794 R4 W Diana Davis 1600 0.753
## 5 Gary Hua ON 1794 R5 W Charlie Brown 1700 0.632
## 6 Gary Hua ON 1794 R6 W Bob Johnson 1550 0.803
df_long <- df_long %>%
mutate(Score = case_when(
Result == "W" ~ 1,
Result == "D" ~ 0.5,
Result == "L" ~ 0,
TRUE ~ NA_real_ # in case of typos like 'Wthi'
))
# K-factor
K <- 32
df_long <- df_long %>%
mutate(post_rating = PlayerRating + K * (Score - Expected))
colnames(df_long)
## [1] "Player" "State" "PlayerRating" "Round"
## [5] "Result" "Opponent" "OpponentRating" "Expected"
## [9] "Score" "post_rating"
player_summary <- df_long %>%
group_by(Player) %>%
summarise(
TotalExpected = sum(Expected, na.rm = TRUE),
TotalActual = sum(Score, na.rm = TRUE),
Difference = TotalActual - TotalExpected
) %>%
arrange(desc(Difference))
player_summary
## # A tibble: 10 × 4
## Player TotalExpected TotalActual Difference
## <chr> <dbl> <dbl> <dbl>
## 1 Jack Grealish 0.785 4.5 3.71
## 2 Harry Kane 1.65 5 3.35
## 3 Phil Foden 1.48 3 1.52
## 4 Alice Smith 3.58 4.5 0.916
## 5 Diana Davis 3.80 4.5 0.703
## 6 Gary Hua 5.02 5.5 0.484
## 7 Jordan Henderson 1.57 2 0.434
## 8 Bob Johnson 2.58 3 0.421
## 9 Charlie Brown 4.38 3 -1.38
## 10 Raheem Sterling 6.43 4.5 -1.93
* Difference > 0 → overperformed
* Difference < 0 →
underperformed
top_over <- player_summary %>% top_n(5, Difference)
top_under <- player_summary %>% top_n(-5, Difference)
top_over
## # A tibble: 5 × 4
## Player TotalExpected TotalActual Difference
## <chr> <dbl> <dbl> <dbl>
## 1 Jack Grealish 0.785 4.5 3.71
## 2 Harry Kane 1.65 5 3.35
## 3 Phil Foden 1.48 3 1.52
## 4 Alice Smith 3.58 4.5 0.916
## 5 Diana Davis 3.80 4.5 0.703
top_under
## # A tibble: 5 × 4
## Player TotalExpected TotalActual Difference
## <chr> <dbl> <dbl> <dbl>
## 1 Gary Hua 5.02 5.5 0.484
## 2 Jordan Henderson 1.57 2 0.434
## 3 Bob Johnson 2.58 3 0.421
## 4 Charlie Brown 4.38 3 -1.38
## 5 Raheem Sterling 6.43 4.5 -1.93
library(ggplot2)
ggplot(player_summary, aes(x = reorder(Player, Difference), y = Difference, fill = Difference > 0)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_manual(values = c("maroon", "darkblue"), labels = c("Underperformed", "Overperformed")) +
labs(
title = "Player Performance vs Expected Score",
x = "Player",
y = "Performance Difference (Actual - Expected)",
fill = "Performance"
) +
theme_minimal()