Step 1: Load Required Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)

Step 2: Read csv from github

url='https://raw.githubusercontent.com/mehreengillani/DATA607/refs/heads/main/player_stats_with_rounds_and_opponents.csv'
data = read.csv(url)
data
##              Player State PreRating R1_Result   R1_Opponent R2_Result
## 1          Gary Hua    ON      1794         W   Alice Smith         W
## 2       Alice Smith    CA      1650         L      Gary Hua         W
## 3       Bob Johnson    TX      1550         D Charlie Brown         L
## 4     Charlie Brown    NY      1700         W   Bob Johnson         D
## 5       Diana Davis    FL      1600         L    Harry Kane         W
## 6        Harry Kane    NY      1350         W      Gary Hua         D
## 7  Jordan Henderson    FL      1460         L   Alice Smith         L
## 8   Raheem Sterling    CA      1975         D   Alice Smith         W
## 9     Jack Grealish    IL      1190         W Charlie Brown         D
## 10       Phil Foden    IL      1450         L   Diana Davis         L
##         R2_Opponent R3_Result      R3_Opponent R4_Result   R4_Opponent
## 1       Bob Johnson         D    Charlie Brown         W   Diana Davis
## 2     Charlie Brown         W      Diana Davis         D   Bob Johnson
## 3          Gary Hua         W      Alice Smith         L   Alice Smith
## 4       Alice Smith         L         Gary Hua         W   Diana Davis
## 5  Jordan Henderson         L      Bob Johnson         D      Gary Hua
## 6  Jordan Henderson         W Jordan Henderson         W Jack Grealish
## 7   Raheem Sterling         L  Raheem Sterling         D    Phil Foden
## 8     Jack Grealish         D    Jack Grealish         L      Gary Hua
## 9        Phil Foden         W Jordan Henderson         W   Alice Smith
## 10      Alice Smith         D         Gary Hua         D   Bob Johnson
##    R5_Result   R5_Opponent R6_Result      R6_Opponent R7_Result   R7_Opponent
## 1          W Charlie Brown         W      Bob Johnson         L   Alice Smith
## 2          W   Diana Davis         W    Charlie Brown         L   Bob Johnson
## 3          D    Phil Foden         L      Diana Davis         W Charlie Brown
## 4          L   Alice Smith         L       Harry Kane         D   Diana Davis
## 5          W Charlie Brown         W Jordan Henderson         W   Alice Smith
## 6          W      Gary Hua         L  Raheem Sterling         D      Gary Hua
## 7          W   Alice Smith         D      Alice Smith         L Charlie Brown
## 8          D    Harry Kane         W       Phil Foden         W   Bob Johnson
## 9          L Charlie Brown         D Jordan Henderson         D   Diana Davis
## 10         W   Diana Davis         L  Raheem Sterling         W      Gary Hua

Step 3: Prepare the data - long format

# Gather round results and opponents
df_long <- data %>%
  pivot_longer(
    cols = starts_with("R"),
    names_to = c("Round", ".value"),
    names_pattern = "(R\\d+)_(.*)"
  )

# Check the new structure
head(df_long)
## # A tibble: 6 × 6
##   Player   State PreRating Round Result Opponent     
##   <chr>    <chr>     <int> <chr> <chr>  <chr>        
## 1 Gary Hua ON         1794 R1    W      Alice Smith  
## 2 Gary Hua ON         1794 R2    W      Bob Johnson  
## 3 Gary Hua ON         1794 R3    D      Charlie Brown
## 4 Gary Hua ON         1794 R4    W      Diana Davis  
## 5 Gary Hua ON         1794 R5    W      Charlie Brown
## 6 Gary Hua ON         1794 R6    W      Bob Johnson

Step 4: Merge opponent ratings

# For each round, get the opponent's PreRating
df_long <- df_long %>%
  left_join(data %>% select(Player, PreRating), by = c("Opponent" = "Player")) %>%
  rename(OpponentRating = PreRating.y, PlayerRating = PreRating.x)
head(df_long)
## # A tibble: 6 × 7
##   Player   State PlayerRating Round Result Opponent      OpponentRating
##   <chr>    <chr>        <int> <chr> <chr>  <chr>                  <int>
## 1 Gary Hua ON            1794 R1    W      Alice Smith             1650
## 2 Gary Hua ON            1794 R2    W      Bob Johnson             1550
## 3 Gary Hua ON            1794 R3    D      Charlie Brown           1700
## 4 Gary Hua ON            1794 R4    W      Diana Davis             1600
## 5 Gary Hua ON            1794 R5    W      Charlie Brown           1700
## 6 Gary Hua ON            1794 R6    W      Bob Johnson             1550

Step 5: Define Elo expected score formula

The expected score in the Elo system is calculated as \(E_A = \frac{1}{1 + 10^{(R_B - R_A)/400}}\) for player A against player B.

expected_score <- function(R_a, R_b) {
  1 / (1 + 10^((R_b - R_a)/400))
}

df_long <- df_long %>%
  mutate(Expected = expected_score(PlayerRating, OpponentRating))

head(df_long)
## # A tibble: 6 × 8
##   Player   State PlayerRating Round Result Opponent      OpponentRating Expected
##   <chr>    <chr>        <int> <chr> <chr>  <chr>                  <int>    <dbl>
## 1 Gary Hua ON            1794 R1    W      Alice Smith             1650    0.696
## 2 Gary Hua ON            1794 R2    W      Bob Johnson             1550    0.803
## 3 Gary Hua ON            1794 R3    D      Charlie Brown           1700    0.632
## 4 Gary Hua ON            1794 R4    W      Diana Davis             1600    0.753
## 5 Gary Hua ON            1794 R5    W      Charlie Brown           1700    0.632
## 6 Gary Hua ON            1794 R6    W      Bob Johnson             1550    0.803

Step 6: convert result to numeric value

df_long <- df_long %>%
  mutate(Score = case_when(
    Result == "W" ~ 1,
    Result == "D" ~ 0.5,
    Result == "L" ~ 0,
    TRUE ~ NA_real_   # in case of typos like 'Wthi'
  ))

Step 7: Compute post rating after each round

# K-factor
K <- 32

df_long <- df_long %>%
  mutate(post_rating = PlayerRating + K * (Score - Expected))

colnames(df_long)
##  [1] "Player"         "State"          "PlayerRating"   "Round"         
##  [5] "Result"         "Opponent"       "OpponentRating" "Expected"      
##  [9] "Score"          "post_rating"

Step 8: Summarize total expected vs actual

player_summary <- df_long %>%
  group_by(Player) %>%
  summarise(
    TotalExpected = sum(Expected, na.rm = TRUE),
    TotalActual = sum(Score, na.rm = TRUE),
    Difference = TotalActual - TotalExpected
  ) %>%
  arrange(desc(Difference))

player_summary
## # A tibble: 10 × 4
##    Player           TotalExpected TotalActual Difference
##    <chr>                    <dbl>       <dbl>      <dbl>
##  1 Jack Grealish            0.785         4.5      3.71 
##  2 Harry Kane               1.65          5        3.35 
##  3 Phil Foden               1.48          3        1.52 
##  4 Alice Smith              3.58          4.5      0.916
##  5 Diana Davis              3.80          4.5      0.703
##  6 Gary Hua                 5.02          5.5      0.484
##  7 Jordan Henderson         1.57          2        0.434
##  8 Bob Johnson              2.58          3        0.421
##  9 Charlie Brown            4.38          3       -1.38 
## 10 Raheem Sterling          6.43          4.5     -1.93


* Difference > 0 → overperformed
* Difference < 0 → underperformed

Step 9: Top over- and under-performers

top_over <- player_summary %>% top_n(5, Difference)
top_under <- player_summary %>% top_n(-5, Difference)

top_over
## # A tibble: 5 × 4
##   Player        TotalExpected TotalActual Difference
##   <chr>                 <dbl>       <dbl>      <dbl>
## 1 Jack Grealish         0.785         4.5      3.71 
## 2 Harry Kane            1.65          5        3.35 
## 3 Phil Foden            1.48          3        1.52 
## 4 Alice Smith           3.58          4.5      0.916
## 5 Diana Davis           3.80          4.5      0.703
top_under
## # A tibble: 5 × 4
##   Player           TotalExpected TotalActual Difference
##   <chr>                    <dbl>       <dbl>      <dbl>
## 1 Gary Hua                  5.02         5.5      0.484
## 2 Jordan Henderson          1.57         2        0.434
## 3 Bob Johnson               2.58         3        0.421
## 4 Charlie Brown             4.38         3       -1.38 
## 5 Raheem Sterling           6.43         4.5     -1.93

Step 10: Visualiztion

library(ggplot2)


ggplot(player_summary, aes(x = reorder(Player, Difference), y = Difference, fill = Difference > 0)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = c("maroon", "darkblue"), labels = c("Underperformed", "Overperformed")) +
  labs(
    title = "Player Performance vs Expected Score",
    x = "Player",
    y = "Performance Difference (Actual - Expected)",
    fill = "Performance"
  ) +
  theme_minimal()

Conclusions:

Overperformers

  • Jack Grealish, Harry Kane, Phil Foden, Alice Smith, Diana Davis, Gary Hua, Jordan Henderson, Bob Johnson all scored more than their expected points.
  • Jack Grealish had the highest overperformance (Difference ≈ 3.71), meaning he scored far above what was expected based on his initial rating.
  • Harry Kane also significantly overperformed (Difference ≈ 3.35).

Underperformers

  • Charlie Brown and Raheem Sterling are the main underperformers.
  • Raheem Sterling had the largest underperformance (Difference ≈ -1.93), scoring much lower than expected.
  • Charlie Brown also underperformed (Difference ≈ -1.38), earning fewer points than predicted by his rating.

General Insights

  • Players with high pre-ratings (like Raheem Sterling) may underperform if the competition is tougher than expected.
  • Players with moderate pre-ratings (like Jack Grealish and Harry Kane) can exceed expectations with consistent wins.
  • Overperformance does not always correlate with high initial rating—some lower-rated players like Alice Smith and Diana Davis still slightly exceeded their expected scores.
  • The Difference metric effectively identifies who performed above or below their expected Elo-based score, which can inform coaching decisions or player rankings.