Week5b

Author

Zihao YU

1.How will I tackle the problem?

I will incorporate the code from Project 1 and calculate the expected Elo rating for each round. Finally, compute the difference between the actual total and expected total, then extract the top 5 overperformed and underperformed players.

2.What data challenges do I anticipate?

Since B/H/U have no real opponents, expected ratings cannot be calculated for these lines. To avoid errors, consider excluding them from the calculation.

source: “https://raw.githubusercontent.com/XxY-coder/Week5b.Y/refs/heads/main/chess_players.csv” “https://github.com/XxY-coder/Week5b.Y/blob/main/tournamentinfo.txt” formula source: “https://mattmazzola.medium.com/implementing-the-elo-rating-system-a085f178e065”

3.Get the columns

The previous few steps are similar to Proj#1, I will keep it and make some changes(no need State).

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)

chess.raw <- readLines("https://raw.githubusercontent.com/XxY-coder/data607-Proj.Y/refs/heads/main/tournamentinfo.txt")
Warning in
readLines("https://raw.githubusercontent.com/XxY-coder/data607-Proj.Y/refs/heads/main/tournamentinfo.txt"):
incomplete final line found on
'https://raw.githubusercontent.com/XxY-coder/data607-Proj.Y/refs/heads/main/tournamentinfo.txt'
head(chess.raw, 20)
 [1] "-----------------------------------------------------------------------------------------" 
 [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
 [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
 [4] "-----------------------------------------------------------------------------------------" 
 [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
 [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
 [7] "-----------------------------------------------------------------------------------------" 
 [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
 [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
[10] "-----------------------------------------------------------------------------------------" 
[11] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|" 
[12] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
[13] "-----------------------------------------------------------------------------------------" 
[14] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|" 
[15] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |" 
[16] "-----------------------------------------------------------------------------------------" 
[17] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|" 
[18] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
[19] "-----------------------------------------------------------------------------------------" 
[20] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|" 
chess_df <- chess.raw[-(1:4)]
chess_df <- chess_df[chess_df != ""]

chess_df <- chess_df[!str_detect(chess_df, "^\\s*-+\\s*$")]

line_n <- which(str_detect(chess_df, "^\\s*\\d+\\s*\\|"))
line_n
 [1]   1   3   5   7   9  11  13  15  17  19  21  23  25  27  29  31  33  35  37
[20]  39  41  43  45  47  49  51  53  55  57  59  61  63  65  67  69  71  73  75
[39]  77  79  81  83  85  87  89  91  93  95  97  99 101 103 105 107 109 111 113
[58] 115 117 119 121 123 125 127
line_odd <- chess_df[line_n]      
line_even <- chess_df[line_n + 1]   

parse_line1 <-
  function(x) {
    parts <-
      str_split(x, "\\|", simplify = TRUE) |> 
      as.character()
    parts <- str_trim(parts)
    parts <- parts[parts != ""]
    
    tibble(
      player_num   = as.integer(parts[1]),
      name         = parts[2],
      total_points = as.numeric(parts[3]),
      
      round1       = parts[4],
      round2       = parts[5],
      round3       = parts[6],
      round4       = parts[7],
      round5       = parts[8],
      round6       = parts[9],
      round7       = parts[10]
  )
}

line1_df <- 
  map_dfr(line_odd, parse_line1)

line1_df |> 
  select(player_num, name, total_points, round1:round7) |>
  head()
# A tibble: 6 × 10
  player_num name  total_points round1 round2 round3 round4 round5 round6 round7
       <int> <chr>        <dbl> <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr> 
1          1 GARY…          6   W  39  W  21  W  18  W  14  W   7  D  12  D   4 
2          2 DAKS…          6   W  63  W  58  L   4  W  17  W  16  W  20  W   7 
3          3 ADIT…          6   L   8  W  61  W  25  W  21  W  11  W  13  W  12 
4          4 PATR…          5.5 W  23  D  28  W   2  W  26  D   5  W  19  D   1 
5          5 HANS…          5.5 W  45  W  37  D  12  D  13  D   4  W  14  W  17 
6          6 HANS…          5   W  34  D  29  L  11  W  35  D  10  W  27  W  21 
parse_line2 <- 
  function(x) {
    parts <- 
      str_split(x, "\\|", simplify = TRUE) |>
      as.character()
    parts <- str_trim(parts)
    parts <- parts[parts != ""]
  
    pre <- str_match(x, "R:\\s*(\\d{3,4})")[, 2]
  
    if (is.na(pre)) pre <- 
      str_match(x, "(\\d{3,4})\\s*->")[, 2]

    tibble(
      pre_rating = as.integer(pre)
      )
}

line2_df <- 
  map_dfr(line_even, parse_line2)
line2_df |> 
  head()
# A tibble: 6 × 1
  pre_rating
       <int>
1       1794
2       1553
3       1384
4       1716
5       1655
6       1686
opp_info <- 
  line1_df |>
  select(player_num, starts_with("round")) |>
  pivot_longer(
    cols = starts_with("round"),
    names_to = "round",
    values_to = "cell"
) |>
  mutate(
    opp_num = as.integer(str_extract(cell, "\\d+")),
    game_result = str_extract(cell, "^[WDL]"),
    game_score = case_when(
      game_result == "W" ~ 1.0,
      game_result == "D" ~ 0.5,
      game_result == "L" ~ 0.0,
      TRUE ~ NA_real_
    ),
    cell = NULL
) |>
 filter(!is.na(opp_num))
opp_info
# A tibble: 408 × 5
   player_num round  opp_num game_result game_score
        <int> <chr>    <int> <chr>            <dbl>
 1          1 round1      39 W                  1  
 2          1 round2      21 W                  1  
 3          1 round3      18 W                  1  
 4          1 round4      14 W                  1  
 5          1 round5       7 W                  1  
 6          1 round6      12 D                  0.5
 7          1 round7       4 D                  0.5
 8          2 round1      63 W                  1  
 9          2 round2      58 W                  1  
10          2 round3       4 L                  0  
# ℹ 398 more rows

4.Recal the total score

I decide to remove the rounds that are B/H/U.

score_recal <-
  opp_info |>
  group_by(player_num) |>
  summarise(
  total_rounds = n(),
  new_total = sum(game_score, na.rm = TRUE)
)
score_recal
# A tibble: 64 × 3
   player_num total_rounds new_total
        <int>        <int>     <dbl>
 1          1            7       6  
 2          2            7       6  
 3          3            7       6  
 4          4            7       5.5
 5          5            7       5.5
 6          6            7       5  
 7          7            7       5  
 8          8            7       5  
 9          9            7       5  
10         10            7       5  
# ℹ 54 more rows

5.Mkake a new table

chess_info <- opp_info |>
  left_join(
    tibble(player_num = line1_df$player_num, my_rating = line2_df$pre_rating),
    by = "player_num"
) |>
  left_join(
    tibble(player_num = line1_df$player_num, opp_rating = line2_df$pre_rating),
    by = c("opp_num" = "player_num")
) |>
  left_join(
    tibble(player_num = score_recal$player_num, new_total = score_recal$new_total),
    by = "player_num"
)
chess_info
# A tibble: 408 × 8
   player_num round  opp_num game_result game_score my_rating opp_rating
        <int> <chr>    <int> <chr>            <dbl>     <int>      <int>
 1          1 round1      39 W                  1        1794       1436
 2          1 round2      21 W                  1        1794       1563
 3          1 round3      18 W                  1        1794       1600
 4          1 round4      14 W                  1        1794       1610
 5          1 round5       7 W                  1        1794       1649
 6          1 round6      12 D                  0.5      1794       1663
 7          1 round7       4 D                  0.5      1794       1716
 8          2 round1      63 W                  1        1553       1175
 9          2 round2      58 W                  1        1553        917
10          2 round3       4 L                  0        1553       1716
# ℹ 398 more rows
# ℹ 1 more variable: new_total <dbl>

6. Cal the ELO rating

elo_cal <-
  chess_info |>
  mutate(
    expected_match_score = 1 / (1 + 10^((opp_rating - my_rating) / 400))
  )
elo_cal
# A tibble: 408 × 9
   player_num round  opp_num game_result game_score my_rating opp_rating
        <int> <chr>    <int> <chr>            <dbl>     <int>      <int>
 1          1 round1      39 W                  1        1794       1436
 2          1 round2      21 W                  1        1794       1563
 3          1 round3      18 W                  1        1794       1600
 4          1 round4      14 W                  1        1794       1610
 5          1 round5       7 W                  1        1794       1649
 6          1 round6      12 D                  0.5      1794       1663
 7          1 round7       4 D                  0.5      1794       1716
 8          2 round1      63 W                  1        1553       1175
 9          2 round2      58 W                  1        1553        917
10          2 round3       4 L                  0        1553       1716
# ℹ 398 more rows
# ℹ 2 more variables: new_total <dbl>, expected_match_score <dbl>

7.Sum the expected score and do the difference.

performance_summary <-
  elo_cal |>
  group_by(player_num) |>
  summarise(
    Expected_score = sum(expected_match_score, na.rm = TRUE)
) |>
  left_join(score_recal |> 
              select(player_num, new_total), by = "player_num") |>
  left_join(line1_df |> 
              select(player_num, name), by = "player_num") |>
  mutate(Difference = score_recal$new_total - Expected_score) 

performance_summary
# A tibble: 64 × 5
   player_num Expected_score new_total name                Difference
        <int>          <dbl>     <dbl> <chr>                    <dbl>
 1          1           5.16       6   GARY HUA                0.838 
 2          2           3.78       6   DAKSHESH DARURI         2.22  
 3          3           1.95       6   ADITYA BAJAJ            4.05  
 4          4           4.74       5.5 PATRICK H SCHILLING     0.758 
 5          5           4.38       5.5 HANSHI ZUO              1.12  
 6          6           4.94       5   HANSEN SONG             0.0554
 7          7           4.58       5   GARY DEE SWATHELL       0.419 
 8          8           5.03       5   EZEKIEL HOUGHTON       -0.0343
 9          9           2.29       5   STEFANO LEE             2.71  
10         10           1.94       5   ANVIT RAO               3.06  
# ℹ 54 more rows

8.Finl coding.

arrange(performance_summary, desc(Difference))
# A tibble: 64 × 5
   player_num Expected_score new_total name                     Difference
        <int>          <dbl>     <dbl> <chr>                         <dbl>
 1          3         1.95         6   ADITYA BAJAJ                   4.05
 2         15         1.37         4.5 ZACHARY JAMES HOUGHTON         3.13
 3         10         1.94         5   ANVIT RAO                      3.06
 4         46         0.0432       3   JACOB ALEXANDER LAVALLEY       2.96
 5          9         2.29         5   STEFANO LEE                    2.71
 6          2         3.78         6   DAKSHESH DARURI                2.22
 7         52         0.295        2.5 ETHAN GUO                      2.20
 8         51         1.03         2.5 TEJAS AYYAGARI                 1.47
 9         24         2.55         4   MICHAEL R ALDRICH              1.45
10         37         0.773        2   AMIYATOSH PWNANANDAM           1.23
# ℹ 54 more rows
arrange(performance_summary, (Difference))
# A tibble: 64 × 5
   player_num Expected_score new_total name               Difference
        <int>          <dbl>     <dbl> <chr>                   <dbl>
 1         25           6.28       3.5 LOREN SCHWIEBERT        -2.78
 2         30           6.02       3.5 GEORGE AVERY JONES      -2.52
 3         54           3.40       1   LARRY HODGE             -2.40
 4         42           5.01       3   JARED GE                -2.01
 5         31           5.09       3.5 RISHI SHETTY            -1.59
 6         35           4.96       3.5 JOSHUA DAVID LEE        -1.46
 7         45           4.37       3   DEREK YAN               -1.37
 8         43           4.33       3   ROBERT GLEN VASEY       -1.33
 9         64           2.27       1   BEN LI                  -1.27
10         33           4.64       3.5 JADE GE                 -1.14
# ℹ 54 more rows
ggplot(
  performance_summary,
  aes(x=player_num, y=Expected_score)
) +
  geom_line()+
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'

ggplot(
  score_recal,
  aes(x=player_num, y=new_total)
) +
  geom_line()+
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'

ggplot(
  performance_summary,
  aes(x=player_num, y=Difference)
) +
  geom_line()+
  geom_smooth(method = "lm")
`geom_smooth()` using formula = 'y ~ x'

9.Summary

The top 5 overperformed players are: ADITYA BAJAJ ZACHARY JAMES HOUGHTON ANVIT RAO JACOB ALEXANDER LAVALLEY STEFANO LEE


And the top 5 underperformed players are: LOREN SCHWIEBERT GEORGE AVERY JONES LARRY HODGE JARED GE RISHI SHETTY