Assignment 5b

library(stringr)
library(tidyr)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Defining ELO formula

calc_expected <- function(p_rating, o_rating) {
  if (is.na(o_rating)) return(0)
  1 / (1 + 10^((o_rating - p_rating) / 400))
}

Getting the data

# Loading and cleaning the data
data <- readLines("https://raw.githubusercontent.com/Jeovany97/Data-607/refs/heads/main/Project%201/tournamentinfo.txt", warn = FALSE)

clean_data <- data[!str_detect(data, "^-+$")]
clean_data <- clean_data[-(1:2)]
head(clean_data)
[1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
[2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
[3] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
[4] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
[5] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
[6] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
#splitting each line into seperate list for easier use
name_point <- clean_data[seq(1, length(clean_data), 2)]
state_rating <- clean_data[seq(2, length(clean_data), 2)]

Getting the data on next from the txt

name <- str_trim(str_extract(name_point, "(\\s?[A-Z]{2,}\\s?){2,}+"))
player_id <- as.numeric(str_extract(name_point, "\\d+"))
previous_rating <- as.numeric(str_extract(state_rating, "(?<=R:\\s{0,5})\\d+"))
opponents_id <- data.frame(id = player_id, rating = previous_rating)
opponents_faced <- tibble(raw = name_point) %>%
  mutate(
    player_id = as.numeric(str_extract(raw, "^\\s*\\d+")),
    opponent_id = str_extract_all(raw, "(?<=[WLD]\\s{1,5})\\d+")
  ) %>%
  select(player_id, opponent_id)

actual_scores <- as.numeric(str_extract(name_point, "\\d+\\.\\d"))
rating_lookup <- opponents_id %>% rename(opp_rating = rating)

performance_df <- opponents_faced %>%
  unnest(opponent_id) %>%
  mutate(opponent_id = as.numeric(opponent_id)) %>%
  # Join to get the rating of each opponent
  left_join(rating_lookup, by = c("opponent_id" = "id")) %>%
  # Calculate elo using the elo function made earlier
  rowwise() %>%
  mutate(expected = calc_expected(previous_rating[player_id == player_id][1], opp_rating)) %>%
  group_by(player_id) %>%
  summarise(
    Player_Name = name[unique(player_id)],
    Actual_Score = actual_scores[unique(player_id)],
    Expected_Score = round(sum(expected, na.rm = TRUE), 2),
    Difference = round(Actual_Score - Expected_Score, 2)
  )

overperformers <- performance_df %>%
  arrange(desc(Difference)) %>% 
  head(5)
underperformers <- performance_df %>%
  arrange(Difference) %>% 
  head(5)
overperformers
# A tibble: 5 × 5
  player_id Player_Name     Actual_Score Expected_Score Difference
      <dbl> <chr>                  <dbl>          <dbl>      <dbl>
1         3 ADITYA BAJAJ             6             5.03       0.97
2         1 GARY HUA                 6             5.16       0.84
3         2 DAKSHESH DARURI          6             5.5        0.5 
4         4 PATRICK                  5.5           5.32       0.18
5         5 HANSHI ZUO               5.5           5.38       0.12
underperformers
# A tibble: 5 × 5
  player_id Player_Name       Actual_Score Expected_Score Difference
      <dbl> <chr>                    <dbl>          <dbl>      <dbl>
1        64 BEN LI                     1             6.31      -5.31
2        61 JEZZEL FARKAS              1.5           6.43      -4.93
3        51 TEJAS AYYAGARI             2.5           6.36      -3.86
4        43 ROBERT GLEN VASEY          3             6.61      -3.61
5        59 SEAN                       2             5.61      -3.61