#Approach In this assignment, I will extend the analysis conducted in Project 1 by leveraging the structured dataset previously created from the tournament results. Using the cleaned and organized player data, I will calculate each player’s expected score based on the Elo rating system.

The expected score will be computed using the standard Elo formula, which estimates the probability of a player scoring a point against each opponent based on the difference in their pre-tournament ratings. For each player, I will:

1 - Retrieve the ratings of all opponents faced during the tournament.

2 - Apply the Elo expected score formula to calculate the expected outcome for each individual match.

3 - Sum the expected values across all rounds to obtain the player’s total expected score.

4 - Compare the expected score to the player’s actual tournament points.

The difference between actual performance and expected performance will be used to determine whether a player overperformed (actual score greater than expected) or underperformed (actual score lower than expected).

Finally, I will identify and report:

raw link from github

file_local <- "tournamentinfo.txt"
file_url <- "https://raw.githubusercontent.com/japhet125/Project1-Data-Science/refs/heads/main/tournamentinfo.txt"
if (file.exists(file_local)){
  info <- readLines(file_local)
} else {
  info <- readLines(file_url)
}
## Warning in readLines(file_url): incomplete final line found on
## 'https://raw.githubusercontent.com/japhet125/Project1-Data-Science/refs/heads/main/tournamentinfo.txt'
#info
head(info)
## [1] "-----------------------------------------------------------------------------------------" 
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [4] "-----------------------------------------------------------------------------------------" 
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

process cleaning data

clean_lines <- info[!grepl("^-", info)]

clean_lines <- clean_lines[clean_lines != ""]

head(clean_lines)
## [1] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [2] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [3] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [4] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
## [5] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
## [6] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
#clean_lines

#cleaning data removing header

clean_lines <- clean_lines[!grepl("^\\s*Pair", clean_lines)]
clean_lines <- clean_lines[!grepl("^\\s*Num", clean_lines)]
head(clean_lines, 4)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [3] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [4] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"

#group every two rows

player_blocks <- split(clean_lines, ceiling(seq_along(clean_lines)/2))
player_blocks[[1]]
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

#Extracting fields from players

library(stringr)
block <- player_blocks[[1]]
row1 <- block[1]
row2 <- block[2]

#Extracting player number

player_num <- str_extract(row1, "^\\s*\\d+")
player_num
## [1] "    1"

#Extracting player name

name <- str_trim(str_extract(row1, "(?<=\\| ).*?(?=\\s+\\|)"))
name
## [1] "GARY HUA"

#Extracting total Points

points <- str_extract(row1, "\\d+\\.\\d")
points
## [1] "6.0"

#Extracting state

state <- str_extract(row2, "^[^|]+")
state
## [1] "   ON "

#Extracting pre-rating

pre_rating <- str_extract(row2, "R:\\s*\\d+")
pre_rating
## [1] "R: 1794"

#building the data frame

players_df <- data.frame()
for (block in player_blocks) {
  row1 <- block[1]
  row2 <- block[2]
  
  player_num <- as.numeric(str_extract(row1, "^\\s*\\d+"))
  
  name <- str_trim(str_extract(row1, "(?<=\\| ).*?(?=\\s+\\|)"))
  
  points <- as.numeric(str_extract(row1, "\\d+\\.\\d"))
  
  state <- str_trim(str_extract(row2, "^[^|]+"))
  
  pre_rating <- str_extract(row2, "R:\\s*\\d+")
  pre_rating <- as.numeric(str_extract(pre_rating, "\\d+"))
  
  opponents <- str_extract_all(row1, "(?<=\\s)\\d{1,2}(?=\\|)")[[1]]
  opponents <- as.numeric(opponents)
  
  players_df <- rbind(players_df, data.frame(
    player_num,
    name,
    state,
    points,
    pre_rating,
    opponents = I(list(opponents))
  ))
  
}
head(players_df)
##   player_num                name state points pre_rating    opponents
## 1          1            GARY HUA    ON    6.0       1794 39, 21, ....
## 2          2     DAKSHESH DARURI    MI    6.0       1553 63, 58, ....
## 3          3        ADITYA BAJAJ    MI    6.0       1384 8, 61, 2....
## 4          4 PATRICK H SCHILLING    MI    5.5       1716 23, 28, ....
## 5          5          HANSHI ZUO    MI    5.5       1655 45, 37, ....
## 6          6         HANSEN SONG    OH    5.0       1686 34, 29, ....

calculating the average opponent pre-rating

Look at their list of opponent IDs

Find those opponents in the dataframe

Get their ratings

Compute the average

Store it as a new column

players_df$avg_opponent_rating <- sapply(players_df$opponents, function(opp_ids) {
  opp_ratings <- players_df$pre_rating[
    players_df$player_num %in% opp_ids
  ]
  mean(opp_ratings)
})
players_df$avg_opponent_rating <- round(players_df$avg_opponent_rating, 0)
players_df[1, ]
##   player_num     name state points pre_rating    opponents avg_opponent_rating
## 1          1 GARY HUA    ON      6       1794 39, 21, ....                1605

#Exporting it to a .csv file

write.csv(
  players_df[, c("name", "state", "points", "pre_rating", "avg_opponent_rating")],
  "tournamentinfo_results.csv",
  row.names = FALSE
)

visualizing with a scatterplot

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggpubr)
players_df %>%
  ggplot(aes(x = avg_opponent_rating, y = pre_rating, color = state)) +
  geom_point(size = 3, alpha = 0.3) +
  geom_smooth(method = "lm") +
  stat_cor(aes(label = ..r.label..),  method =  "pearson", 
          label.x = min(players_df$avg_opponent_rating),
          label.y = max(players_df$pre_rating)) +
  labs(
    title = "Player Pre-Rating vs Average Opponent Rating",
    x = "Average Opponent Pre-Rating",
    y = "Player Pre-Rating"
  )+
  theme_minimal()
## Warning: The dot-dot notation (`..r.label..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(r.label)` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

#getting the top 10 players by total point

top10 <- players_df %>%
  arrange(desc(points), desc(pre_rating)) %>%
  select(name, points, pre_rating, avg_opponent_rating) %>%
  head(10)
top10
##                   name points pre_rating avg_opponent_rating
## 1             GARY HUA    6.0       1794                1605
## 2      DAKSHESH DARURI    6.0       1553                1469
## 3         ADITYA BAJAJ    6.0       1384                1564
## 4  PATRICK H SCHILLING    5.5       1716                1574
## 5           HANSHI ZUO    5.5       1655                1501
## 6          HANSEN SONG    5.0       1686                1519
## 7    GARY DEE SWATHELL    5.0       1649                1372
## 8     EZEKIEL HOUGHTON    5.0       1641                1468
## 9          STEFANO LEE    5.0       1411                1523
## 10           ANVIT RAO    5.0       1365                1554

#Calculating the average rating by state

avg_by_state <- players_df %>%
  group_by(state) %>%
  summarise(avg_rating = mean(pre_rating, na.rm = TRUE)) %>%
  arrange(desc(avg_rating))
avg_by_state
## # A tibble: 3 × 2
##   state avg_rating
##   <chr>      <dbl>
## 1 OH         1686 
## 2 ON         1454.
## 3 MI         1362

#visualize and analyse

avg_by_state %>%
  ggplot(aes(x = reorder(state, avg_rating), y = avg_rating, fill = state)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(avg_rating, 0)), hjust = -0.2, size = 4) +
  #coord_flip()+
  labs(
    title = "Average Pre-Rating by State",
    x = "State",
    y = "Average Pre-Rating"
  )+
  theme_minimal()

#installing packages

library(ggpubr)
library(ggplot2)
library(dplyr)
library(DBI)
library(RSQLite)
## Warning: package 'RSQLite' was built under R version 4.5.2

#Assignement 5B begin here

players_df$expected_score <- sapply(1:nrow(players_df), function(i){
  player_rating <- players_df$pre_rating[i]
  opponent_ids <- players_df$opponents[[i]]
  
  
  opponent_ratings <- players_df$pre_rating[
    players_df$player_num %in% opponent_ids
  ]
  
  expected_each <- 1 / (1 + 10^((opponent_ratings - player_rating)/400))

  sum(expected_each)
})

#computing over/under performance of each player

players_df$performance_diff <- players_df$points - players_df$expected_score

#selecting the 5 top performers

top_over <- players_df %>%
  arrange(desc(performance_diff)) %>%
  select(name, points, expected_score, performance_diff) %>%
  head(5)
top_over
##                       name points expected_score performance_diff
## 1             ADITYA BAJAJ    6.0     1.94508791         4.054912
## 2   ZACHARY JAMES HOUGHTON    4.5     1.37330887         3.126691
## 3                ANVIT RAO    5.0     1.94485405         3.055146
## 4 JACOB ALEXANDER LAVALLEY    3.0     0.04324981         2.956750
## 5     AMIYATOSH PWNANANDAM    3.5     0.77345290         2.726547

#top 5 underperformes

top_under <- players_df %>%
  arrange(performance_diff) %>%
  select(name, points, expected_score,  performance_diff) %>%
  head(5)
top_under
##                 name points expected_score performance_diff
## 1   LOREN SCHWIEBERT    3.5       6.275650        -2.775650
## 2 GEORGE AVERY JONES    3.5       6.018220        -2.518220
## 3           JARED GE    3.0       5.010416        -2.010416
## 4       RISHI SHETTY    3.5       5.092465        -1.592465
## 5   JOSHUA DAVID LEE    3.5       4.957890        -1.457890

#visualizing performance over plot The analysis reveals that some lower-rated players significantly outperformed expectations, suggesting either strong tournament form or underestimation by their pre-tournament rating. Conversely, several higher-rated players underperformed relative to their expected scores, indicating variance inherent in competitive tournaments.

players_df %>%
  ggplot(aes(x = expected_score, y = points)) + geom_point() +
  geom_abline(color = "blue") + labs(
    title = "Actual Score vs Expected Score",
    x = "Expected Score",
    y = "Actual Score"
  ) +
  theme_minimal()

#Conclusion

The expected score for each player was calculated using the standard Elo rating formula as defined by Arpad Elo and adopted by FIDE. The model estimates the probability of a player scoring a point against each opponent based on rating differences. Summing these probabilities across all rounds produces the total expected score. Comparing expected scores to actual performance allows identification of players who significantly overperformed or underperformed relative to their rating.