Loading the given Text File from raw data in GitHub repository

# Load the necessary library
library(stringr)
library(readr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Read the data from the URL
chess <- read.delim('https://raw.githubusercontent.com/LwinShwe/607Project1/main/tournament.txt', skip = 3, header = FALSE, stringsAsFactors=FALSE)

# visualize the first few rows of the data
head(chess)
##                                                                                          V1
## 1 -----------------------------------------------------------------------------------------
## 2     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 3    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 4 -----------------------------------------------------------------------------------------
## 5     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## 6    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |

Organizing Data

Raw Data is cleaned by removing extra lines, eliminating empty lines and taking off “-” hyphen from V1 column in between the text.A regular expression pattern is used to extract Player Names from V1 Column that matches at least a first and last name and empty string are removed.In addition, Player States are retrieved that matches two capital letters followed by a space and a pipe symbol “|”. The total number of points are caluculated that capture all possible decimal numbers and remove blank rows.

# Remove hyphens ("-") from the V1 column
chess$V1 <- str_replace_all(chess$V1, "-|\\n", "") # Remove hyphens ("-") from the V1 column

# Find player's names from the V1 column using regx and remove empty strings
PlayerNames <- unlist(str_extract_all(chess$V1, "\\w+[[:space:]]\\w+([[:space:]]\\w+)*", simplify = TRUE))
PlayerNames <- PlayerNames[PlayerNames != ""]
head(PlayerNames)
## [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
## [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"
# Find player's States from the V1 column using regx then remove a pipe symbol "|" and blank rows from the data
PlayerStates <- unlist(str_extract_all(chess$V1, "[A-Z][A-Z][[:space:]]\\|"))
PlayerStates <- str_split(PlayerStates, "[[:space:]]\\|", simplify = TRUE)
PlayerStates <- PlayerStates[, -ncol(PlayerStates)]
PlayerStates <- PlayerStates[PlayerStates != ""]
head(PlayerStates)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
# Find decimal numbers from the V1 column using regular expression pattern and remove blank rows
TotalPoints <- unlist(str_extract_all(chess$V1, "\\b\\d+\\.\\d+\\b", simplify = TRUE))
TotalPoints <- TotalPoints[TotalPoints != ""] 
head(TotalPoints)
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0"
# Find the Players Pre-Ratings that get numbers after R: and before any number of space and remove blank rows from the data
PreRatings <- unlist(str_extract_all(chess[,], "[R:]([[:space:]]+)([[:alnum:]]+)([[:space:]]*)", simplify=TRUE))
PreRatings <- unlist(str_extract_all(PreRatings, "\\d+[[:alnum:]]+", simplify=TRUE))
PreRatings <- unlist(str_extract_all(PreRatings, "\\d\\d\\d+", simplify=TRUE))
PreRatings <- PreRatings[!PreRatings[,] == "",]
PreRatings <- as.numeric(PreRatings)
head(PreRatings)
## [1] 1794 1553 1384 1716 1655 1686
# Extract the opponent strings
OpponentData <- unlist(str_extract_all(chess[,], "([\\|][A-Z]([[:space:]]+)\\d*[\\|])([A-Z]([[:space:]]+)\\d*[\\|])*", simplify=TRUE))
Opponents <- matrix(ncol=7)

# Find the individual Opponent Indexes into a matrix of 7 columns
Opponents <- unlist(str_extract_all(OpponentData[,], "\\d+", simplify=TRUE))
Opponents <- Opponents[rowSums(Opponents=="")!=ncol(Opponents), ]

Calculate Average PreRaing Chess Rating of Opponents

# Set an empty vector to store the Average Pre Chess Rating of Opponents
AvgPreRaing <- numeric()

# Loop through each row of Opponents

for (row in 1:nrow(Opponents)) {
  indexes <- Opponents[row, Opponents[row, ] != ""] # Get the opponent indexes for the current row excluding empty values
  indexes <- as.integer(indexes) # Convert the indexes to integers
  opponent_ratings <- PreRatings[indexes] # Extract corresponding pre-ratings for the indexes
  avg_prerating <- mean(opponent_ratings, na.rm = TRUE) # Calculate the average opponent rating for the current row
  AvgPreRaing <- c(AvgPreRaing, avg_prerating) # Append the average rating to the Rating Averages vector
}

# Display average opponent ratings
head(AvgPreRaing)
## [1] 1605.286 1469.286 1563.571 1573.571 1500.857 1518.714

Build a cleaned and organized data frame of “tournament.txt”

# Create the Tournament Results data frame 
Results <- data.frame(
  "Player Name" = PlayerNames,
  "Player State" = PlayerStates,
  "Total Number of Points" = TotalPoints,
  "Player's Pre-Rating" = PreRatings,
  "Average Pre Chess Rating of Opponents" = AvgPreRaing
)

#Writing CSV 
write.csv(Results, 'game-results.csv', TRUE)
## Warning in utils::write.table(Results, "game-results.csv", TRUE, col.names =
## NA, : appending column names to file

The “Results” data frame has 64 observations and 5 variables which is exported as game-results.csv file.

Extra Credit - ELO calculations

Based on difference in ratings between the chess players and each of their opponents in our Project 1 tournament, calculate each player’s expected score (e.g. 4.3) and the difference from their actual score (e.g 4.0). List the five players who most overperformed relative to their expected score, and the five players that most underperformed relative to their expected score.

You’ll find some small differences in different implementation of ELO formulas. You may use any reasonably-sourced formula, but please cite your source.

Statistical Analysis of the Elo Rating System in Chess

To calculate the expected score for each player against their opponents using the Elo rating system, statistical Framework for the Elo Rating System will be used.

In the Elo rating system, if Player \(A\) has an Elo rating that is 400 points greater than opponent Player \(B\), then Player \(A\) should be 10 times more likely to win the game. More generally, if Player \(A\) has rating \(R_A\) and Player \(B\) has rating \(R_B\), then the odds of Player \(A\) beating Player \(B\) are given by the following formula. \[ \text { Odds }(A \text { beats } B)=\frac{\operatorname{Pr}(A \text { beats } B)}{\operatorname{Pr}(B \text { beeats } A)}=\frac{\operatorname{Pr}(A \text { beats } B)}{1-\operatorname{Pr}(A \text { beats } B)}=10^{\frac{R_A-R_B}{400}} \]

Solving for \(\operatorname{Pr}(A\) beats \(B)\) gives expected Score: \[ \operatorname{Pr}(A \text { beats } B)=E_{A B}=\frac{1}{1+10^{\frac{R_B-R_A}{400}}} \]

This probability can also be interpreted as an expected score for Player \(A\) when playing against Player \(B\), shown as \(E_{A B}\) above. The three possible outcomes for \(A\) are win, lose, or draw, corresponding to scores of 1,0 , and 0.5 , respectively.

Elo Formula Source: https://chance.amstat.org/2020/09/chess

# Calculate expected scores
Results$Expected_Score <- 1 / (1 + 10^((Results$'Average.Pre.Chess.Rating.of.Opponents' - Results$'Player.s.Pre.Rating') / 400))

# Calculating the difference between actual and expected scores
Results$Difference <- round((as.numeric(Results$Total.Number.of.Points)) - (as.numeric(Results$Expected_Score)), digits = 2)

# Top 5 overperformers relative to expected score
top_overperformers <- head(Results[order(Results$Difference, decreasing = TRUE),], 5)
top_overperformers
##           Player.Name Player.State Total.Number.of.Points Player.s.Pre.Rating
## 3        ADITYA BAJAJ           MI                    6.0                1384
## 2     DAKSHESH DARURI           MI                    6.0                1553
## 1            GARY HUA           ON                    6.0                1794
## 4 PATRICK H SCHILLING           MI                    5.5                1716
## 5          HANSHI ZUO           MI                    5.5                1655
##   Average.Pre.Chess.Rating.of.Opponents Expected_Score Difference
## 3                              1563.571      0.2623681       5.74
## 2                              1469.286      0.6181960       5.38
## 1                              1605.286      0.7476894       5.25
## 4                              1573.571      0.6942119       4.81
## 5                              1500.857      0.7083363       4.79
# Top 5 underperformers relative to expected score
top_underperformers <- head(Results[order(Results$Difference ),], 5)
top_underperformers
##             Player.Name Player.State Total.Number.of.Points Player.s.Pre.Rating
## 62        ASHWIN BALAJI           MI                    1.0                1530
## 64               BEN LI           MI                    1.0                1163
## 63 THOMAS JOSEPH HOSMER           MI                    1.0                1175
## 60           JULIA SHEN           MI                    1.5                 967
## 61        JEZZEL FARKAS           ON                    1.5                 955
##    Average.Pre.Chess.Rating.of.Opponents Expected_Score Difference
## 62                              1186.000      0.8787050       0.12
## 64                              1263.000      0.3599350       0.64
## 63                              1350.200      0.2672672       0.73
## 60                              1330.200      0.1099994       1.39
## 61                              1327.286      0.1049826       1.40
# plot graph of the actual score and the expected score 
ggplot(data = Results, aes(x=Expected_Score, y=Total.Number.of.Points, color=Player.State)) + geom_point()

The top 5 over performers were: 1. ADITYA BAJAJ 2. DAKSHESH DARURI 3. GARY HUA 4. PATRICK H SCHILLING 5. HANSHI ZUO.

The top 5 under performers were: 1. ASHWIN BALAJI 2. BEN LI 3. THOMAS JOSEPH HOSMER 4. JULIA SHEN 5. JEZZEL FARKAS.