The Elo rating system is a method for calculating the relative skill levels of players in zero-sum games such as chess. It is named after its creator Arpad Elo, a Hungarian-American physics professor.The Elo system was originally invented as an improved chess-rating system over the previously used Harkness system, but is also used as a rating system for multi-player competition in a number of video games, American football, basketball and other games.To learn more, please visit the wiki page here.
Step 1 is to load the raw data from my github library where we have 64 chess player’s details of their performance in a chess tournament. To understand the dataset better, visit the Youtube link here to hear Professor Andy explain the data schema.
knitr::opts_chunk$set(eval = TRUE, results = FALSE)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.0 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
library(knitr)
chess_table_raw <- read.delim("https://raw.githubusercontent.com/BharaniNittala/DATA-607/master/tournamentinfo.txt", "|",header = FALSE)
knitr:: kable(head(chess_table_raw,10))
chess_table_mod <- chess_table_raw[-seq(1, NROW(chess_table_raw), by = 3),]
row.names(chess_table_mod) <- NULL
chess_table_mod <- chess_table_mod[-(1:2), , drop = FALSE]
#Reproducibility of the code, in case in the future the number of rows in the dataset changes. In here we are taking the count of rows in the dataset provided
len <- nrow(chess_table_mod)
even <- seq(0,len,2)
odd <- seq(1,len,2)
#Merging the datasets on the rowindexes. We need to transpose alternate rows to columns for easy access. But, let's do it in an innovative way!
chess_table_odd <- chess_table_mod[odd,]
row.names(chess_table_odd) <- NULL
chess_table_even <- chess_table_mod[even,]
row.names(chess_table_even) <- NULL
chess_table_merged <- transform(merge(chess_table_even[ , c("V1", "V2")], chess_table_odd[,!(colnames(chess_table_odd) %in% c("V1","V11"))], by = 0,all=TRUE))
#Not required to state the column names here, but then it's satisfying to look at this dataset with proper names
colnames(chess_table_merged) <- c("Player_no","State","Rating","Player_Name","Total_Points","Round_1","Round_2","Round_3","Round_4","Round_5","Round_6","Round_7")
library(dplyr)
library(tidyr)
#Separate the Player numbers and their result in this step
chess_table_v2 <- chess_table_merged %>% separate(Round_1,c("R1_Result", "R1_Player")) %>% separate(Round_2,c("R2_Result", "R2_Player")) %>% separate(Round_3,c("R3_Result", "R3_Player")) %>% separate(Round_4,c("R4_Result", "R4_Player")) %>% separate(Round_5,c("R5_Result", "R5_Player")) %>% separate(Round_6,c("R6_Result", "R6_Player")) %>% separate(Round_7,c("R7_Result", "R7_Player"))
#In here, let's extract previous rating of the player
chess_table_v3 <- chess_table_v2 %>% separate(Rating,c("Code", "Ratings"),"R:")
#In here, let's extract previous rating of the player
chess_table_v4 <- chess_table_v3 %>% separate(Ratings,c("Pre_Rating", "Post_Rating"),"->")
#We do not need provisional rating for this exercise, but would have been fun if we were to impute based on threshold on certain provisional matches
chess_table_v5 <- chess_table_v4 %>% separate(Pre_Rating,c("Pre_Rating", "Provisional_Pre"),"P") %>% separate(Post_Rating,c("Post_Rating", "Provisional_Post"),"P")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 54 rows [1, 2, 3,
## 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 23, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 56 rows [1, 2, 3,
## 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, ...].
# For now, let's curb our enthusiasm to deep dive into provisional
chess_table_v5 <- chess_table_v5[, !(colnames(chess_table_v5) %in% c("Provisional_Pre","Provisional_Post","Code"))]
# Let's take what we need - only the columns required from V5 version from above exercise
player_rating <- chess_table_v5[,(colnames(chess_table_v5) %in% c("Player_no","Pre_Rating"))]
# Convert the characters to numerics to perform average and other aggregate functions later
player_rating[] <- lapply(player_rating, function(x) as.numeric(as.character(x)))
# Verify the data type
str(player_rating)
# Now, sort function will work perfectly!
player_rating <- player_rating[order(player_rating$Player_no),]
# I will maintain 2 separate data frames. One with ratings and other one with information on opponent players. Then will merge these two to get ratings of opponent players. Think something like 1 fact table and 1 dimension table
player_opp <- chess_table_v5[ , grepl( "Player" , names( chess_table_v5 ) ) ][,-2]
player_opp[] <- lapply(player_opp, function(x) as.numeric(as.character(x)))
# Obtaining ratings for each of the opponent player from fact table 'player_rating'. This demanded muliple merges. I could have used 'reduce' function but then I had to map to different rows. Something to explore!
Agg_ratings <- merge(x= player_opp,y= player_rating, by.x = "R1_Player",by.y = "Player_no", all.x = TRUE) %>% merge(.,y= player_rating, by.x = "R2_Player",by.y = "Player_no", all.x = TRUE) %>% merge(.,y= player_rating, by.x = "R3_Player",by.y = "Player_no", all.x = TRUE)%>% merge(.,y= player_rating, by.x = "R4_Player",by.y = "Player_no", all.x = TRUE) %>% merge(.,y= player_rating, by.x = "R5_Player",by.y = "Player_no", all.x = TRUE)%>% merge(.,y= player_rating, by.x = "R6_Player",by.y = "Player_no", all.x = TRUE)%>% merge(.,y= player_rating, by.x = "R7_Player",by.y = "Player_no", all.x = TRUE)
## Warning in merge.data.frame(., y = player_rating, by.x = "R4_Player", by.y =
## "Player_no", : column names 'Pre_Rating.x', 'Pre_Rating.y' are duplicated in the
## result
## Warning in merge.data.frame(., y = player_rating, by.x = "R5_Player", by.y =
## "Player_no", : column names 'Pre_Rating.x', 'Pre_Rating.y' are duplicated in the
## result
## Warning in merge.data.frame(., y = player_rating, by.x = "R6_Player", by.y =
## "Player_no", : column names 'Pre_Rating.x', 'Pre_Rating.y', 'Pre_Rating.x',
## 'Pre_Rating.y' are duplicated in the result
## Warning in merge.data.frame(., y = player_rating, by.x = "R7_Player", by.y =
## "Player_no", : column names 'Pre_Rating.x', 'Pre_Rating.y', 'Pre_Rating.x',
## 'Pre_Rating.y' are duplicated in the result
#Preparing the dataframe to take average across the rows to calculate Elo score
Avg_ratings <- Agg_ratings[,!(grepl( "_Player" , names( Agg_ratings )))]
#Calculating row means
Avg_ratings$Mean_Pre_Rating <-rowMeans(Avg_ratings[,-1],na.rm = TRUE)
# Leveraged round_df code from Stackoverflow https://stackoverflow.com/questions/29875914/rounding-values-in-a-dataframe-in-r
round_df <- function(x, digits) {
# round all numeric variables
# x: data frame
# digits: number of digits to round
numeric_columns <- sapply(x, mode) == 'numeric'
x[numeric_columns] <- round(x[numeric_columns], digits)
x
}
Avg_ratings <- round_df(Avg_ratings,0)[,c("Player_no","Mean_Pre_Rating")]
#Stitch together required columns from the tables to generate CSV file
Final_data <- merge(x= chess_table_v5[ , c("Player_no","Player_Name", "State", "Total_Points", "Pre_Rating")],y= Avg_ratings[,c("Player_no","Mean_Pre_Rating")], by = "Player_no")
Final_data$Player_no <- as.numeric(as.character(Final_data$Player_no))
Final_data$Pre_Rating <- as.numeric(as.character(Final_data$Pre_Rating))
Final_data <- Final_data[order(Final_data$Player_no),]
knitr:: kable(head(Final_data,10))
| Player_no | Player_Name | State | Total_Points | Pre_Rating | Mean_Pre_Rating | |
|---|---|---|---|---|---|---|
| 1 | 1 | GARY HUA | ON | 6.0 | 1794 | 1605 |
| 12 | 2 | DAKSHESH DARURI | MI | 6.0 | 1553 | 1469 |
| 23 | 3 | ADITYA BAJAJ | MI | 6.0 | 1384 | 1564 |
| 34 | 4 | PATRICK H SCHILLING | MI | 5.5 | 1716 | 1574 |
| 45 | 5 | HANSHI ZUO | MI | 5.5 | 1655 | 1501 |
| 56 | 6 | HANSEN SONG | OH | 5.0 | 1686 | 1519 |
| 62 | 7 | GARY DEE SWATHELL | MI | 5.0 | 1649 | 1372 |
| 63 | 8 | EZEKIEL HOUGHTON | MI | 5.0 | 1641 | 1468 |
| 64 | 9 | STEFANO LEE | ON | 5.0 | 1411 | 1523 |
| 2 | 10 | ANVIT RAO | MI | 5.0 | 1365 | 1554 |
#Writing CSV file to the current working directory
write.csv(Final_data,"Chess_tournament.csv", row.names = FALSE)
Using the provided ELO calculation, determine each player’s expected result (number of points),based on his or her pre-tournament rating, and the average pre-tournament rating for all of the player’s opponents. Which player scored the most points relative to his or her expected result?
#Calculating expected score for the players
Final_data$Exp_score <- (1 / (1 + 10^((Final_data$Pre_Rating - Final_data$Mean_Pre_Rating)/400)))
# Assuming The rating of player is updated using the formula rating1 = rating1 + K*(Actual Score – Expected score_ where 'k' weighting factor is assumed as '20'
Final_data$Exp_pts <- Final_data$Pre_Rating + 20*(1-Final_data$Exp_score)
#to compare with final score, let's fetch it from raw data
Final_rating <- chess_table_v5[ , c( "Player_no","Post_Rating")]
Final_rating[] <- lapply(Final_rating, function(x) as.numeric(as.character(x)))
Final_datav2 <-merge(x= Final_data,y= Final_rating, by = "Player_no", all.x = TRUE)
Final_datav2$Score_delta <- Final_datav2$Post_Rating - Final_datav2$Exp_pts
Player_beats_odds <- Final_datav2[order(-Final_datav2$Score_delta),][1,]
knitr:: kable(Player_beats_odds)
| Player_no | Player_Name | State | Total_Points | Pre_Rating | Mean_Pre_Rating | Exp_score | Exp_pts | Post_Rating | Score_delta | |
|---|---|---|---|---|---|---|---|---|---|---|
| 46 | 46 | JACOB ALEXANDER LAVALLEY | MI | 3.0 | 377 | 1358 | 0.9964846 | 377.0703 | 1076 | 698.9297 |