Project 1 - Data Analysis

Overview

The Elo rating system is a method for calculating the relative skill levels of players in zero-sum games such as chess. It is named after its creator Arpad Elo, a Hungarian-American physics professor.The Elo system was originally invented as an improved chess-rating system over the previously used Harkness system, but is also used as a rating system for multi-player competition in a number of video games, American football, basketball and other games.To learn more, please visit the wiki page here.

Step 1:Let’s load required libraries and raw data

Step 1 is to load the raw data from my github library where we have 64 chess player’s details of their performance in a chess tournament. To understand the dataset better, visit the Youtube link here to hear Professor Andy explain the data schema.

knitr::opts_chunk$set(eval = TRUE, results = FALSE)
library(tidyverse)

## -- Attaching packages -------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.0     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ----------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(readr)
library(knitr)

chess_table_raw <- read.delim("https://raw.githubusercontent.com/BharaniNittala/DATA-607/master/tournamentinfo.txt", "|",header = FALSE)
knitr:: kable(head(chess_table_raw,10))

But wait, do I need the headers ?

chess_table_mod <- chess_table_raw[-seq(1, NROW(chess_table_raw), by = 3),]
row.names(chess_table_mod) <- NULL
chess_table_mod <- chess_table_mod[-(1:2), , drop = FALSE]

Step 2: Break the task into sub-tasks

1)Understand data format

2)Keep only the required parts

3)Logic to calculate rating

Understand data format

#Reproducibility of the code, in case in the future the number of rows in the dataset changes. In here we are taking the count of rows in the dataset provided
len <- nrow(chess_table_mod) 
even <- seq(0,len,2)
odd <- seq(1,len,2)

#Merging the datasets on the rowindexes. We need to transpose alternate rows to columns for easy access. But, let's do it in an innovative way!
chess_table_odd <- chess_table_mod[odd,]
row.names(chess_table_odd) <- NULL
chess_table_even <- chess_table_mod[even,]
row.names(chess_table_even) <- NULL
chess_table_merged <- transform(merge(chess_table_even[ , c("V1", "V2")], chess_table_odd[,!(colnames(chess_table_odd) %in% c("V1","V11"))], by = 0,all=TRUE))

#Not required to state the column names here, but then it's satisfying to look at this dataset with proper names
colnames(chess_table_merged) <- c("Player_no","State","Rating","Player_Name","Total_Points","Round_1","Round_2","Round_3","Round_4","Round_5","Round_6","Round_7")

Keep only the required parts

library(dplyr)
library(tidyr)

#Separate the Player numbers and their result in this step
chess_table_v2 <- chess_table_merged %>% separate(Round_1,c("R1_Result", "R1_Player")) %>% separate(Round_2,c("R2_Result", "R2_Player")) %>% separate(Round_3,c("R3_Result", "R3_Player")) %>% separate(Round_4,c("R4_Result", "R4_Player")) %>% separate(Round_5,c("R5_Result", "R5_Player")) %>% separate(Round_6,c("R6_Result", "R6_Player")) %>% separate(Round_7,c("R7_Result", "R7_Player"))

#In here, let's extract previous rating of the player  
chess_table_v3 <- chess_table_v2  %>% separate(Rating,c("Code", "Ratings"),"R:")

#In here, let's extract previous rating of the player 
chess_table_v4 <- chess_table_v3  %>% separate(Ratings,c("Pre_Rating", "Post_Rating"),"->")

#We do not need provisional rating for this exercise, but would have been fun if we were to impute based on threshold on certain provisional matches 
chess_table_v5 <- chess_table_v4  %>% separate(Pre_Rating,c("Pre_Rating", "Provisional_Pre"),"P") %>% separate(Post_Rating,c("Post_Rating", "Provisional_Post"),"P")

## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 54 rows [1, 2, 3,
## 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 23, ...].

## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 56 rows [1, 2, 3,
## 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, ...].

# For now, let's curb our enthusiasm to deep dive into provisional 
chess_table_v5 <- chess_table_v5[, !(colnames(chess_table_v5) %in% c("Provisional_Pre","Provisional_Post","Code"))]

Understand data format

# Let's take what we need - only the columns required from V5 version from above exercise
player_rating <- chess_table_v5[,(colnames(chess_table_v5) %in% c("Player_no","Pre_Rating"))]

# Convert the characters to numerics to perform average and other aggregate functions later
player_rating[] <- lapply(player_rating, function(x) as.numeric(as.character(x)))

# Verify the data type
str(player_rating)

# Now, sort function will work perfectly!
player_rating <- player_rating[order(player_rating$Player_no),]

# I will maintain 2 separate data frames. One with ratings and other one with information on opponent players. Then will merge these two to get ratings of opponent players. Think something like 1 fact table and 1 dimension table
player_opp <- chess_table_v5[ , grepl( "Player" , names( chess_table_v5 ) ) ][,-2]
player_opp[] <- lapply(player_opp, function(x) as.numeric(as.character(x)))

Logic to calculate rating

# Obtaining ratings for each of the opponent player from fact table 'player_rating'. This demanded muliple merges. I could have used 'reduce' function but then I had to map to different rows. Something to explore!
Agg_ratings <-  merge(x= player_opp,y= player_rating, by.x = "R1_Player",by.y = "Player_no", all.x = TRUE) %>% merge(.,y= player_rating, by.x = "R2_Player",by.y = "Player_no", all.x = TRUE) %>% merge(.,y= player_rating, by.x = "R3_Player",by.y = "Player_no", all.x = TRUE)%>% merge(.,y= player_rating, by.x = "R4_Player",by.y = "Player_no", all.x = TRUE) %>% merge(.,y= player_rating, by.x = "R5_Player",by.y = "Player_no", all.x = TRUE)%>% merge(.,y= player_rating, by.x = "R6_Player",by.y = "Player_no", all.x = TRUE)%>% merge(.,y= player_rating, by.x = "R7_Player",by.y = "Player_no", all.x = TRUE)

## Warning in merge.data.frame(., y = player_rating, by.x = "R4_Player", by.y =
## "Player_no", : column names 'Pre_Rating.x', 'Pre_Rating.y' are duplicated in the
## result

## Warning in merge.data.frame(., y = player_rating, by.x = "R5_Player", by.y =
## "Player_no", : column names 'Pre_Rating.x', 'Pre_Rating.y' are duplicated in the
## result

## Warning in merge.data.frame(., y = player_rating, by.x = "R6_Player", by.y =
## "Player_no", : column names 'Pre_Rating.x', 'Pre_Rating.y', 'Pre_Rating.x',
## 'Pre_Rating.y' are duplicated in the result

## Warning in merge.data.frame(., y = player_rating, by.x = "R7_Player", by.y =
## "Player_no", : column names 'Pre_Rating.x', 'Pre_Rating.y', 'Pre_Rating.x',
## 'Pre_Rating.y' are duplicated in the result

#Preparing the dataframe to take average across the rows to calculate Elo score
Avg_ratings <- Agg_ratings[,!(grepl( "_Player" , names( Agg_ratings )))]

#Calculating row means
Avg_ratings$Mean_Pre_Rating <-rowMeans(Avg_ratings[,-1],na.rm = TRUE)

# Leveraged round_df code from Stackoverflow https://stackoverflow.com/questions/29875914/rounding-values-in-a-dataframe-in-r
round_df <- function(x, digits) {
    # round all numeric variables
    # x: data frame 
    # digits: number of digits to round
    numeric_columns <- sapply(x, mode) == 'numeric'
    x[numeric_columns] <-  round(x[numeric_columns], digits)
    x
}

Avg_ratings <- round_df(Avg_ratings,0)[,c("Player_no","Mean_Pre_Rating")]

Step 3: Conclusion

#Stitch together required columns from the tables to generate CSV file
 Final_data <-  merge(x= chess_table_v5[ , c("Player_no","Player_Name", "State", "Total_Points", "Pre_Rating")],y= Avg_ratings[,c("Player_no","Mean_Pre_Rating")], by = "Player_no")

Final_data$Player_no <- as.numeric(as.character(Final_data$Player_no))
Final_data$Pre_Rating <- as.numeric(as.character(Final_data$Pre_Rating))

Final_data <- Final_data[order(Final_data$Player_no),]
knitr:: kable(head(Final_data,10))

	Player_no	Player_Name	State	Total_Points	Pre_Rating	Mean_Pre_Rating
1	1	GARY HUA	ON	6.0	1794	1605
12	2	DAKSHESH DARURI	MI	6.0	1553	1469
23	3	ADITYA BAJAJ	MI	6.0	1384	1564
34	4	PATRICK H SCHILLING	MI	5.5	1716	1574
45	5	HANSHI ZUO	MI	5.5	1655	1501
56	6	HANSEN SONG	OH	5.0	1686	1519
62	7	GARY DEE SWATHELL	MI	5.0	1649	1372
63	8	EZEKIEL HOUGHTON	MI	5.0	1641	1468
64	9	STEFANO LEE	ON	5.0	1411	1523
2	10	ANVIT RAO	MI	5.0	1365	1554

#Writing CSV file to the current working directory
write.csv(Final_data,"Chess_tournament.csv", row.names = FALSE)

Step 4: The Big question!

Using the provided ELO calculation, determine each player’s expected result (number of points),based on his or her pre-tournament rating, and the average pre-tournament rating for all of the player’s opponents. Which player scored the most points relative to his or her expected result?

#Calculating expected score for the players
Final_data$Exp_score <- (1 / (1 + 10^((Final_data$Pre_Rating - Final_data$Mean_Pre_Rating)/400)))
# Assuming The rating of player is updated using the formula rating1 = rating1 + K*(Actual Score – Expected score_ where 'k' weighting factor is assumed as '20'
Final_data$Exp_pts <-  Final_data$Pre_Rating  + 20*(1-Final_data$Exp_score)

#to compare with final score, let's fetch it from raw data

Final_rating <-  chess_table_v5[ , c( "Player_no","Post_Rating")]
Final_rating[] <- lapply(Final_rating, function(x) as.numeric(as.character(x)))

Final_datav2 <-merge(x= Final_data,y= Final_rating, by = "Player_no", all.x = TRUE)

Final_datav2$Score_delta <- Final_datav2$Post_Rating - Final_datav2$Exp_pts

Player_beats_odds <- Final_datav2[order(-Final_datav2$Score_delta),][1,]
knitr:: kable(Player_beats_odds)

	Player_no	Player_Name	State	Total_Points	Pre_Rating	Mean_Pre_Rating	Exp_score	Exp_pts	Post_Rating	Score_delta
46	46	JACOB ALEXANDER LAVALLEY	MI	3.0	377	1358	0.9964846	377.0703	1076	698.9297

Project 1 - Data Analysis

Bharani Nittala

2020-09-19

Overview

Step 1:Let’s load required libraries and raw data

But wait, do I need the headers ?

Step 2: Break the task into sub-tasks

1)Understand data format

2)Keep only the required parts

3)Logic to calculate rating

Understand data format

Keep only the required parts

Understand data format

Logic to calculate rating

Step 3: Conclusion

Step 4: The Big question!

Take a bow “JACOB ALEXANDER LAVALLEY”, that’s indeed a splendid performance!!