Loading libraries to use
library(tidyverse)
library(ggplot2)
library(dplyr)
library(plyr)
library(stringr)
library(readr)
library(data.table)
library(visdat)
library(scales)Actions Performed for Cleaning and Manipulation
Read in text file
View Structure/Glimpse the Data/Search for Missing values
Reshape the dataset to drop junk rows, column containing all NA values
Rename columns with accurate variable names
Subset columns and rows to pull out true data points from junk data through the use of select() and filtering via regular expressions
Create new columns/variables to hold Pre-Tournament Rating Score and Post-Tournament Rating score of contestants
Collapse “Round 1”- “Round 7” columns into single column/variable “Round” to track the round number
Create new columns/variables to hold Game Outcome and Opponent Pair Id from each round played by contestant
Join transformed subsets by pair id into single data set
Match opponent pair id of each round to their contestant pre-score rating through join
Perform vectorized aggregate calculation of opponent pre-rating average by contestant id and save result as new dataset
Match opponent pre-rating average to contestant through join
Subset to only variables requested and reorder columns to order requested to create final dataframe
Write final dataframe to new csv file and specify path to save file in and file name
Extra Credit -
ELO performance rating calculation,
FIDE performance rating calculation
Importing the Data
tournament <- read.table('https://raw.githubusercontent.com/cassie-boylan/DATA-607/main/tournamentinfo.txt',
stringsAsFactors = FALSE,
sep = "|",
nrows = -1,
skip = 1,
fill = TRUE,
strip.white = TRUE,
comment.char = "",
allowEscapes = TRUE)Introducting the Data
The dataset used in this exercise is a txt file containing the information of 64 contestants in a chess tournament: their name, state, pre-tournament rating score and the rounds played - including the opponents contestants played in each round and whether they won, lost, drew or if the round was not played. The data, while having some structure, is not maintained in a tidy or tabular way.
I will be utilizing the stringr package, regular expressions, and tidyr verbs to transform the data of this file into a consistent tabular format. I will be utilizing dplyr verbs to subset variables and observations, to create new columns to hold the additional variables established, and to perform mutating joins on the cross tables established per subsetting.
Once the data has been standardized so that each variable of the dataset is in its own column, and each contestant its own observation/row, I will perform calculations to determine the avg pre-rating score of all opponents that each contestant faces in the tournament. I will also calculate the ELO performance rating of each contestant based on their wins/losses/opponent performance rating as well as the FIDE performance rating based on their total points/opponent performance rating and number of games played.
Assessing Data Structure & Tidiness
vis_miss(tournament)head(tournament)## V1
## 1 Pair
## 2 Num
## 3 -----------------------------------------------------------------------------------------
## 4 1
## 5 ON
## 6 -----------------------------------------------------------------------------------------
## V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 Player Name Total Round Round Round Round Round Round Round
## 2 USCF ID / Rtg (Pre->Post) Pts 1 2 3 4 5 6 7
## 3
## 4 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7 D 12 D 4
## 5 15445895 / R: 1794 ->1817 N:2 W B W B W B W
## 6
## V11
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
str(tournament)## 'data.frame': 195 obs. of 11 variables:
## $ V1 : chr "Pair" "Num" "-----------------------------------------------------------------------------------------" "1" ...
## $ V2 : chr "Player Name" "USCF ID / Rtg (Pre->Post)" "" "GARY HUA" ...
## $ V3 : chr "Total" "Pts" "" "6.0" ...
## $ V4 : chr "Round" "1" "" "W 39" ...
## $ V5 : chr "Round" "2" "" "W 21" ...
## $ V6 : chr "Round" "3" "" "W 18" ...
## $ V7 : chr "Round" "4" "" "W 14" ...
## $ V8 : chr "Round" "5" "" "W 7" ...
## $ V9 : chr "Round" "6" "" "D 12" ...
## $ V10: chr "Round" "7" "" "D 4" ...
## $ V11: logi NA NA NA NA NA NA ...
Reshape / Rename Tournament Dataset
Reshape Data Table
tournament_clean <- tournament[-c(1:3), -11]Reindex Data Table
rownames(tournament_clean) <- 1:nrow(tournament_clean)Set Column Names
names(tournament_clean)[1] <- "state"
names(tournament_clean)[2] <- "name"
names(tournament_clean)[3] <- "total_points"
names(tournament_clean)[4:10] <- paste0("R", 1:7)Subset Variables & Observations
I am filtering to pick out the rows with actual data values via the use of regular expressions
contestants <- tournament_clean %>%
filter(grepl("[0-9]",state)) %>%
transmute(
id = as.numeric(state),
name,
total_points = as.numeric(total_points))score_rtgs <- tournament_clean %>%
select(state, pre_score_raw = name,) %>%
filter(state %in% c(grep("^[A-Z]+",state, value=TRUE)))rounds_tmp <- tournament_clean %>%
select(id = state, starts_with("R")) %>%
filter(grepl("[0-9]",id)) %>%
gather(round_no, outcome_tmp, R1:R7)Establish New Columns to Hold Each Identified Variable
score_rtgs <- score_rtgs %>% separate(pre_score_raw, into=c("USCF ID","Rtg (Pre->Post)"),sep="/")
score_rtgs <- score_rtgs %>% separate("Rtg (Pre->Post)", into=c("rtg_pre","rtg_post"), sep="->")Perform String Extractions
I am utilizing stringr verbs and regular expressions to extract actual data value contained in cell
Perform Column DataType Coercing for Variables that are Numeric
I am utilizing transmute & as.datatype() to coerce variables to the datatype they should be so they can be utilized in subsequent calculations
score_rtgs <- score_rtgs %>%
mutate(rtg_pre = str_extract(rtg_pre,"[0-9]{3,4}"),
rtg_post = str_extract(rtg_post,"[0-9]{3,4}"))
score_rtgs <- score_rtgs %>%
transmute(
state,
rtg_pre = as.numeric(rtg_pre),
rtg_post = as.numeric(rtg_post))
score_rtgs <- tibble::rowid_to_column(score_rtgs, "id")
str(score_rtgs)## 'data.frame': 64 obs. of 4 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ state : chr "ON" "MI" "MI" "MI" ...
## $ rtg_pre : num 1794 1553 1384 1716 1655 ...
## $ rtg_post: num 1817 1663 1640 1744 1690 ...
rounds_tmp <- rounds_tmp %>%
transmute(id = as.numeric(id),
round_no = str_extract(round_no,"\\d+"),
outcome = str_extract(outcome_tmp,"^[A-Z]{1}"),
opp_id = as.numeric(str_extract(outcome_tmp,"\\d+")
))
str(rounds_tmp)## 'data.frame': 448 obs. of 4 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ round_no: chr "1" "1" "1" "1" ...
## $ outcome : chr "W" "W" "L" "W" ...
## $ opp_id : num 39 63 8 23 45 34 57 3 25 16 ...
Merge Clean Data Subsets
new_df <- inner_join(contestants,score_rtgs, by="id")
setcolorder(new_df,c("id","state","name","total_points","rtg_pre","rtg_post"))
str(new_df)## 'data.frame': 64 obs. of 6 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ state : chr "ON" "MI" "MI" "MI" ...
## $ name : chr "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" "PATRICK H SCHILLING" ...
## $ total_points: num 6 6 6 5.5 5.5 5 5 5 5 5 ...
## $ rtg_pre : num 1794 1553 1384 1716 1655 ...
## $ rtg_post : num 1817 1663 1640 1744 1690 ...
tmp2 <- left_join(rounds_tmp, new_df[c("id","rtg_pre")], by=c("opp_id" ="id"))
str(tmp2)## 'data.frame': 448 obs. of 5 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ round_no: chr "1" "1" "1" "1" ...
## $ outcome : chr "W" "W" "L" "W" ...
## $ opp_id : num 39 63 8 23 45 34 57 3 25 16 ...
## $ rtg_pre : num 1436 1175 1641 1363 1242 ...
Explore Tournament Outcome Results
How many rounds were played that resulted in a win/lose or draw in the tournament?
rounds_tmp %>%
filter(grepl("[WLD]",outcome)) %>%
count("outcome")## outcome freq
## 1 D 58
## 2 L 175
## 3 W 175
What percentage does this represent?
percent(length(which(grepl("[WLD]",tmp2$outcome)))/nrow(tmp2),2)## [1] "92%"
How many rounds were not played in the tournament?
length(which(grepl("[U]",tmp2$outcome)))## [1] 16
What percentage of total rounds does this represent?
percent(length(which(grepl("[U]",tmp2$outcome)))/nrow(tmp2),2)## [1] "4%"
How many games overall did not result in a Win/Lose/Draw and did not contribute to performance rating score?
rounds_tmp %>%
filter(grepl("[^WLD]",outcome)) %>%
count("outcome")## outcome freq
## 1 B 7
## 2 H 16
## 3 U 16
## 4 X 1
What percentage of non- Win/Lose/Draw of all rounds does this represent?
percent(length(which(grepl("[^WLD]",tmp2$outcome)))/nrow(tmp2),2)## [1] "8%"
Calculate Average Pre-Score Rating of All Opponents Each Contestant Faced
pre_avg <- aggregate( rtg_pre ~ id, tmp2, mean )
colnames(pre_avg) <- c("id","pre_rtg_avg")
pre_avg$pre_rtg_avg <- round(pre_avg$pre_rtg_avg, 0)
str(pre_avg)## 'data.frame': 64 obs. of 2 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ pre_rtg_avg: num 1605 1469 1564 1574 1501 ...
df_opponent_avg<- inner_join(new_df, pre_avg, on="id")Reshape Dataset to Show Only Requested Variables
final_df <- df_opponent_avg %>%
select(-c("id","rtg_post")) %>%
relocate(name, .before = state)Save Dataset as .csv file
write.csv(final_df, "C:\\Users\\cassi\\Documents\\DATA 607_Fall2021\\project1.csv",row.names = TRUE)Calculating ELO Performance
\[ELO = \text{ total of opponents ratings + 400 x (Wins - Losses) / Number of Games}\]
outcome_count<- tmp2 %>%
filter(grepl("[WLD]",outcome)) %>%
group_by(id,outcome)%>%
tally()
outcome_count <- pivot_wider(outcome_count, names_from = "outcome",values_from = "n")
outcome_count[is.na(outcome_count)] = 0
outcome_count <- outcome_count %>%
mutate(add_wins = W*400,
add_losses = L*-400,
num_games = D+L+W,
ELO_calc1 = add_wins + add_losses)
sum_rtg <- aggregate( rtg_pre ~ id, tmp2, sum )
colnames(sum_rtg) <- c("id","pre_rtg_sum")
sum_rtg$pre_rtg_sum <- round(sum_rtg$pre_rtg_sum, 0)
str(sum_rtg)## 'data.frame': 64 obs. of 2 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ pre_rtg_sum: num 11237 10285 10945 11015 10506 ...
elo_rtg_df <- inner_join(outcome_count,sum_rtg, on="id")
elo_rtg_df <- elo_rtg_df %>%
mutate(ELO_perf_rating = round((ELO_calc1+pre_rtg_sum)/num_games),0)Calculating FIDE Performance
\[FIDE = \text{ Opponents' Rating Average + Rating Difference}\] \[RatingsDifference (d_p)= \text{Lookup of }p\] \[Tournament Percentage Score (p) = \text{ Total Points Scored / Number of Games Played}\]
lookup <- read.csv("https://raw.githubusercontent.com/cassie-boylan/DATA-607/main/FIDE_fractional_score_rating_lookup.csv")fide_df <- left_join(df_opponent_avg,
outcome_count %>% dplyr::select(id, num_games),
by="id")
fide_df <- fide_df %>%
mutate(tourn_perc_score = round(total_points/num_games,2))fide_df <- left_join(fide_df,lookup, by = c("tourn_perc_score" = "p"))
fide_df <- fide_df %>%
mutate(fide_perf_rtg = pre_rtg_avg + dp)final_df_ec <- left_join(new_df,elo_rtg_df %>%
dplyr::select(id, ELO_perf_rating), by="id")
final_df_ec <- left_join(final_df_ec,fide_df %>%
dplyr::select(id, fide_perf_rtg), by="id")