Create an R markdown file that converts a structured text file into a .csv, containing the following:
Player’s Name
Player’s State
Total Number of Points
Player’s Pre-Rating
Average Pre-Chess Rating of Opponents
For the first player, the line should read:
Gary Hua, ON, 6.0, 1794, 1605
Download the raw text file from Github.
# Fetch the raw tournament results text from Github into `txtfile`
txtfile <- getURL(
  url = "https://raw.githubusercontent.com/mmippolito/cuny/main/data607/project1/tournamentinfo.txt"
)
Extract each player's record, which spans two lines. The raw data will look like this:
1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
# Build a single regex that captures one complete player record. Each record
# spans two consecutive lines of the file, for example:
#
# 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
# ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
#
# Two quirks of the data that the pattern allows for:
# - a round can be a bye or half-bye instead of a win/loss/draw, in which case
#   there is no opponent number, like this: |B |
# - the pre-performance rating can be followed by a provisional rating,
#   e.g. 12531685 / R: 1291P12->1259P17
#
re <- str_c(
  # First line: player number, name, total points and the seven round results
  " +(\\d+) \\| ([A-Z\\- ]+) +\\|([\\d\\.]+) +\\|(([A-Z] +(\\d+)?\\|){7}) *[\\r\\n]+",
  # Second line: two-letter state and pre-performance rating
  " +([A-Z]{2}) +\\| +\\d+ \\/ R: +(\\d+)[P ].+[\\r\\n]+"
)
matches <- str_match_all(txtfile, re)
Walk through the matches, which form a matrix with one row per player and one column per regex capture group, like this:
[,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] “1” “GARY HUA ” “6.0” “W 39|W 21|W 18|W 14|W 7|D 12|D 4|” “D 4|” “4” “ON” “1794”
[2,] “2” “DAKSHESH DARURI ” “6.0” “W 63|W 58|L 4|W 17|W 16|W 20|W 7|” “W 7|” “7” “MI” “1553”
[3,] “3” “ADITYA BAJAJ ” “6.0” “L 8|W 61|W 25|W 21|W 11|W 13|W 12|” “W 12|” “12” “MI” “1384”
Further parsing is needed on the “opponent” field. Each line will look like this:
W 39|W 21|W 18|W 14|W 7|D 12|D 4|
and this code will make it look like a vector:
[1] “39” “21” “18” “14” “7” “12” “4”
# Pull each captured field out of the match matrix (column 1 is the full match,
# so capture group k lives in column k + 1).

# Player IDs
ids <- as.integer(matches[[1]][, 2])
# Strip whitespace from player name and change to title case
players <- str_to_title(str_trim(string = matches[[1]][, 3]))
# Points
pts <- as.numeric(matches[[1]][, 4])
# States
states <- matches[[1]][, 8]
# Pre-performance rating
preperf <- as.integer(matches[[1]][, 9])

# Further parse match results; each line will look like this:
# W 39|W 21|W 18|W 14|W 7|D 12|D 4|
# The result is one 7 x 3 character matrix per player:
# column 1 = full match, column 2 = result code, column 3 = opponent ID
# (NA when the round had no opponent, e.g. a bye).
tmp_opp <- str_match_all(string = matches[[1]][, 5], pattern = "([A-Z]) +(\\d+)?\\|")

# Create 7 lists of opponent IDs, 7 of opponent results and 7 of scores, one
# per round. Matrices are stored column by column, so for round i:
#   element [i + 7]  is the result code (second column, elements 8 to 14)
#   element [i + 14] is the opponent ID (third column, elements 15 to 21)
# vector("list", 7) really preallocates seven slots; list(7) would instead
# create a one-element list holding the number 7.
results <- vector("list", 7)
opponents <- vector("list", 7)
scores <- vector("list", 7)
for (i in 1:7) {
  # vapply() guarantees a character vector even if there are no players
  results[[i]] <- vapply(tmp_opp, "[[", character(1), i + 7)
  opponents[[i]] <- as.integer(vapply(tmp_opp, "[[", character(1), i + 14))
  # Points earned in the round for each result code; any code not listed
  # here yields NA
  scores[[i]] <- case_when(
    results[[i]] == "W" ~ 1,
    results[[i]] == "D" ~ 0.5,
    results[[i]] == "L" ~ 0,
    results[[i]] == "B" ~ 1,
    results[[i]] == "H" ~ 0.5,
    results[[i]] == "U" ~ 0,
    results[[i]] == "X" ~ 1,
    results[[i]] == "F" ~ 0
  )
}
Now that all the variables are populated, create a dataframe out of them.
# Create main dataframe from variables. Each of the three 7-element lists
# expands into 7 columns, giving 5 + 21 = 26 columns in total.
df <- data.frame(id = ids, player = players, state = states, pts = pts, prerating = preperf,
                 opp = opponents, opp_result = results, score = scores)

# Rename the expanded columns in one step: columns 6-12 hold each opponent's
# player id, 13-19 the result against that opponent, and 20-26 the score
# against that opponent.
names(df)[6:26] <- c(str_c("opp", 1:7), str_c("result", 1:7), str_c("score", 1:7))

# Show the first few lines of the dataframe (first ten columns only).
# Note that df[1:10] on its own selects columns, not rows.
head(df[1:10], 10)
With the dataframe created and populated with variables, now comes the real excitement. That’s right: calculating the mean of each player’s opponents’ pre-tournament ratings.
# Create a slimmed-down data frame with only player ID and prerating; it
# serves as the lookup table from player ID to pre-tournament rating
dfslim <- select(df, id, prerating)

# Make a copy of the original data frame, only selecting player ID and the
# opponents he/she played against (opp1 .. opp7)
dftmp <- df %>% select(id, starts_with("opp"))

# Replace every opponent ID with that opponent's pre-tournament rating.
# match() gives NA for rounds without an opponent, so those ratings stay NA
# and are left out of the average below.
opp_preratings <- as.data.frame(lapply(
  select(dftmp, -id),
  function(opp_id) dfslim$prerating[match(opp_id, dfslim$id)]
))

# Compute pre-tournament average over the rounds actually played, rounded to
# the nearest whole rating point (as.integer() alone would truncate, e.g.
# 1563.57 -> 1563). A player with no opponents at all gets NA.
dftmp <- data.frame(
  id = dftmp$id,
  opp_prerating_avg = as.integer(round(rowMeans(opp_preratings, na.rm = TRUE)))
)

# Join the new dataframe that includes the prerating average back to the original data frame
dfnew <- df %>% merge(y = dftmp, by = "id")

# Show the summary columns of the new dataframe for every player
dfnew %>% select(player, state, pts, prerating, opp_prerating_avg)
Export the transformed data frame to CSV.
# Write the five summary columns to project1-out.csv, without quoting values
# and without a row-name column
write.csv(
  select(dfnew, player, state, pts, prerating, opp_prerating_avg),
  file = "project1-out.csv",
  quote = FALSE,
  row.names = FALSE
)
# Add two columns to dfnew:
# - exp_score: 7 * expected score, computed from the player's own pre-rating
#   and the average pre-tournament rating of his/her opponents
# - actual_score: sum of the seven per-round scores; this should exactly match
#   the "pts" column
dfnew <- dfnew %>%
  mutate(
    exp_score = 7 * (10^(prerating / 400)) /
      (10^(prerating / 400) + 10^(opp_prerating_avg / 400)),
    actual_score = score1 + score2 + score3 + score4 + score5 + score6 + score7
  )

# Show expected and actual scores side by side
dfnew %>% select(player, state, pts, exp_score, actual_score)
Evaluate which players did the best in the tournament, comparing actual score against expected score.
# Calculate difference between actual and expected scores; a positive value
# means the player scored more points than expected
dfnew <- dfnew %>% mutate(score_diff = (pts - exp_score))

# Rank all players from largest to smallest score difference
top_performers <- dfnew %>%
  select(player, state, pts, exp_score, score_diff) %>%
  arrange(desc(score_diff))

# Display the top ten performers. head() returns only the rows that exist,
# whereas [1:10, ] would pad with all-NA rows if there were fewer than ten.
head(top_performers, 10)
As shown, Aditya Bajaj of Michigan was the player who most exceeded expectations.
…