library(tidyverse)
library(readr)
library(textdata)
txt_lines <- readLines("tournamentinfo.txt")
txt_data <- txt_lines[!grepl("^-+$", txt_lines)]
In this report, a text file showing chess data is transformed so it
may be used for future analysis. The data in the text file appears like
this:
First, the data is split using the separators within the rows, and saved as a dataframe.
split_data <- strsplit(txt_data, "\\|")
df <- do.call(rbind, split_data) |> as.data.frame()
Then, a column titled ID is calculated to create a unique ID for each data entry group, as organized in the text file.
df_v2 <- df |> mutate(ID =ceiling(row_number()/2))
To begin the data transformation, regex statements are used to extract values.
df_v3 <- df_v2 |>
mutate(uscf_id = str_extract(V2,"\\d{8}"),
rtg_pre = str_extract(V2, "R:\\s\\d+[P]\\d+|R:\\s\\d+"),
rtg_post = str_extract(V2, "(->)\\d+[a-zA-Z]\\d+|(->)\\d+"),
norm = str_extract(V3, "[N]:\\d+"),
pair_id = str_extract(V1,"\\d+"),
total_pts = str_extract(V3,"\\d.\\d"),
player_state = str_extract(V1,"[A-Z]{2}"),
rtg_pre = str_remove(rtg_pre,"R:"),
rtg_post = str_remove(rtg_post,"(->)"),
norm = str_remove(norm, "[N]:")
) |>
fill(pair_id,
total_pts) |>
fill(rtg_pre,
rtg_post,
norm,
player_state,
.direction="up")
Then we create the first table, including the player information.
tbl1 <- df_v3 |>
select(V2,ID,uscf_id,pair_id,rtg_pre,rtg_post,total_pts,norm,player_state) |>
tail(-2) |>
mutate(player_name = if_else(is.na(uscf_id),V2,NA))
tbl1 <- tbl1 |>
fill(`player_name`,.direction="down") |>
fill(`uscf_id`,.direction="up") |>
relocate(`ID`,`player_name`,`uscf_id`,`rtg_pre`,`rtg_post`) |>
select(-V2) |>
distinct() #remove duplicates
ID | player_name | uscf_id | rtg_pre | rtg_post | pair_id | total_pts | norm | player_state |
---|---|---|---|---|---|---|---|---|
2 | GARY HUA | 15445895 | 1794 | 1817 | 1 | 6.0 | 2 | ON |
3 | DAKSHESH DARURI | 14598900 | 1553 | 1663 | 2 | 6.0 | 2 | MI |
4 | ADITYA BAJAJ | 14959604 | 1384 | 1640 | 3 | 6.0 | 2 | MI |
5 | PATRICK H SCHILLING | 12616049 | 1716 | 1744 | 4 | 5.5 | 2 | MI |
6 | HANSHI ZUO | 14601533 | 1655 | 1690 | 5 | 5.5 | 2 | MI |
7 | HANSEN SONG | 15055204 | 1686 | 1687 | 6 | 5.0 | 3 | OH |
To arrange the data so that the round information is clear, the data is pivoted longer, using regex expressions to extract values.
Then, the data is pivoted wider, to create a color and type column.
df_v4 <- df_v3 |>
fill(uscf_id, .direction="up") |>
rename("R_1" = "V4",
"R_2" = "V5",
"R_3" = "V6",
"R_4" = "V7",
"R_5" = "V8",
"R_6" = "V9",
"R_7" = "V10"
) |>
select(-ID, -V1,-V2,-V3,-rtg_pre,-rtg_post,-V11,-norm,-pair_id,-total_pts) |>
relocate(uscf_id)|>
tail(-2) |>
group_by(uscf_id) |>
mutate(
across(
matches("^R_")
,~max(str_extract(.x,"[A-Z](?=\\s+\\d+)"), na.rm = TRUE),
.names = "type_{.col}"
),
across(
matches("^R_")
,~max(str_extract(.x,"\\d+"), na.rm = TRUE),
.names = "playerid_{.col}"
),
across(
matches("^R_")
,~max(str_extract(str_trim(.x),"^(B|W)$"), na.rm = TRUE),
.names = "color_{.col}"
)
) |>
select(-matches("^R_\\d$")) |>
distinct() |>
pivot_longer(
cols = matches("_R_"),
names_to = c("metric","round"),
names_pattern = "(.*)_R_(.*)"
)
df_v5 <- df_v4 |>
pivot_wider(
id_cols = c(uscf_id,round),
names_from = metric,
values_from = value
) |>
rename("opp_pair_id"="playerid")
Finally, a table is developed to show the data for each round.
tbl2 <- df_v5 |>
mutate(pt = case_when(type == "D"~.5, # converting to points
type== "W"~1,
type== "L"~0,
TRUE ~NA)
) |>
relocate(uscf_id,round,type,pt,opp_pair_id)
uscf_id | round | type | pt | opp_pair_id | color |
---|---|---|---|---|---|
15445895 | 1 | W | 1.0 | 39 | W |
15445895 | 2 | W | 1.0 | 21 | B |
15445895 | 3 | W | 1.0 | 18 | W |
15445895 | 4 | W | 1.0 | 14 | B |
15445895 | 5 | W | 1.0 | 7 | W |
15445895 | 6 | D | 0.5 | 12 | B |
# select Player's Name, Player's State, Total Number of Pts, Player's Pre-Rating,
# Avg Pre-Chess Rating of Opponent for 1st player matched they were matched with
avg_opp_rating <- left_join(
tbl2,
mutate(
tbl1,
pair_id,
rtg_pre_num = as.numeric(str_extract(rtg_pre, "^ \\d+")),
.keep = "used"
),
by=c("opp_pair_id"="pair_id")
) |>
group_by(uscf_id) |>
summarise(
total_opponents = sum(!is.na(rtg_pre_num), na.rm = TRUE),
mean_opp_rtg = round(mean(rtg_pre_num, na.rm = TRUE),digits=0)
) |>
arrange(desc(mean_opp_rtg)) |>
left_join(
tbl1,
by="uscf_id"
) |>
arrange(ID) |>
select(player_name,player_state,total_pts,rtg_pre,mean_opp_rtg)
knitr::kable(head(avg_opp_rating, format ="markdown"))
player_name | player_state | total_pts | rtg_pre | mean_opp_rtg |
---|---|---|---|---|
GARY HUA | ON | 6.0 | 1794 | 1605 |
DAKSHESH DARURI | MI | 6.0 | 1553 | 1557 |
ADITYA BAJAJ | MI | 6.0 | 1384 | 1646 |
PATRICK H SCHILLING | MI | 5.5 | 1716 | 1574 |
HANSHI ZUO | MI | 5.5 | 1655 | 1564 |
HANSEN SONG | OH | 5.0 | 1686 | 1519 |
write_csv(avg_opp_rating,"output.csv")