• Load Packages
  • Load Text File
  • Introduction
  • Split Data by Separators
  • Assign Row IDs
  • Create New Columns and Extract Values using the Stringr Package
  • Create the Player Table
  • Tidy the Data by Pivoting Longer and Wider
  • Create the Rounds Table

Github Repository

Load Packages

library(tidyverse)
library(readr)
library(textdata)

Load Text File

txt_lines <- readLines("tournamentinfo.txt")
txt_data <- txt_lines[!grepl("^-+$", txt_lines)]

Introduction

In this report, a text file showing chess data is transformed so it may be used for future analysis. The data in the text file appears like this: Chess Data

Split Data by Separators

First, the data is split using the separators within the rows, and saved as a dataframe.

split_data <- strsplit(txt_data, "\\|")
df <- do.call(rbind, split_data) |> as.data.frame()

Assign Row IDs

Then, a column titled ID is calculated to create a unique ID for each data entry group, as organized in the text file.

df_v2 <- df |> mutate(ID =ceiling(row_number()/2))

Create New Columns and Extract Values using the Stringr Package

To begin the data transformation, regex statements are used to extract values.

df_v3 <- df_v2 |> 
  mutate(uscf_id = str_extract(V2,"\\d{8}"),
         rtg_pre = str_extract(V2, "R:\\s\\d+[P]\\d+|R:\\s\\d+"),
         rtg_post = str_extract(V2, "(->)\\d+[a-zA-Z]\\d+|(->)\\d+"),
         norm = str_extract(V3, "[N]:\\d+"),
         pair_id = str_extract(V1,"\\d+"),
         total_pts = str_extract(V3,"\\d.\\d"),
         player_state = str_extract(V1,"[A-Z]{2}"),
         rtg_pre = str_remove(rtg_pre,"R:"),
         rtg_post = str_remove(rtg_post,"(->)"),
         norm = str_remove(norm, "[N]:")
         ) |> 
  fill(pair_id,
       total_pts) |> 
  fill(rtg_pre,
       rtg_post,
       norm,
       player_state,
       .direction="up")

Create the Player Table

Then we create the first table, including the player information.

tbl1 <- df_v3 |> 
  select(V2,ID,uscf_id,pair_id,rtg_pre,rtg_post,total_pts,norm,player_state) |> 
  tail(-2) |> 
  mutate(player_name = if_else(is.na(uscf_id),V2,NA))

tbl1 <- tbl1 |> 
    fill(`player_name`,.direction="down") |> 
    fill(`uscf_id`,.direction="up") |> 
    relocate(`ID`,`player_name`,`uscf_id`,`rtg_pre`,`rtg_post`) |>
    select(-V2) |> 
    distinct() #remove duplicates
ID player_name uscf_id rtg_pre rtg_post pair_id total_pts norm player_state
2 GARY HUA 15445895 1794 1817 1 6.0 2 ON
3 DAKSHESH DARURI 14598900 1553 1663 2 6.0 2 MI
4 ADITYA BAJAJ 14959604 1384 1640 3 6.0 2 MI
5 PATRICK H SCHILLING 12616049 1716 1744 4 5.5 2 MI
6 HANSHI ZUO 14601533 1655 1690 5 5.5 2 MI
7 HANSEN SONG 15055204 1686 1687 6 5.0 3 OH

Tidy the Data by Pivoting Longer and Wider

To arrange the data so that the round information is clear, the data is pivoted longer, using regex expressions to extract values.

Then, the data is pivoted wider, to create a color and type column.

df_v4 <- df_v3 |> 
  fill(uscf_id, .direction="up") |> 
  rename("R_1" = "V4",
         "R_2" = "V5",
         "R_3" = "V6",
         "R_4" = "V7",
         "R_5" = "V8",
         "R_6" = "V9",
         "R_7" = "V10"
         ) |> 
  select(-ID, -V1,-V2,-V3,-rtg_pre,-rtg_post,-V11,-norm,-pair_id,-total_pts) |> 
  relocate(uscf_id)|> 
  tail(-2) |> 
  group_by(uscf_id) |>
  mutate(
    across(
      matches("^R_")
      ,~max(str_extract(.x,"[A-Z](?=\\s+\\d+)"), na.rm = TRUE),
      .names = "type_{.col}"
    ),
    across(
      matches("^R_")
      ,~max(str_extract(.x,"\\d+"), na.rm = TRUE),
      .names = "playerid_{.col}"
    ),
    across(
      matches("^R_")
      ,~max(str_extract(str_trim(.x),"^(B|W)$"), na.rm = TRUE),
      .names = "color_{.col}"
    )
  ) |>
  select(-matches("^R_\\d$")) |>
  distinct() |> 
  pivot_longer(
    cols = matches("_R_"),
    names_to = c("metric","round"),
    names_pattern = "(.*)_R_(.*)"
  )
  
df_v5 <- df_v4 |> 
  pivot_wider(
    id_cols = c(uscf_id,round), 
    names_from = metric,
    values_from = value
  ) |> 
  rename("opp_pair_id"="playerid")

Create the Rounds Table

Finally, a table is developed to show the data for each round.

tbl2 <- df_v5 |>
  mutate(pt = case_when(type == "D"~.5, # converting to points 
                        type== "W"~1,
                        type== "L"~0,
                        TRUE ~NA)
          ) |> 
  relocate(uscf_id,round,type,pt,opp_pair_id)
uscf_id round type pt opp_pair_id color
15445895 1 W 1.0 39 W
15445895 2 W 1.0 21 B
15445895 3 W 1.0 18 W
15445895 4 W 1.0 14 B
15445895 5 W 1.0 7 W
15445895 6 D 0.5 12 B
# select Player's Name, Player's State, Total Number of Pts, Player's Pre-Rating,
# Avg Pre-Chess Rating of Opponent for 1st player matched they were matched with
avg_opp_rating <- left_join(
  tbl2,
  mutate(
    tbl1,
    pair_id,
    rtg_pre_num = as.numeric(str_extract(rtg_pre, "^ \\d+")),
    .keep = "used"
  ),
  by=c("opp_pair_id"="pair_id")
) |>
  group_by(uscf_id) |>
  summarise(
    total_opponents = sum(!is.na(rtg_pre_num), na.rm = TRUE),
    mean_opp_rtg = round(mean(rtg_pre_num, na.rm = TRUE),digits=0)
  ) |>
  arrange(desc(mean_opp_rtg)) |> 
  left_join(
    tbl1,
    by="uscf_id"
  ) |> 
  arrange(ID) |> 
  select(player_name,player_state,total_pts,rtg_pre,mean_opp_rtg)

knitr::kable(head(avg_opp_rating, format ="markdown"))
player_name player_state total_pts rtg_pre mean_opp_rtg
GARY HUA ON 6.0 1794 1605
DAKSHESH DARURI MI 6.0 1553 1557
ADITYA BAJAJ MI 6.0 1384 1646
PATRICK H SCHILLING MI 5.5 1716 1574
HANSHI ZUO MI 5.5 1655 1564
HANSEN SONG OH 5.0 1686 1519
write_csv(avg_opp_rating,"output.csv")