Assignment

Load Packages

Load Text File

Introduction

Split Data by Separators

Assign Row IDs

Create New Columns and Extract Values using the Stringr Package

Create the Player Table

Tidy the Data by Pivoting Longer and Wider

Create the Rounds Table

Github Repository

Load Packages

library(tidyverse)
library(readr)
library(textdata)

Load Text File

txt_lines <- readLines("tournamentinfo.txt")
txt_data <- txt_lines[!grepl("^-+$", txt_lines)]

Introduction

In this report, a text file showing chess data is transformed so it may be used for future analysis. The data in the text file appears like this:

Split Data by Separators

First, the data is split using the separators within the rows, and saved as a dataframe.

split_data <- strsplit(txt_data, "\\|")
df <- do.call(rbind, split_data) |> as.data.frame()

Assign Row IDs

Then, a column titled ID is calculated to create a unique ID for each data entry group, as organized in the text file.

df_v2 <- df |> mutate(ID =ceiling(row_number()/2))

Create New Columns and Extract Values using the Stringr Package

To begin the data transformation, regex statements are used to extract values.

df_v3 <- df_v2 |> 
  mutate(uscf_id = str_extract(V2,"\\d{8}"),
         rtg_pre = str_extract(V2, "R:\\s\\d+[P]\\d+|R:\\s\\d+"),
         rtg_post = str_extract(V2, "(->)\\d+[a-zA-Z]\\d+|(->)\\d+"),
         norm = str_extract(V3, "[N]:\\d+"),
         pair_id = str_extract(V1,"\\d+"),
         total_pts = str_extract(V3,"\\d.\\d"),
         player_state = str_extract(V1,"[A-Z]{2}"),
         rtg_pre = str_remove(rtg_pre,"R:"),
         rtg_post = str_remove(rtg_post,"(->)"),
         norm = str_remove(norm, "[N]:")
         ) |> 
  fill(pair_id,
       total_pts) |> 
  fill(rtg_pre,
       rtg_post,
       norm,
       player_state,
       .direction="up")

Create the Player Table

Then we create the first table, including the player information.

tbl1 <- df_v3 |> 
  select(V2,ID,uscf_id,pair_id,rtg_pre,rtg_post,total_pts,norm,player_state) |> 
  tail(-2) |> 
  mutate(player_name = if_else(is.na(uscf_id),V2,NA))

tbl1 <- tbl1 |> 
    fill(`player_name`,.direction="down") |> 
    fill(`uscf_id`,.direction="up") |> 
    relocate(`ID`,`player_name`,`uscf_id`,`rtg_pre`,`rtg_post`) |>
    select(-V2) |> 
    distinct() #remove duplicates

ID	player_name	uscf_id	rtg_pre	rtg_post	pair_id	total_pts	norm	player_state
2	GARY HUA	15445895	1794	1817	1	6.0	2	ON
3	DAKSHESH DARURI	14598900	1553	1663	2	6.0	2	MI
4	ADITYA BAJAJ	14959604	1384	1640	3	6.0	2	MI
5	PATRICK H SCHILLING	12616049	1716	1744	4	5.5	2	MI
6	HANSHI ZUO	14601533	1655	1690	5	5.5	2	MI
7	HANSEN SONG	15055204	1686	1687	6	5.0	3	OH

Tidy the Data by Pivoting Longer and Wider

To arrange the data so that the round information is clear, the data is pivoted longer, using regex expressions to extract values.

Then, the data is pivoted wider, to create a color and type column.

df_v4 <- df_v3 |> 
  fill(uscf_id, .direction="up") |> 
  rename("R_1" = "V4",
         "R_2" = "V5",
         "R_3" = "V6",
         "R_4" = "V7",
         "R_5" = "V8",
         "R_6" = "V9",
         "R_7" = "V10"
         ) |> 
  select(-ID, -V1,-V2,-V3,-rtg_pre,-rtg_post,-V11,-norm,-pair_id,-total_pts) |> 
  relocate(uscf_id)|> 
  tail(-2) |> 
  group_by(uscf_id) |>
  mutate(
    across(
      matches("^R_")
      ,~max(str_extract(.x,"[A-Z](?=\\s+\\d+)"), na.rm = TRUE),
      .names = "type_{.col}"
    ),
    across(
      matches("^R_")
      ,~max(str_extract(.x,"\\d+"), na.rm = TRUE),
      .names = "playerid_{.col}"
    ),
    across(
      matches("^R_")
      ,~max(str_extract(str_trim(.x),"^(B|W)$"), na.rm = TRUE),
      .names = "color_{.col}"
    )
  ) |>
  select(-matches("^R_\\d$")) |>
  distinct() |> 
  pivot_longer(
    cols = matches("_R_"),
    names_to = c("metric","round"),
    names_pattern = "(.*)_R_(.*)"
  )
  
df_v5 <- df_v4 |> 
  pivot_wider(
    id_cols = c(uscf_id,round), 
    names_from = metric,
    values_from = value
  ) |> 
  rename("opp_pair_id"="playerid")

Create the Rounds Table

Finally, a table is developed to show the data for each round.

tbl2 <- df_v5 |>
  mutate(pt = case_when(type == "D"~.5, # converting to points 
                        type== "W"~1,
                        type== "L"~0,
                        TRUE ~NA)
          ) |> 
  relocate(uscf_id,round,type,pt,opp_pair_id)

uscf_id	round	type	pt	opp_pair_id	color
15445895	1	W	1.0	39	W
15445895	2	W	1.0	21	B
15445895	3	W	1.0	18	W
15445895	4	W	1.0	14	B
15445895	5	W	1.0	7	W
15445895	6	D	0.5	12	B

# select Player's Name, Player's State, Total Number of Pts, Player's Pre-Rating,
# Avg Pre-Chess Rating of Opponent for 1st player matched they were matched with
avg_opp_rating <- left_join(
  tbl2,
  mutate(
    tbl1,
    pair_id,
    rtg_pre_num = as.numeric(str_extract(rtg_pre, "^ \\d+")),
    .keep = "used"
  ),
  by=c("opp_pair_id"="pair_id")
) |>
  group_by(uscf_id) |>
  summarise(
    total_opponents = sum(!is.na(rtg_pre_num), na.rm = TRUE),
    mean_opp_rtg = round(mean(rtg_pre_num, na.rm = TRUE),digits=0)
  ) |>
  arrange(desc(mean_opp_rtg)) |> 
  left_join(
    tbl1,
    by="uscf_id"
  ) |> 
  arrange(ID) |> 
  select(player_name,player_state,total_pts,rtg_pre,mean_opp_rtg)

knitr::kable(head(avg_opp_rating, format ="markdown"))

player_name	player_state	total_pts	rtg_pre	mean_opp_rtg
GARY HUA	ON	6.0	1794	1605
DAKSHESH DARURI	MI	6.0	1553	1557
ADITYA BAJAJ	MI	6.0	1384	1646
PATRICK H SCHILLING	MI	5.5	1716	1574
HANSHI ZUO	MI	5.5	1655	1564
HANSEN SONG	OH	5.0	1686	1519

write_csv(avg_opp_rating,"output.csv")

Assignment_5

Alyssa Gurkas

2025-03-19