Overview

In this project, we were given a text file with chess tournament results where the information had a cross-table structure. Our job was to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:

Here is an example of what the file looks like:

image text-file-example

Read in data

Load necessary packages

library(tidyverse)
df <- read_csv("https://raw.githubusercontent.com/justinm0rgan/data607/main/Projects/1/data/7645617.txt", show_col_types = F, skip = 1)

Create dataframe

Create empty data frame with correct data types and final structure.

df2 <- data.frame(matrix(ncol = 6, nrow = 64))
colnames(df2) <- c("num", "name", "state", "total_pts", "pre_rating", "avg_rating_opp")

Clean data

Steps to clean data:

1. Filter every 3rd row to exclude dashed line

df <- df %>% 
  filter((row_number() %% 3 == 0) | (row_number() %% 3 == !0))

2. Isolate rows with num, name and total_pts data.

Use regex to extract appropriate data for each column.

df_num_name_pts <-  df %>% 
  filter(row_number() %% 2 == 0)

head(df_num_name_pts)
  # A tibble: 6 × 1
    `Pair | Player Name                     |Total|Round|Round|Round|Round|Round…`
    <chr>                                                                         
  1 1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  1…
  2 2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  2…
  3 3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  1…
  4 4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  1…
  5 5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  1…
  6 6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  2…
df2$num <- df_num_name_pts$`Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round|` %>% 
  str_extract("^.{2}") %>% 
  trimws() %>% 
  as.numeric()
df2$name <- df_num_name_pts$`Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round|` %>% 
  str_extract("([A-Z]+\\s[A-Z]+) | ([A-Z]+\\s[A-Z]+\\s[A-Z]+)") %>% 
  trimws()
df2$total_pts <- df_num_name_pts$`Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round|` %>% 
  str_extract("\\d\\.\\d") %>% 
  as.numeric()

3. Isolate rows with state and pre_rating.

df_get_state_pre <- df[-1,] #delete first row
df_get_state_pre <- df_get_state_pre %>%   
  filter(row_number() %% 2 == 0)

head(df_get_state_pre)
  # A tibble: 6 × 1
    `Pair | Player Name                     |Total|Round|Round|Round|Round|Round…`
    <chr>                                                                         
  1 ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B  …
  2 MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W  …
  3 MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B  …
  4 MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B  …
  5 MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W  …
  6 OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W  …
df2$state <- df_get_state_pre$`Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round|` %>% 
  str_extract("[A-Z]+")
df2$pre_rating <- df_get_state_pre$`Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round|` %>% 
  str_extract("(\\d+).*?(\\d+)") %>% 
  str_sub(15,19) %>% 
  trimws() %>% 
  as.numeric()

4. Create data frame to calculate avg_rating_opp.

# extract rounds with opponent numbers
df_opp <- df_num_name_pts$`Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round|` %>% 
  str_extract_all("\\s\\d{1,2}.*", simplify = T) %>% 
  data.frame()

# create oppoenent df
df_opp <- read.csv(text=sub(
  "\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)$", 
  ",\\1,\\2,\\3,\\4,\\5,\\6", df_opp$.), 
        col.names=c('rnd1', 'rnd2', 'rnd3','rnd4','rnd5','rnd6','rnd7'), header=FALSE)

# set player id
df_opp$player_id <- 1:nrow(df_opp)

# rearrange
df_opp <- df_opp %>% 
  select(player_id, rnd1, rnd2, rnd3, rnd4, rnd5, rnd6, rnd7)

head(df_opp)
    player_id  rnd1 rnd2 rnd3 rnd4 rnd5 rnd6 rnd7
  1         1  39|W 21|W 18|W 14|W  7|D 12|D   4|
  2         2  63|W 58|L  4|W 17|W 16|W 20|W   7|
  3         3   8|W 61|W 25|W 21|W 11|W 13|W  12|
  4         4  23|D 28|W  2|W 26|D  5|W 19|D   1|
  5         5  45|W 37|D 12|D 13|D  4|W 14|W  17|
  6         6  34|D 29|L 11|W 35|D 10|W 27|W  21|
a. Extract numbers (opponent) from each column
# extract numbers from each column
df_opp <- df_opp %>% 
  as_tibble() %>% 
  mutate(across(.cols = rnd1:rnd7, .fns=str_extract, pattern = "\\d+"))

#  change dtype of columns to numeric
df_opp <- df_opp %>% 
  mutate(across(.cols = rnd1:rnd7, .fns = as.numeric))

head(df_opp)
  # A tibble: 6 × 8
    player_id  rnd1  rnd2  rnd3  rnd4  rnd5  rnd6  rnd7
        <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
  1         1    39    21    18    14     7    12     4
  2         2    63    58     4    17    16    20     7
  3         3     8    61    25    21    11    13    12
  4         4    23    28     2    26     5    19     1
  5         5    45    37    12    13     4    14    17
  6         6    34    29    11    35    10    27    21
b. Match pre-rating from main data frame to opponents data frame.
df_opp$rnd1_opp <- df2$pre_rating[match(df_opp$rnd1, df2$num)]
df_opp$rnd2_opp <- df2$pre_rating[match(df_opp$rnd2, df2$num)]
df_opp$rnd3_opp <- df2$pre_rating[match(df_opp$rnd3, df2$num)]
df_opp$rnd4_opp <- df2$pre_rating[match(df_opp$rnd4, df2$num)]
df_opp$rnd5_opp <- df2$pre_rating[match(df_opp$rnd5, df2$num)]
df_opp$rnd6_opp <- df2$pre_rating[match(df_opp$rnd6, df2$num)]
df_opp$rnd7_opp <- df2$pre_rating[match(df_opp$rnd7, df2$num)]
df_opp <- df_opp %>% 
  select(player_id, rnd1_opp:rnd7_opp)
c. Get row mean
df_opp$opp_mean <- df_opp[,2:8] %>% 
  rowMeans(na.rm = T) %>% 
  round(3)

head(df_opp)
  # A tibble: 6 × 9
    player_id rnd1_opp rnd2_opp rnd3_opp rnd4_opp rnd5_opp rnd6_opp rnd7_opp
        <int>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
  1         1     1436     1563     1600     1610     1649     1663     1716
  2         2     1175      917     1716     1629     1604     1595     1649
  3         3     1641      955     1745     1563     1712     1666     1663
  4         4     1363     1507     1553     1579     1655     1564     1794
  5         5     1242      980     1663     1666     1716     1610     1629
  6         6     1399     1602     1712     1438     1365     1552     1563
  # … with 1 more variable: opp_mean <dbl>

5. Look up Average Opponent Pre-Rating

df2$avg_rating_opp <-  df_opp$opp_mean[match(df_opp$player_id, df2$num)]

head(df2)
    num                name state total_pts pre_rating avg_rating_opp
  1   1            GARY HUA    ON       6.0       1794       1605.286
  2   2     DAKSHESH DARURI    MI       6.0       1553       1469.286
  3   3        ADITYA BAJAJ    MI       6.0       1384       1563.571
  4   4 PATRICK H SCHILLING    MI       5.5       1716       1573.571
  5   5          HANSHI ZUO    MI       5.5       1655       1500.857
  6   6         HANSEN SONG    OH       5.0       1686       1518.714

Export final product to csv

df2 %>% 
  write_csv('./data/cleaned_chess_data.csv')