Overview

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:

Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

Load the libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)

Read the data

df = read.table(url('https://raw.githubusercontent.com/gillianmcgovern0/cuny-data-607/refs/heads/main/tournamentinfo.txt'), sep='\t', skip = 1) # skip reading the first line since it only contains dashes
head(df, 10)
##                                                                                            V1
## 1   Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 2   Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 3   -----------------------------------------------------------------------------------------
## 4       1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 5      ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 6   -----------------------------------------------------------------------------------------
## 7       2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## 8      MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## 9   -----------------------------------------------------------------------------------------
## 10      3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|

Cleaning, Transforming and Tidying the Data

There are many things wrong with this messy data frame. The first thing is that all the data is in one column, and there are special characters such as \ and -. So to make it easier to read, let’s remove the special characters and add columns:

df <- df %>%
  filter(V1 != "-----------------------------------------------------------------------------------------") %>%
  separate(V1, c('pair', 'player_name', 'total', 'round_1', 'round_2', 'round_3', 'round_4', 'round_5', 'round_6', 'round_7'), sep="\\|")
## Warning: Expected 10 pieces. Additional pieces discarded in 130 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
df <- df[-c(1:2),] # remove the first 2 rows since they're old headers
head(df, 10)
##      pair                       player_name total round_1 round_2 round_3
## 3      1   GARY HUA                         6.0     W  39   W  21   W  18
## 4     ON   15445895 / R: 1794   ->1817      N:2     W       B       W    
## 5      2   DAKSHESH DARURI                  6.0     W  63   W  58   L   4
## 6     MI   14598900 / R: 1553   ->1663      N:2     B       W       B    
## 7      3   ADITYA BAJAJ                     6.0     L   8   W  61   W  25
## 8     MI   14959604 / R: 1384   ->1640      N:2     W       B       W    
## 9      4   PATRICK H SCHILLING              5.5     W  23   D  28   W   2
## 10    MI   12616049 / R: 1716   ->1744      N:2     W       B       W    
## 11     5   HANSHI ZUO                       5.5     W  45   W  37   D  12
## 12    MI   14601533 / R: 1655   ->1690      N:2     B       W       B    
##    round_4 round_5 round_6 round_7
## 3    W  14   W   7   D  12   D   4
## 4    B       W       B       W    
## 5    W  17   W  16   W  20   W   7
## 6    W       B       W       B    
## 7    W  21   W  11   W  13   W  12
## 8    B       W       B       W    
## 9    W  26   D   5   W  19   D   1
## 10   B       W       B       B    
## 11   D  13   D   4   W  14   W  17
## 12   W       B       W       B

The data frame is now looking better, but there are 2 rows devoted to each player, and the values of the 2nd of those 2 rows corresponds to different variables than the ones we currently have set up.

Let’s give a new player_id column to correspond each row with a player:

# Identify rows that correspond to a single player by adding a new `player_id` variable
amount_of_players <- nrow(df) / 2
df$player_id <- rep(1:amount_of_players, each = 2)
head(df, 10)
##      pair                       player_name total round_1 round_2 round_3
## 3      1   GARY HUA                         6.0     W  39   W  21   W  18
## 4     ON   15445895 / R: 1794   ->1817      N:2     W       B       W    
## 5      2   DAKSHESH DARURI                  6.0     W  63   W  58   L   4
## 6     MI   14598900 / R: 1553   ->1663      N:2     B       W       B    
## 7      3   ADITYA BAJAJ                     6.0     L   8   W  61   W  25
## 8     MI   14959604 / R: 1384   ->1640      N:2     W       B       W    
## 9      4   PATRICK H SCHILLING              5.5     W  23   D  28   W   2
## 10    MI   12616049 / R: 1716   ->1744      N:2     W       B       W    
## 11     5   HANSHI ZUO                       5.5     W  45   W  37   D  12
## 12    MI   14601533 / R: 1655   ->1690      N:2     B       W       B    
##    round_4 round_5 round_6 round_7 player_id
## 3    W  14   W   7   D  12   D   4         1
## 4    B       W       B       W             1
## 5    W  17   W  16   W  20   W   7         2
## 6    W       B       W       B             2
## 7    W  21   W  11   W  13   W  12         3
## 8    B       W       B       W             3
## 9    W  26   D   5   W  19   D   1         4
## 10   B       W       B       B             4
## 11   D  13   D   4   W  14   W  17         5
## 12   W       B       W       B             5

A tidy data frame contains one observation for each row, so let’s apply that by adding on the even rows as new columns to the odd rows ( the main row for each player):

# Using `player_id`, turn the 2nd row for each player, and add it on to the 1st row as new variables
df2 <- df %>% 
   group_by(player_id) %>%
   mutate(row_number = row_number()) %>% # use row number to make creating the new variable names easier
   ungroup %>%
   pivot_wider(names_from = row_number, values_from = c(pair, player_name, total, round_1, round_2, round_3, round_4, round_5, round_6, round_7))
head(df2, 10)
## # A tibble: 10 × 21
##    player_id pair_1 pair_2 player_name_1 player_name_2 total_1 total_2 round_1_1
##        <int> <chr>  <chr>  <chr>         <chr>         <chr>   <chr>   <chr>    
##  1         1 "    … "   O… " GARY HUA  … " 15445895 /… "6.0  " "N:2  " W  39    
##  2         2 "    … "   M… " DAKSHESH D… " 14598900 /… "6.0  " "N:2  " W  63    
##  3         3 "    … "   M… " ADITYA BAJ… " 14959604 /… "6.0  " "N:2  " L   8    
##  4         4 "    … "   M… " PATRICK H … " 12616049 /… "5.5  " "N:2  " W  23    
##  5         5 "    … "   M… " HANSHI ZUO… " 14601533 /… "5.5  " "N:2  " W  45    
##  6         6 "    … "   O… " HANSEN SON… " 15055204 /… "5.0  " "N:3  " W  34    
##  7         7 "    … "   M… " GARY DEE S… " 11146376 /… "5.0  " "N:3  " W  57    
##  8         8 "    … "   M… " EZEKIEL HO… " 15142253 /… "5.0  " "N:3  " W   3    
##  9         9 "    … "   O… " STEFANO LE… " 14954524 /… "5.0  " "N:2  " W  25    
## 10        10 "   1… "   M… " ANVIT RAO … " 14150362 /… "5.0  " "N:3  " D  16    
## # ℹ 13 more variables: round_1_2 <chr>, round_2_1 <chr>, round_2_2 <chr>,
## #   round_3_1 <chr>, round_3_2 <chr>, round_4_1 <chr>, round_4_2 <chr>,
## #   round_5_1 <chr>, round_5_2 <chr>, round_6_1 <chr>, round_6_2 <chr>,
## #   round_7_1 <chr>, round_7_2 <chr>

Now each observation is a row, but now the data frame is pretty large/wide. We know what we want for the final data frame, so let’s first get rid of the data we know we won’t need:

# Remove columns we won't need for this data frame to make things simpler
cols_to_delete <- c("pair_1","total_2", "round_1_2", "round_2_2", "round_3_2", "round_4_2", "round_5_2", "round_6_2", "round_7_2", "round_3_2")
df3 <- df2[ , !(names(df2) %in% cols_to_delete)]
head(df3, 10)
## # A tibble: 10 × 12
##    player_id pair_2   player_name_1    player_name_2 total_1 round_1_1 round_2_1
##        <int> <chr>    <chr>            <chr>         <chr>   <chr>     <chr>    
##  1         1 "   ON " " GARY HUA     … " 15445895 /… "6.0  " W  39     W  21    
##  2         2 "   MI " " DAKSHESH DARU… " 14598900 /… "6.0  " W  63     W  58    
##  3         3 "   MI " " ADITYA BAJAJ … " 14959604 /… "6.0  " L   8     W  61    
##  4         4 "   MI " " PATRICK H SCH… " 12616049 /… "5.5  " W  23     D  28    
##  5         5 "   MI " " HANSHI ZUO   … " 14601533 /… "5.5  " W  45     W  37    
##  6         6 "   OH " " HANSEN SONG  … " 15055204 /… "5.0  " W  34     D  29    
##  7         7 "   MI " " GARY DEE SWAT… " 11146376 /… "5.0  " W  57     W  46    
##  8         8 "   MI " " EZEKIEL HOUGH… " 15142253 /… "5.0  " W   3     W  32    
##  9         9 "   ON " " STEFANO LEE  … " 14954524 /… "5.0  " W  25     L  18    
## 10        10 "   MI " " ANVIT RAO    … " 14150362 /… "5.0  " D  16     L  19    
## # ℹ 5 more variables: round_3_1 <chr>, round_4_1 <chr>, round_5_1 <chr>,
## #   round_6_1 <chr>, round_7_1 <chr>

This data frame is still untidy because each cell does not represent a single value, particularly the column named player_name_2. We need the pre-rating from this column, which we know is the 2nd number listed in the string. So let’s use regex to grab that 2nd number and add it as a new column called pre_rating:

df4 <- df3 %>%
  mutate(pre_rating = str_extract_all(player_name_2, "[0-9]+", simplify = TRUE)[,2])
df4 <- df4[,-4] # Remove `player_name_2` since we extracted all the necessary data
head(df4, 10)
## # A tibble: 10 × 12
##    player_id pair_2   player_name_1        total_1 round_1_1 round_2_1 round_3_1
##        <int> <chr>    <chr>                <chr>   <chr>     <chr>     <chr>    
##  1         1 "   ON " " GARY HUA         … "6.0  " W  39     W  21     W  18    
##  2         2 "   MI " " DAKSHESH DARURI  … "6.0  " W  63     W  58     L   4    
##  3         3 "   MI " " ADITYA BAJAJ     … "6.0  " L   8     W  61     W  25    
##  4         4 "   MI " " PATRICK H SCHILLI… "5.5  " W  23     D  28     W   2    
##  5         5 "   MI " " HANSHI ZUO       … "5.5  " W  45     W  37     D  12    
##  6         6 "   OH " " HANSEN SONG      … "5.0  " W  34     D  29     L  11    
##  7         7 "   MI " " GARY DEE SWATHELL… "5.0  " W  57     W  46     W  13    
##  8         8 "   MI " " EZEKIEL HOUGHTON … "5.0  " W   3     W  32     L  14    
##  9         9 "   ON " " STEFANO LEE      … "5.0  " W  25     L  18     W  59    
## 10        10 "   MI " " ANVIT RAO        … "5.0  " D  16     L  19     W  55    
## # ℹ 5 more variables: round_4_1 <chr>, round_5_1 <chr>, round_6_1 <chr>,
## #   round_7_1 <chr>, pre_rating <chr>

Let’s continue making this data frame tidy by ensuring each cell contains a single value. The last columns we need to fix are the ones that contain the information if the player won or lost the round and the “player ID” of the player’s opponent. Let’s create new variables for both of these values. Additionally, we won’t need the information if the player won or lost for this project, so let’s get rid of those columns to make things simpler:

# Separate the rows containing 2 values into separate variables to improve tidiness
df5 <- df4 %>% 
  separate(round_1_1, c('round_1_final', 'round_1_opp'), sep="\\s+") %>%
  separate(round_2_1, c('round_2_final', 'round_2_opp'), sep="\\s+") %>%
  separate(round_3_1, c('round_3_final', 'round_3_opp'), sep="\\s+") %>%
  separate(round_4_1, c('round_4_final', 'round_4_opp'), sep="\\s+") %>%
  separate(round_5_1, c('round_5_final', 'round_5_opp'), sep="\\s+") %>%
  separate(round_6_1, c('round_6_final', 'round_6_opp'), sep="\\s+") %>%
  separate(round_7_1, c('round_7_final', 'round_7_opp'), sep="\\s+")
cols_to_delete <- c("round_1_final","round_2_final", "round_3_final", "round_4_final", "round_5_final", "round_6_final", "round_7_final") # we don't need this data for this project
df5 <- df5[ , !(names(df5) %in% cols_to_delete)]
df4 <- df4[,-4] # Remove the original column bc we extracted the necessary data
head(df5, 10)
## # A tibble: 10 × 12
##    player_id pair_2   player_name_1  total_1 round_1_opp round_2_opp round_3_opp
##        <int> <chr>    <chr>          <chr>   <chr>       <chr>       <chr>      
##  1         1 "   ON " " GARY HUA   … "6.0  " 39          21          18         
##  2         2 "   MI " " DAKSHESH DA… "6.0  " 63          58          4          
##  3         3 "   MI " " ADITYA BAJA… "6.0  " 8           61          25         
##  4         4 "   MI " " PATRICK H S… "5.5  " 23          28          2          
##  5         5 "   MI " " HANSHI ZUO … "5.5  " 45          37          12         
##  6         6 "   OH " " HANSEN SONG… "5.0  " 34          29          11         
##  7         7 "   MI " " GARY DEE SW… "5.0  " 57          46          13         
##  8         8 "   MI " " EZEKIEL HOU… "5.0  " 3           32          14         
##  9         9 "   ON " " STEFANO LEE… "5.0  " 25          18          59         
## 10        10 "   MI " " ANVIT RAO  … "5.0  " 16          19          55         
## # ℹ 5 more variables: round_4_opp <chr>, round_5_opp <chr>, round_6_opp <chr>,
## #   round_7_opp <chr>, pre_rating <chr>

This data is still untidy since, if we add more rounds to the tournament, we would need to continuously add more columns making the data frame wider and wider. Additionally, we’ll need to perform statistics on the opponent ratings, so to avoid adding a bunch of columns, let’s use a pivot_longer to create a new variable called opp_id:

df6 <- df5 %>%
  pivot_longer(cols = round_1_opp:round_7_opp, names_to = "round", values_to = "opp_id", values_drop_na = TRUE)
head(df6, 10)
## # A tibble: 10 × 7
##    player_id pair_2   player_name_1              total_1 pre_rating round opp_id
##        <int> <chr>    <chr>                      <chr>   <chr>      <chr> <chr> 
##  1         1 "   ON " " GARY HUA               … "6.0  " 1794       roun… 39    
##  2         1 "   ON " " GARY HUA               … "6.0  " 1794       roun… 21    
##  3         1 "   ON " " GARY HUA               … "6.0  " 1794       roun… 18    
##  4         1 "   ON " " GARY HUA               … "6.0  " 1794       roun… 14    
##  5         1 "   ON " " GARY HUA               … "6.0  " 1794       roun… 7     
##  6         1 "   ON " " GARY HUA               … "6.0  " 1794       roun… 12    
##  7         1 "   ON " " GARY HUA               … "6.0  " 1794       roun… 4     
##  8         2 "   MI " " DAKSHESH DARURI        … "6.0  " 1553       roun… 63    
##  9         2 "   MI " " DAKSHESH DARURI        … "6.0  " 1553       roun… 58    
## 10         2 "   MI " " DAKSHESH DARURI        … "6.0  " 1553       roun… 4

Now each observation represents an individual player and a round of chess the player played. So if a new round occurs, new rows can be inserted (this won’t be the final data frame, but this type of data frame we’ll be easier to work with in order to reach the final data frame for this project). One thing I would probably explore doing is creating a data frame for each player so if a new round is played, you wouldn’t need to add 2 rows to a single data frame which has a risk of accidentally forgetting to add a round for a player. Alternatively, I would explore making each observation a player combination where each observation represents a single game played, again only adding 1 row of data for each game played as opposed to 2. This does not seem necessary for this project, so I won’t explore this here.

We know we’ll need the average opponent rating for each player, so let’s create a separate data frame specifically to give information about the player pre-ratings:

df_ratings <- df6 %>%
  select(player_id, player_name_1, pre_rating)
df_ratings_final <- distinct(df_ratings) # Get rid of the duplicate rows
df_ratings_final
## # A tibble: 64 × 3
##    player_id player_name_1                       pre_rating
##        <int> <chr>                               <chr>     
##  1         1 " GARY HUA                        " 1794      
##  2         2 " DAKSHESH DARURI                 " 1553      
##  3         3 " ADITYA BAJAJ                    " 1384      
##  4         4 " PATRICK H SCHILLING             " 1716      
##  5         5 " HANSHI ZUO                      " 1655      
##  6         6 " HANSEN SONG                     " 1686      
##  7         7 " GARY DEE SWATHELL               " 1649      
##  8         8 " EZEKIEL HOUGHTON                " 1641      
##  9         9 " STEFANO LEE                     " 1411      
## 10        10 " ANVIT RAO                       " 1365      
## # ℹ 54 more rows

Now let’s use match the player_id from df_ratings_final and opp_id from the original data frame to create a new variable opp_rating for the opponent rating in the original data frame:

df7 <- df6 %>%
  mutate(opp_rating = df_ratings_final$pre_rating[as.numeric(opp_id)]) # grab pre-rating from `df_ratings_final`
head(df7, 10)
## # A tibble: 10 × 8
##    player_id pair_2   player_name_1   total_1 pre_rating round opp_id opp_rating
##        <int> <chr>    <chr>           <chr>   <chr>      <chr> <chr>  <chr>     
##  1         1 "   ON " " GARY HUA    … "6.0  " 1794       roun… 39     1436      
##  2         1 "   ON " " GARY HUA    … "6.0  " 1794       roun… 21     1563      
##  3         1 "   ON " " GARY HUA    … "6.0  " 1794       roun… 18     1600      
##  4         1 "   ON " " GARY HUA    … "6.0  " 1794       roun… 14     1610      
##  5         1 "   ON " " GARY HUA    … "6.0  " 1794       roun… 7      1649      
##  6         1 "   ON " " GARY HUA    … "6.0  " 1794       roun… 12     1663      
##  7         1 "   ON " " GARY HUA    … "6.0  " 1794       roun… 4      1716      
##  8         2 "   MI " " DAKSHESH DAR… "6.0  " 1553       roun… 63     1175      
##  9         2 "   MI " " DAKSHESH DAR… "6.0  " 1553       roun… 58     917       
## 10         2 "   MI " " DAKSHESH DAR… "6.0  " 1553       roun… 4      1716

A player does not have to actually play a round of chess for each round. For example, the player could skip a round. For this project, we do not care about this information, so let’s just remove the observations where a player didn’t play a round of chess.

Last bit of information we need is the average opponent rating for each player. For that, let’s create a new data frame and perform some statistics. Let’s group by player_id, and find the average of the opp_rating column and summarize the data:

df8 <- df7 %>%
  filter(!is.na(as.numeric(opp_rating))) %>% # Don't count rounds if it's a B or an H
  group_by(player_id) %>%
  summarise(opp_mean = mean(as.numeric(opp_rating)))
head(df8, 10)
## # A tibble: 10 × 2
##    player_id opp_mean
##        <int>    <dbl>
##  1         1    1605.
##  2         2    1469.
##  3         3    1564.
##  4         4    1574.
##  5         5    1501.
##  6         6    1519.
##  7         7    1372.
##  8         8    1468.
##  9         9    1523.
## 10        10    1554.

Using the data frame that had a single player in each row (since we know the final data frame is more of a summary data frame, which will have a row/observation for a single player), let’s add on the new average opponent rating column:

df9 <- df5 %>%
  select(player_id , player_name_1, pair_2, total_1, pre_rating) %>%
  mutate(avg_opp_rating = df8$opp_mean[as.numeric(player_id)]) %>%
  select(player_name_1, pair_2, total_1, pre_rating, avg_opp_rating)
head(df9, 10)
## # A tibble: 10 × 5
##    player_name_1                       pair_2  total_1 pre_rating avg_opp_rating
##    <chr>                               <chr>   <chr>   <chr>               <dbl>
##  1 " GARY HUA                        " "   ON… "6.0  " 1794                1605.
##  2 " DAKSHESH DARURI                 " "   MI… "6.0  " 1553                1469.
##  3 " ADITYA BAJAJ                    " "   MI… "6.0  " 1384                1564.
##  4 " PATRICK H SCHILLING             " "   MI… "5.5  " 1716                1574.
##  5 " HANSHI ZUO                      " "   MI… "5.5  " 1655                1501.
##  6 " HANSEN SONG                     " "   OH… "5.0  " 1686                1519.
##  7 " GARY DEE SWATHELL               " "   MI… "5.0  " 1649                1372.
##  8 " EZEKIEL HOUGHTON                " "   MI… "5.0  " 1641                1468.
##  9 " STEFANO LEE                     " "   ON… "5.0  " 1411                1523.
## 10 " ANVIT RAO                       " "   MI… "5.0  " 1365                1554.

After selecting the specific columns needed for this task, we now have our final data frame.

To improve our data frame, let’s:

names(df9) <- c('player_name', 'player_state', 'total_points', 'player_pre_rating', 'avg_opp_pre_rating')
# Remove white space and make sure number values are not strings
df_final <- df9 %>% 
  mutate(across(where(is.character), str_trim))
df_final$total_points <- as.numeric(as.character(df_final$total_points))
df_final$player_pre_rating <- as.numeric(as.character(df_final$player_pre_rating))
df_final$avg_opp_pre_rating <- round(as.numeric(as.character(df_final$avg_opp_pre_rating)))
df_final
## # A tibble: 64 × 5
##    player_name    player_state total_points player_pre_rating avg_opp_pre_rating
##    <chr>          <chr>               <dbl>             <dbl>              <dbl>
##  1 GARY HUA       ON                    6                1794               1605
##  2 DAKSHESH DARU… MI                    6                1553               1469
##  3 ADITYA BAJAJ   MI                    6                1384               1564
##  4 PATRICK H SCH… MI                    5.5              1716               1574
##  5 HANSHI ZUO     MI                    5.5              1655               1501
##  6 HANSEN SONG    OH                    5                1686               1519
##  7 GARY DEE SWAT… MI                    5                1649               1372
##  8 EZEKIEL HOUGH… MI                    5                1641               1468
##  9 STEFANO LEE    ON                    5                1411               1523
## 10 ANVIT RAO      MI                    5                1365               1554
## # ℹ 54 more rows

This data frame has all the necessary data, so let’s write it to a csv called gtm_project1 in the user’s Downloads folder:

write.csv(df_final, "~/Downloads/gtm_project1.csv", row.names = FALSE)