In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605
1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.
The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(dplyr)
First we load the raw data from the text file and look at how it is structured.
raw_tournament_data <- readLines("https://raw.githubusercontent.com/william-forero/Data-607/refs/heads/main/tournamentinfo.txt")
## Warning in
## readLines("https://raw.githubusercontent.com/william-forero/Data-607/refs/heads/main/tournamentinfo.txt"):
## incomplete final line found on
## 'https://raw.githubusercontent.com/william-forero/Data-607/refs/heads/main/tournamentinfo.txt'
head(raw_tournament_data,21)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
## [11] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [12] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [13] "-----------------------------------------------------------------------------------------"
## [14] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [15] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [16] "-----------------------------------------------------------------------------------------"
## [17] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [18] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [19] "-----------------------------------------------------------------------------------------"
## [20] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
## [21] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
We can see that there is a repetitive structure.
Each players information is split into two rows: Row 1 always begins with a number Row 2 always begins with a letter
Players are split by a row of “—”
First I will remove the headers and any rows that are all ‘—’
# Removing the lines consisting of all hyphens
tournament_data <- raw_tournament_data[!str_detect(raw_tournament_data, "^-+$")]
# Remove the headers
tournament_data <- tournament_data[-(1:2)]
head(tournament_data,10)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [4] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [5] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [6] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [7] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [8] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [9] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [10] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
Next we parse out the two player row formats
# Checking the first 7 characters of the row and determining if it has a number
player_row_1 <- tournament_data[str_detect(str_sub(tournament_data, 1, 7), "[0-9]")]
# Checking the first 7 characters of the row and determining if it has a letter
player_row_2 <- tournament_data[str_detect(str_sub(tournament_data, 1, 7), "[A-Z]")]
head(player_row_1)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
head(player_row_2)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
Now we break each field out into its own column. The format will depend on the row type (row 1 or 2 of the players data)
# First I split by the delimeter |
player_row1_df <- read_delim(I(player_row_1),"|",trim_ws = TRUE, col_names = FALSE)
## Rows: 64 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "|"
## chr (8): X2, X4, X5, X6, X7, X8, X9, X10
## dbl (2): X1, X3
## lgl (1): X11
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename the columns
names(player_row1_df) <- c("pair_num","player_name","total_points",
"round1","round2","round3","round4","round5",
"round6","round7")
## Warning: The `value` argument of `names<-()` must have the same length as `x` as of
## tibble 3.0.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `value` argument of `names<-()` can't be empty as of tibble 3.0.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
player_row1_df <- player_row1_df |> select(-last_col())
head(player_row1_df)
## # A tibble: 6 × 10
## pair_num player_name total_points round1 round2 round3 round4 round5 round6
## <dbl> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 GARY HUA 6 W 39 W 21 W 18 W 14 W 7 D 12
## 2 2 DAKSHESH DARU… 6 W 63 W 58 L 4 W 17 W 16 W 20
## 3 3 ADITYA BAJAJ 6 L 8 W 61 W 25 W 21 W 11 W 13
## 4 4 PATRICK H SCH… 5.5 W 23 D 28 W 2 W 26 D 5 W 19
## 5 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13 D 4 W 14
## 6 6 HANSEN SONG 5 W 34 D 29 L 11 W 35 D 10 W 27
## # ℹ 1 more variable: round7 <chr>
Next we parse out the rows of the second format
# Parsing out the 2nd row of each players data
player_row2_df <- data_frame(player_state = str_trim(substr(player_row_2, 1, 6), side="both"),
uscf_id = substr(player_row_2, 8, 16),
pre_rating = as.numeric(substr(player_row_2, 22, 26)),
post_rating = as.numeric(substr(player_row_2, 32, 35))
)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
head(player_row2_df)
## # A tibble: 6 × 4
## player_state uscf_id pre_rating post_rating
## <chr> <chr> <dbl> <dbl>
## 1 ON " 15445895" 1794 1817
## 2 MI " 14598900" 1553 1663
## 3 MI " 14959604" 1384 1640
## 4 MI " 12616049" 1716 1744
## 5 MI " 14601533" 1655 1690
## 6 OH " 15055204" 1686 1687
Now we combine the two dfs and pivot longer
# We combine the two data frames into one comprehensive player data frame
full_player_data <- bind_cols(player_row1_df,player_row2_df)
# Now we pivot longer so that the rounds become rows, we also split outcome and opponent
player_data_long <- full_player_data |>
pivot_longer(
cols = starts_with("round"),
names_to = "round",
values_to = "round_result"
) |>
extract(
round_result,
into = c("rd_outcome", "rd_opponent"),
regex = "([A-Z])\\s*(\\d+)"
) |>
# This is used to remove round rows where the player did not have an opponent
filter(!is.na(rd_opponent)) |>
mutate(rd_opponent = as.numeric(rd_opponent))
head(player_data_long)
## # A tibble: 6 × 10
## pair_num player_name total_points player_state uscf_id pre_rating post_rating
## <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 GARY HUA 6 ON " 15445… 1794 1817
## 2 1 GARY HUA 6 ON " 15445… 1794 1817
## 3 1 GARY HUA 6 ON " 15445… 1794 1817
## 4 1 GARY HUA 6 ON " 15445… 1794 1817
## 5 1 GARY HUA 6 ON " 15445… 1794 1817
## 6 1 GARY HUA 6 ON " 15445… 1794 1817
## # ℹ 3 more variables: round <chr>, rd_outcome <chr>, rd_opponent <dbl>
We can create a column that provides the opponents pre-rating by first creating an intermediary table that contains the player id as well as the pre_rating. Once we have that we can join the opponents pre_rating onto the larger df to be used to calculate the average opponent pre-rating.
# Creating an intermediary table to pull the opponents rating
player_row1_subset <- player_row1_df |> select(1)
player_row2_subset <- player_row2_df |> select(3)
player_pre_ratings <- bind_cols(player_row1_subset,player_row2_subset)
player_data_long <- player_data_long |>
left_join(player_pre_ratings, by = c("rd_opponent" = "pair_num")) |>
rename(`opp_pre_rating` = pre_rating.y) |>
rename(`pre_rating` = pre_rating.x)
head(player_data_long)
## # A tibble: 6 × 11
## pair_num player_name total_points player_state uscf_id pre_rating post_rating
## <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 GARY HUA 6 ON " 15445… 1794 1817
## 2 1 GARY HUA 6 ON " 15445… 1794 1817
## 3 1 GARY HUA 6 ON " 15445… 1794 1817
## 4 1 GARY HUA 6 ON " 15445… 1794 1817
## 5 1 GARY HUA 6 ON " 15445… 1794 1817
## 6 1 GARY HUA 6 ON " 15445… 1794 1817
## # ℹ 4 more variables: round <chr>, rd_outcome <chr>, rd_opponent <dbl>,
## # opp_pre_rating <dbl>
Now we can extract the fields we want to provide in csv format
# Calulate the average opponent rating
final_player_results <- player_data_long |>
group_by(pair_num,player_name, player_state, total_points, pre_rating) |>
summarise(avg_opp_pre_rating = round(mean(opp_pre_rating, na.rm = TRUE)))
## `summarise()` has grouped output by 'pair_num', 'player_name', 'player_state',
## 'total_points'. You can override using the `.groups` argument.
print(final_player_results)
## # A tibble: 64 × 6
## # Groups: pair_num, player_name, player_state, total_points [64]
## pair_num player_name player_state total_points pre_rating avg_opp_pre_rating
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 GARY HUA ON 6 1794 1605
## 2 2 DAKSHESH DA… MI 6 1553 1469
## 3 3 ADITYA BAJAJ MI 6 1384 1564
## 4 4 PATRICK H S… MI 5.5 1716 1574
## 5 5 HANSHI ZUO MI 5.5 1655 1501
## 6 6 HANSEN SONG OH 5 1686 1519
## 7 7 GARY DEE SW… MI 5 1649 1372
## 8 8 EZEKIEL HOU… MI 5 1641 1468
## 9 9 STEFANO LEE ON 5 1411 1523
## 10 10 ANVIT RAO MI 5 1365 1554
## # ℹ 54 more rows
Now that we have the data in the format that we want, we can export to a CSV file.
write.csv(final_player_results, "tournamentinfo.csv")