Project 1

Author

Naomi Buell

Published

February 15, 2024

Introduction

In this project, I take a text file with chess tournament results, and create Quarto file that generates a .CSV file with the following information for all of the players: Player’s Name, Player’s state, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents.

Data Transformation

First, I determine column widths by browsing the text file.

# Defining the column widths based on the structure of the file
col_widths <- c(8, 33, 6, 6, 6, 6, 6, 6, 6, 6)

Next, I import the data with columns of fixed width.

# URL of text file saved on github so data is accessible for anyone running the script.
file_url <-
  "https://github.com/naomibuell/DATA607/raw/main/DATA607_project1.txt"

# Read the fixed-width format file into a data frame
chess_data <-
  read_fwf(file_url,
           fwf_widths(col_widths),
           skip = 4,
           show_col_types = FALSE) |>
  filter(X1 != "--------") |> # remove blank rows of dashes
  mutate_all(~ str_replace_all(as.character(.), c("\\|" = ""))) |> # removing "|"s from the data
  mutate(row_id = row_number(), # creating row ids
         pid = round_half_up((row_number()) / 2, digits = 0)) # creating person IDs

Since text file structure has 2 rows per person-level observation, I transform it to just 1 row per observation by splitting each player’s two rows into separate sets of columns.

chess_data_left <-
  chess_data |> # get left 12 cols of per-person obs
  filter(pid == as.integer(X1))
chess_data_right <-
  chess_data |> # get right 12 cols of per-person obs
  filter(pid == row_id / 2)

Then I join the data back together and clean up the variables.

# Join and clean variables
chess_joined <-
  full_join(chess_data_left, chess_data_right, by = "pid") |> # join left and right sides of data based on pid var
  janitor::clean_names() |> # rename vars
  rename(
    name = x2_x,
    state = x1_y,
    total = x3_x,
    opp1 = x4_x,
    opp2 = x5_x,
    opp3 = x6_x,
    opp4 = x7_x,
    opp5 = x8_x,
    opp6 = x9_x,
    opp7 = x10_x,
    rating = x2_y
  ) |>
  select(-c(starts_with("row_id"), starts_with("x"))) |> # dropping unneeded columns
  mutate(
    # trimming strings and assigning var types
    name = str_trim(name),
    total = parse_double(total),
    opp1 = parse_number(opp1),
    opp2 = parse_number(opp2),
    opp3 = parse_number(opp3),
    opp4 = parse_number(opp4),
    opp5 = parse_number(opp5),
    opp6 = parse_number(opp6),
    opp7 = parse_number(opp7),
    state = as.factor(str_trim(state)),
    rating = parse_number(str_extract(rating, " \\d+"))
  )

Here is the tidied data set with 1 row per observation. Variables opp1 through opp7 represent each player’s opponents’ ID.

head(chess_joined)
# A tibble: 6 × 12
  name        total  opp1  opp2  opp3  opp4  opp5  opp6  opp7   pid state rating
  <chr>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>  <dbl>
1 GARY HUA      6      39    21    18    14     7    12     4     1 ON      1794
2 DAKSHESH D…   6      63    58     4    17    16    20     7     2 MI      1553
3 ADITYA BAJ…   6       8    61    25    21    11    13    12     3 MI      1384
4 PATRICK H …   5.5    23    28     2    26     5    19     1     4 MI      1716
5 HANSHI ZUO    5.5    45    37    12    13     4    14    17     5 MI      1655
6 HANSEN SONG   5      34    29    11    35    10    27    21     6 OH      1686

Using these IDs, I find the corresponding opponents’ rating, and assign the average rating of each player’s opponents to a new variable opp_avg. I also drop variables from the data that are not required by the assignment.

# Look up average pre-chess rating of opponents:
chess_lookup <- chess_joined |>
  mutate(
    opp1 = ifelse(opp1 %in% pid, rating[match(opp1, pid)], NA),
    opp2 = ifelse(opp2 %in% pid, rating[match(opp2, pid)], NA),
    opp3 = ifelse(opp3 %in% pid, rating[match(opp3, pid)], NA),
    opp4 = ifelse(opp4 %in% pid, rating[match(opp4, pid)], NA),
    opp5 = ifelse(opp5 %in% pid, rating[match(opp5, pid)], NA),
    opp6 = ifelse(opp6 %in% pid, rating[match(opp6, pid)], NA),
    opp7 = ifelse(opp7 %in% pid, rating[match(opp7, pid)], NA)
  )

chess_avgs <- chess_lookup |>
  mutate(opp_avg = rowMeans(select(chess_lookup, starts_with("opp")), na.rm = TRUE)) |>
  select(-c(pid, matches("opp\\d"))) # drop unneeded vars

Now, the data set has all required info: player’s name (name), player’s state (state), total number of points (total), player’s pre-rating (rating), and average pre-chess rating of opponents (opp_avg).

head(chess_avgs)
# A tibble: 6 × 5
  name                total state rating opp_avg
  <chr>               <dbl> <fct>  <dbl>   <dbl>
1 GARY HUA              6   ON      1794   1605.
2 DAKSHESH DARURI       6   MI      1553   1469.
3 ADITYA BAJAJ          6   MI      1384   1564.
4 PATRICK H SCHILLING   5.5 MI      1716   1574.
5 HANSHI ZUO            5.5 MI      1655   1501.
6 HANSEN SONG           5   OH      1686   1519.

Lastly, I export the file as a .CSV.

write_csv(chess_avgs, "DATA607chessdata.csv")

Before I conclude, I also explore the top players in the final data set.

chess_explore <- mutate(chess_avgs, ratio = rating / opp_avg)
arrange(chess_explore, desc(rating)) # top player ratings
# A tibble: 64 × 6
   name                     total state rating opp_avg ratio
   <chr>                    <dbl> <fct>  <dbl>   <dbl> <dbl>
 1 GARY HUA                   6   ON      1794   1605.  1.12
 2 LOREN SCHWIEBERT           3.5 MI      1745   1363.  1.28
 3 PATRICK H SCHILLING        5.5 MI      1716   1574.  1.09
 4 CAMERON WILLIAM MC LEMAN   4.5 MI      1712   1468.  1.17
 5 HANSEN SONG                5   OH      1686   1519.  1.11
 6 TORRANCE HENRY JR          4.5 MI      1666   1498.  1.11
 7 KENNETH J TACK             4.5 MI      1663   1506.  1.10
 8 HANSHI ZUO                 5.5 MI      1655   1501.  1.10
 9 GARY DEE SWATHELL          5   MI      1649   1372.  1.20
10 EZEKIEL HOUGHTON           5   MI      1641   1468.  1.12
# ℹ 54 more rows
arrange(chess_explore, desc(opp_avg)) # top player opponents' ratings
# A tibble: 64 × 6
   name                       total state rating opp_avg ratio
   <chr>                      <dbl> <fct>  <dbl>   <dbl> <dbl>
 1 GARY HUA                     6   ON      1794   1605. 1.12 
 2 PATRICK H SCHILLING          5.5 MI      1716   1574. 1.09 
 3 ADITYA BAJAJ                 6   MI      1384   1564. 0.885
 4 ANVIT RAO                    5   MI      1365   1554. 0.878
 5 BRIAN LIU                    3   MI      1423   1539. 0.925
 6 STEFANO LEE                  5   ON      1411   1523. 0.926
 7 SOFIA ADINA STANESCU-BELLU   3.5 MI      1507   1522. 0.990
 8 HANSEN SONG                  5   OH      1686   1519. 1.11 
 9 BRADLEY SHAW                 4.5 MI      1610   1515  1.06 
10 MAX ZHU                      3.5 ON      1579   1507. 1.05 
# ℹ 54 more rows
arrange(chess_explore, desc(ratio)) # highest ratio of player:opponents ratings
# A tibble: 64 × 6
   name                     total state rating opp_avg ratio
   <chr>                    <dbl> <fct>  <dbl>   <dbl> <dbl>
 1 GEORGE AVERY JONES         3.5 ON      1522   1144.  1.33
 2 ASHWIN BALAJI              1   MI      1530   1186   1.29
 3 LOREN SCHWIEBERT           3.5 MI      1745   1363.  1.28
 4 GAURAV GIDWANI             3.5 MI      1552   1222.  1.27
 5 JOSHUA DAVID LEE           3.5 MI      1438   1150.  1.25
 6 CHIEDOZIE OKORIE           3.5 MI      1602   1314.  1.22
 7 GARY DEE SWATHELL          5   MI      1649   1372.  1.20
 8 EUGENE L MCCLURE           4   MI      1555   1300.  1.20
 9 RISHI SHETTY               3.5 MI      1494   1260.  1.19
10 CAMERON WILLIAM MC LEMAN   4.5 MI      1712   1468.  1.17
# ℹ 54 more rows

Conclusion

In this assignment, I used REGEX, joins, and other techniques to transform and tidy chess tournament data from a .TXT file. By browsing the final data set, I found that Gary Hua had both the highest chess rating and, in turn, the highest rated chess opponents, on average. Interestingly, Aditya Bajaj played the third toughest opponents on average, despite being ranked 36th out of all pre-chess ratings of players in the tournament.