Project_1

Author

Sam Barbaro

Approach

I will import the text file from Github, use | as a delimiter remove dashes, pivot wider so the two rows for each player are aligned, clean the data, and calculate the new number that references other opponents’ ranks.

Importing the data

library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.5.2
Warning: package 'ggplot2' was built under R version 4.5.2
Warning: package 'tibble' was built under R version 4.5.2
Warning: package 'readr' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.1     ✔ tibble    3.3.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
new_data <- read.table("https://raw.githubusercontent.com/samanthabarbaro/data607/refs/heads/main/chess.txt", sep = "|", row.names = NULL, fill = TRUE)

#Can I drop blanks? (I tried this first with == "" and it didn't work)

filtered_new_data <- subset(new_data, V2 != "")

Attempting to add labels and pivot wider

#add labels and then pivot wider
#I will add a repeating row to the left side (e.g., 1, 2) and then use those to pivot the data wider

filtered_new_data <- filtered_new_data |>
    mutate(new_col = rep(c(1, 2), length.out = n())) 

Breaking columns up and adding labels

final_chess <- filtered_new_data |>
  mutate(person_id = rep(1:(n()/2), each = 2)) |>
  pivot_wider(
    id_cols = person_id,
    names_from = c(new_col),
    values_from = c(V1, V2, V3, V4, V5, V6, V7, V8, V9, V10)
  )

  
print(final_chess)
# A tibble: 65 × 21
   person_id V1_1    V1_2  V2_1  V2_2  V3_1  V3_2  V4_1  V4_2  V5_1  V5_2  V6_1 
       <int> <chr>   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
 1         1 " Pair… " Nu… " Pl… " US… "Tot… " Pt… Round "  1… Round "  2… Round
 2         2 "    1… "   … " GA… " 15… "6.0… "N:2… W  39 "W  … W  21 "B  … W  18
 3         3 "    2… "   … " DA… " 14… "6.0… "N:2… W  63 "B  … W  58 "W  … L   4
 4         4 "    3… "   … " AD… " 14… "6.0… "N:2… L   8 "W  … W  61 "B  … W  25
 5         5 "    4… "   … " PA… " 12… "5.5… "N:2… W  23 "W  … D  28 "B  … W   2
 6         6 "    5… "   … " HA… " 14… "5.5… "N:2… W  45 "B  … W  37 "W  … D  12
 7         7 "    6… "   … " HA… " 15… "5.0… "N:3… W  34 "W  … D  29 "B  … L  11
 8         8 "    7… "   … " GA… " 11… "5.0… "N:3… W  57 "W  … W  46 "B  … W  13
 9         9 "    8… "   … " EZ… " 15… "5.0… "N:3… W   3 "B  … W  32 "W  … L  14
10        10 "    9… "   … " ST… " 14… "5.0… "N:2… W  25 "W  … L  18 "B  … W  59
# ℹ 55 more rows
# ℹ 9 more variables: V6_2 <chr>, V7_1 <chr>, V7_2 <chr>, V8_1 <chr>,
#   V8_2 <chr>, V9_1 <chr>, V9_2 <chr>, V10_1 <chr>, V10_2 <chr>
#let's clean up the names

more_final_chess <- final_chess |> select(player_id = V1_1, hometown = V1_2, player_name = V2_1, ranks = V2_2, total_pts = V3_1, first_opp = V4_1, second_opp = V5_1, third_opp = V6_1, fourth_opp = V7_1, fifth_opp = V8_1, sixth_opp = V9_1, seventh_opp = V10_1)


#parse out ranks 
#there are p + numbers on some ranks that will need to go

separated_chess <- more_final_chess |>
    separate(col = ranks, into = c("USCF_ID", "scores"), sep = "/") |> separate(col = scores, into = c("pre_rank", "post_rank"), sep = "->") |> slice(-1)


#Let's delete some strings
#there were hidden spaces here and NAs weren't showing, so I used str_trim
del_chess <- separated_chess|>
  mutate((across(ends_with("_opp"), ~ str_sub(.x, start = 3))), pre_rank = str_sub(pre_rank, start = 4), across(everything(), ~ str_trim(.x) |> na_if("")))


#getting rid of everything after the p

chess_p <- del_chess |> 
    mutate(pre_rank = str_remove(pre_rank, "P.*")) |> 
    mutate(post_rank = str_remove(post_rank, "P.*"))

view(chess_p)

Creating a reference for finding the opponent pre-tournament ranks

#reference table
ref_tab <- chess_p |> select(player_id, opp_one_rnk = pre_rank)


ref <- ref_tab |> 
  mutate(id = player_id)


#attempting a join

ref <- chess_p |> 
    select(player_id, opp_one_rnk = pre_rank)


chess_final <- chess_p |>
    left_join(ref, by = c("first_opp" = "player_id"))


#let's try to join this all at once though 

ref <- chess_p |> 
  select(player_id, temp_rank = pre_rank)


opp_cols <- c("first_opp", "second_opp", "third_opp", "fourth_opp", 
              "fifth_opp", "sixth_opp", "seventh_opp")

# Join everything

#chess_final <- reduce(opp_cols, function(data, col_name) {
#    new_rank_name <- paste0(col_name, "_rnk")
    
#    data |>
#        left_join(rank_lookup, by = setNames("player_id", col_name)) |>
#        rename(!!new_rank_name := temp_rank)

#}, .init = chess_p)


#it occured to me that I could pivot longer, join just once and then just summarize (much easier)

chess_long <- chess_p |>
    pivot_longer(
        cols = c(first_opp, second_opp, third_opp, fourth_opp, fifth_opp, sixth_opp, seventh_opp),
        names_to = "round",
        values_to = "opponent_id"
    )

#*now* left join -- just once

chess_joined <- chess_long |>
    left_join(ref, by = c("opponent_id" = "player_id"))

chess_joined |> mutate(temp_rank = as.numeric(temp_rank))
# A tibble: 448 × 10
   player_id hometown player_name     USCF_ID pre_rank post_rank total_pts round
   <chr>     <chr>    <chr>           <chr>   <chr>    <chr>     <chr>     <chr>
 1 1         ON       GARY HUA        154458… 1794     1817      6.0       firs…
 2 1         ON       GARY HUA        154458… 1794     1817      6.0       seco…
 3 1         ON       GARY HUA        154458… 1794     1817      6.0       thir…
 4 1         ON       GARY HUA        154458… 1794     1817      6.0       four…
 5 1         ON       GARY HUA        154458… 1794     1817      6.0       fift…
 6 1         ON       GARY HUA        154458… 1794     1817      6.0       sixt…
 7 1         ON       GARY HUA        154458… 1794     1817      6.0       seve…
 8 2         MI       DAKSHESH DARURI 145989… 1553     1663      6.0       firs…
 9 2         MI       DAKSHESH DARURI 145989… 1553     1663      6.0       seco…
10 2         MI       DAKSHESH DARURI 145989… 1553     1663      6.0       thir…
# ℹ 438 more rows
# ℹ 2 more variables: opponent_id <chr>, temp_rank <dbl>
#Arrange and get averages


chess_averages <- chess_joined |> 
    group_by(player_id, player_name, hometown, total_pts, pre_rank) |> mutate(temp_rank = as.numeric(temp_rank)) |>
    summarize(avg_opponent_rank = round(mean(temp_rank, na.rm = TRUE),0))
`summarise()` has grouped output by 'player_id', 'player_name', 'hometown',
'total_pts'. You can override using the `.groups` argument.
#Organizing by rank and converting players' names to regular case:

very_final_chess <- chess_averages |> mutate(player_id = as.numeric(player_id)) |> arrange(player_id) |> mutate(player_name = str_to_title(player_name))

print(very_final_chess)
# A tibble: 64 × 6
# Groups:   player_id, player_name, hometown, total_pts [64]
   player_id player_name         hometown total_pts pre_rank avg_opponent_rank
       <dbl> <chr>               <chr>    <chr>     <chr>                <dbl>
 1         1 Gary Hua            ON       6.0       1794                  1605
 2         2 Dakshesh Daruri     MI       6.0       1553                  1469
 3         3 Aditya Bajaj        MI       6.0       1384                  1564
 4         4 Patrick H Schilling MI       5.5       1716                  1574
 5         5 Hanshi Zuo          MI       5.5       1655                  1501
 6         6 Hansen Song         OH       5.0       1686                  1519
 7         7 Gary Dee Swathell   MI       5.0       1649                  1372
 8         8 Ezekiel Houghton    MI       5.0       1641                  1468
 9         9 Stefano Lee         ON       5.0       1411                  1523
10        10 Anvit Rao           MI       5.0       1365                  1554
# ℹ 54 more rows
glimpse(very_final_chess)
Rows: 64
Columns: 6
Groups: player_id, player_name, hometown, total_pts [64]
$ player_id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
$ player_name       <chr> "Gary Hua", "Dakshesh Daruri", "Aditya Bajaj", "Patr…
$ hometown          <chr> "ON", "MI", "MI", "MI", "MI", "OH", "MI", "MI", "ON"…
$ total_pts         <chr> "6.0", "6.0", "6.0", "5.5", "5.5", "5.0", "5.0", "5.…
$ pre_rank          <chr> "1794", "1553", "1384", "1716", "1655", "1686", "164…
$ avg_opponent_rank <dbl> 1605, 1469, 1564, 1574, 1501, 1519, 1372, 1468, 1523…