I will import the text file from Github, use | as a delimiter remove dashes, pivot wider so the two rows for each player are aligned, clean the data, and calculate the new number that references other opponents’ ranks.
Importing the data
library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.5.2
Warning: package 'ggplot2' was built under R version 4.5.2
Warning: package 'tibble' was built under R version 4.5.2
Warning: package 'readr' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.6
✔ forcats 1.0.1 ✔ stringr 1.5.2
✔ ggplot2 4.0.1 ✔ tibble 3.3.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.1.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
new_data <-read.table("https://raw.githubusercontent.com/samanthabarbaro/data607/refs/heads/main/chess.txt", sep ="|", row.names =NULL, fill =TRUE)#Can I drop blanks? (I tried this first with == "" and it didn't work)filtered_new_data <-subset(new_data, V2 !="")
Attempting to add labels and pivot wider
#add labels and then pivot wider#I will add a repeating row to the left side (e.g., 1, 2) and then use those to pivot the data widerfiltered_new_data <- filtered_new_data |>mutate(new_col =rep(c(1, 2), length.out =n()))
# A tibble: 65 × 21
person_id V1_1 V1_2 V2_1 V2_2 V3_1 V3_2 V4_1 V4_2 V5_1 V5_2 V6_1
<int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 " Pair… " Nu… " Pl… " US… "Tot… " Pt… Round " 1… Round " 2… Round
2 2 " 1… " … " GA… " 15… "6.0… "N:2… W 39 "W … W 21 "B … W 18
3 3 " 2… " … " DA… " 14… "6.0… "N:2… W 63 "B … W 58 "W … L 4
4 4 " 3… " … " AD… " 14… "6.0… "N:2… L 8 "W … W 61 "B … W 25
5 5 " 4… " … " PA… " 12… "5.5… "N:2… W 23 "W … D 28 "B … W 2
6 6 " 5… " … " HA… " 14… "5.5… "N:2… W 45 "B … W 37 "W … D 12
7 7 " 6… " … " HA… " 15… "5.0… "N:3… W 34 "W … D 29 "B … L 11
8 8 " 7… " … " GA… " 11… "5.0… "N:3… W 57 "W … W 46 "B … W 13
9 9 " 8… " … " EZ… " 15… "5.0… "N:3… W 3 "B … W 32 "W … L 14
10 10 " 9… " … " ST… " 14… "5.0… "N:2… W 25 "W … L 18 "B … W 59
# ℹ 55 more rows
# ℹ 9 more variables: V6_2 <chr>, V7_1 <chr>, V7_2 <chr>, V8_1 <chr>,
# V8_2 <chr>, V9_1 <chr>, V9_2 <chr>, V10_1 <chr>, V10_2 <chr>
#let's clean up the namesmore_final_chess <- final_chess |>select(player_id = V1_1, hometown = V1_2, player_name = V2_1, ranks = V2_2, total_pts = V3_1, first_opp = V4_1, second_opp = V5_1, third_opp = V6_1, fourth_opp = V7_1, fifth_opp = V8_1, sixth_opp = V9_1, seventh_opp = V10_1)#parse out ranks #there are p + numbers on some ranks that will need to goseparated_chess <- more_final_chess |>separate(col = ranks, into =c("USCF_ID", "scores"), sep ="/") |>separate(col = scores, into =c("pre_rank", "post_rank"), sep ="->") |>slice(-1)#Let's delete some strings#there were hidden spaces here and NAs weren't showing, so I used str_trimdel_chess <- separated_chess|>mutate((across(ends_with("_opp"), ~str_sub(.x, start =3))), pre_rank =str_sub(pre_rank, start =4), across(everything(), ~str_trim(.x) |>na_if("")))#getting rid of everything after the pchess_p <- del_chess |>mutate(pre_rank =str_remove(pre_rank, "P.*")) |>mutate(post_rank =str_remove(post_rank, "P.*"))view(chess_p)
Creating a reference for finding the opponent pre-tournament ranks
#reference tableref_tab <- chess_p |>select(player_id, opp_one_rnk = pre_rank)ref <- ref_tab |>mutate(id = player_id)#attempting a joinref <- chess_p |>select(player_id, opp_one_rnk = pre_rank)chess_final <- chess_p |>left_join(ref, by =c("first_opp"="player_id"))#let's try to join this all at once though ref <- chess_p |>select(player_id, temp_rank = pre_rank)opp_cols <-c("first_opp", "second_opp", "third_opp", "fourth_opp", "fifth_opp", "sixth_opp", "seventh_opp")# Join everything#chess_final <- reduce(opp_cols, function(data, col_name) {# new_rank_name <- paste0(col_name, "_rnk")# data |># left_join(rank_lookup, by = setNames("player_id", col_name)) |># rename(!!new_rank_name := temp_rank)#}, .init = chess_p)#it occured to me that I could pivot longer, join just once and then just summarize (much easier)chess_long <- chess_p |>pivot_longer(cols =c(first_opp, second_opp, third_opp, fourth_opp, fifth_opp, sixth_opp, seventh_opp),names_to ="round",values_to ="opponent_id" )#*now* left join -- just oncechess_joined <- chess_long |>left_join(ref, by =c("opponent_id"="player_id"))chess_joined |>mutate(temp_rank =as.numeric(temp_rank))
# A tibble: 448 × 10
player_id hometown player_name USCF_ID pre_rank post_rank total_pts round
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 ON GARY HUA 154458… 1794 1817 6.0 firs…
2 1 ON GARY HUA 154458… 1794 1817 6.0 seco…
3 1 ON GARY HUA 154458… 1794 1817 6.0 thir…
4 1 ON GARY HUA 154458… 1794 1817 6.0 four…
5 1 ON GARY HUA 154458… 1794 1817 6.0 fift…
6 1 ON GARY HUA 154458… 1794 1817 6.0 sixt…
7 1 ON GARY HUA 154458… 1794 1817 6.0 seve…
8 2 MI DAKSHESH DARURI 145989… 1553 1663 6.0 firs…
9 2 MI DAKSHESH DARURI 145989… 1553 1663 6.0 seco…
10 2 MI DAKSHESH DARURI 145989… 1553 1663 6.0 thir…
# ℹ 438 more rows
# ℹ 2 more variables: opponent_id <chr>, temp_rank <dbl>
#Arrange and get averageschess_averages <- chess_joined |>group_by(player_id, player_name, hometown, total_pts, pre_rank) |>mutate(temp_rank =as.numeric(temp_rank)) |>summarize(avg_opponent_rank =round(mean(temp_rank, na.rm =TRUE),0))
`summarise()` has grouped output by 'player_id', 'player_name', 'hometown',
'total_pts'. You can override using the `.groups` argument.
#Organizing by rank and converting players' names to regular case:very_final_chess <- chess_averages |>mutate(player_id =as.numeric(player_id)) |>arrange(player_id) |>mutate(player_name =str_to_title(player_name))print(very_final_chess)
# A tibble: 64 × 6
# Groups: player_id, player_name, hometown, total_pts [64]
player_id player_name hometown total_pts pre_rank avg_opponent_rank
<dbl> <chr> <chr> <chr> <chr> <dbl>
1 1 Gary Hua ON 6.0 1794 1605
2 2 Dakshesh Daruri MI 6.0 1553 1469
3 3 Aditya Bajaj MI 6.0 1384 1564
4 4 Patrick H Schilling MI 5.5 1716 1574
5 5 Hanshi Zuo MI 5.5 1655 1501
6 6 Hansen Song OH 5.0 1686 1519
7 7 Gary Dee Swathell MI 5.0 1649 1372
8 8 Ezekiel Houghton MI 5.0 1641 1468
9 9 Stefano Lee ON 5.0 1411 1523
10 10 Anvit Rao MI 5.0 1365 1554
# ℹ 54 more rows