R Markdown

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidyr)

url <- "https://raw.githubusercontent.com/j-song-npc/607-Project1/refs/heads/main/tournamentinfo.txt"

tournament <- read.table(url, sep = "|", 
                         header = FALSE, 
                         strip.white = TRUE, 
                         fill = TRUE) 


colnames(tournament) <- c("ID", "Player_Name", "Total points", "Round_1", "Round_2", "Round_3", "Round_4", "Round_5", "Round_6", "Round_7")

##Initial table cleanup

tournament <- tournament %>% distinct() 
tournament <- tournament %>% select(where(~ !all(is.na(.))))
tournament <- tournament %>% slice(-c(1,2,3)) 

##Split rows and combine tables

tournament <- tournament %>%
  mutate(row_id = rep(1:(n()/2), each = 2))

player_data <- tournament %>% 
  filter(row_number() %% 2 == 1)
secondary_data <- tournament %>% 
  filter(row_number() %% 2 == 0)
full_data <- left_join(player_data, secondary_data, 
                       by = "row_id", 
                       suffix =c("_player", "_secondary"))

##Clean new dataframe

colnames(full_data) <- c("ID", "Player_Name", "Total_points", "Round_1", "Round_2", "Round_3", "Round_4","Round_5", "Round_6", "Round_7","row_id", "State", "USCF_ID", "N", "Side_Round1", "Side_Round2", "Side_Round3", "Side_Round4", "Side_Round5", "Side_Round6", "Side_Round7")

full_data <- full_data %>% 
    select(-row_id)

##Split data in USCF/ID into new columns

full_data <- full_data %>%
  separate(USCF_ID, 
           into = c("USCF_ID", "Rating_Change"), 
           sep = " / R: ")
full_data <- full_data %>%
    separate(Rating_Change, into = c("Initial_Rating", "Final_Rating"),
             sep = "\\s*->\\s*", remove = FALSE)

##Flip table

full_data <- full_data %>%
    pivot_longer(cols = starts_with("Round"),  
     names_to = "Round_",
     values_to = "Result")

full_data <- full_data %>%
  separate(Result, into = c("Result", "Opponent_ID"), 
           sep = "\\s+")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 40 rows [82, 108, 112,
## 152, 189, 203, 250, 253, 257, 264, 285, 286, 287, 302, 332, 334, 342, 343, 348,
## 365, ...].
full_data <- full_data %>%
     mutate(Initial_Rating = 
              gsub("[A-Za-z].*", "", Initial_Rating))
full_data <- full_data %>%  
   mutate(Final_Rating = 
            gsub("[A-Za-z].*", "", Final_Rating))

##Mean of initial ratings

full_data <- full_data %>% 
   left_join(select(full_data, ID, Initial_Rating), 
             by = c("Opponent_ID" = "ID"), 
             suffix = c("_player", "_opponent"))
## Warning in left_join(., select(full_data, ID, Initial_Rating), by = c(Opponent_ID = "ID"), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 22 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
full_data <- full_data %>% distinct()

full_data <- full_data %>% 
  filter(!is.na(Initial_Rating_opponent))

full_data$Initial_Rating_opponent <- as.numeric(gsub("[^0-9.-]", "", full_data$Initial_Rating_opponent))

Clean_data <- full_data %>%
  group_by(Player_Name, State, Total_points, Initial_Rating_player) %>%
  summarize(Avg_Opponent_Rating = round(mean(Initial_Rating_opponent, na.rm = TRUE)))
## `summarise()` has grouped output by 'Player_Name', 'State', 'Total_points'. You
## can override using the `.groups` argument.
write.csv(Clean_data, file = "Tournament_initial_ratings.csv", row.names = FALSE)