Used text file with chess tournament results where the information has some structure tournamentinfo.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(readr)
library(RCurl)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::complete() masks RCurl::complete()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
file_path <- "C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/tournamentinfo.txt"
chess_data <- readLines(file_path, warn = FALSE)
# Remove separator lines and empty lines
chess_data <- chess_data[!grepl("^[-\\s]+$", chess_data)]
#Assigning Regex
rxname <- "[[:digit:]]+[[:space:]]+[|]([:space:]+[:alpha:]+){2,}"
rxstate <- "(?<=[:space:])[:alpha:]{2}(?=[:space:][|])"
rxtotalpoints <-"[:digit:][:punct:][:digit:]"
rxplayerstartingscore <- "(?<=R:[:space:]{1,2})[:digit:]+"
rxplayersfinalscore <- "(?<=->[:space:]?)[:digit:]+"
rxrnds <- "(?<=[:digit:][:punct:][:digit:][:space:]{1,2}[|]{1}).{40,}"
#Creating Vectors
player_name <- unlist(str_extract_all(chess_data,rxname))
player_state <- unlist(str_extract_all(chess_data,rxstate))
total_points <- unlist(str_extract_all(chess_data,rxtotalpoints))
player_starting_score <- unlist(str_extract_all(chess_data,rxplayerstartingscore))
player_final_score <- unlist(str_extract_all(chess_data,rxplayersfinalscore))
rounds <- unlist(str_extract_all(chess_data,rxrnds))
#Data frame
chess_table <- data.frame(
player_name,
player_state,
player_starting_score,
player_final_score,
total_points,
rounds)
#Rounds
col_names <- c("round_1","round_2","round_3","round_4","round_5","round_6","round_7")
chess_table <- chess_table%>%
separate(rounds,col_names,sep = "[|]")
## Warning: Expected 7 pieces. Additional pieces discarded in 64 rows [1, 2, 3, 4, 5, 6, 7,
## 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
#New Data frame Transposed
chess_pivot <- chess_table%>%
pivot_longer(col_names,names_to = "round",values_to = "result")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(col_names)
##
## # Now:
## data %>% select(all_of(col_names))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
result_col_names <- c("result","opp_id")
id_col_names <- c("id","player_name")
chess_pivot <- chess_pivot%>%
separate(result,result_col_names)%>%
separate(player_name,id_col_names, sep = " [|]")
#Opponent rating
chess_opp_table <- chess_pivot%>%
select(id, player_state, player_starting_score,player_final_score)%>%
distinct()%>%
rename(opp_id=id,
opp_player_state = player_state ,
opp_player_starting_score = player_starting_score,
opp_player_final_score = player_final_score)
chess_table_comp<- merge(chess_pivot,chess_opp_table,all = TRUE)%>%
mutate(starting_rating_diff = as.numeric(player_starting_score) - as.numeric(opp_player_starting_score))
str(chess_table_comp)
## 'data.frame': 448 obs. of 13 variables:
## $ opp_id : chr "" "" "" "" ...
## $ id : chr "56" "53" "41" "53" ...
## $ player_name : chr " MARISA RICCI" " JOSE C YBARRA" " KYLE WILLIAM MURPHY" " JOSE C YBARRA" ...
## $ player_state : chr "MI" "MI" "MI" "MI" ...
## $ player_starting_score : chr "1153" "1393" "1403" "1393" ...
## $ player_final_score : chr "1140" "1359" "1341" "1359" ...
## $ total_points : chr "2.0" "2.0" "3.0" "2.0" ...
## $ round : chr "round_5" "round_1" "round_5" "round_3" ...
## $ result : chr "H" "H" "X" "H" ...
## $ opp_player_state : chr NA NA NA NA ...
## $ opp_player_starting_score: chr NA NA NA NA ...
## $ opp_player_final_score : chr NA NA NA NA ...
## $ starting_rating_diff : num NA NA NA NA NA NA NA NA NA NA ...
#Calculation
chess_table_smry <- chess_table_comp%>%
mutate(opp_player_starting_score = as.numeric(opp_player_starting_score), opp_player_final_score = as.numeric(opp_player_final_score), player_starting_score = as.numeric(player_starting_score), player_final_score = as.numeric(player_final_score), id = as.integer(id), total_points = as.numeric(total_points))%>%
#Filter out the records with no match played
filter(!is.na(opp_player_state))%>%
#Group by player
group_by(id,player_name, player_state, player_starting_score,player_final_score, total_points)%>%
#Add average opponent rating and number of matches
summarise(avg_opp_starting_score = round(mean(opp_player_starting_score),2), avg_opp_final_score = round(mean(opp_player_final_score),2), num_of_matches = length(round))
## `summarise()` has grouped output by 'id', 'player_name', 'player_state',
## 'player_starting_score', 'player_final_score'. You can override using the
## `.groups` argument.
write.csv(chess_table_comp,"chess-table.csv", row.names = FALSE)
write.csv(chess_table_smry,"chess-table-summary.csv", row.names = FALSE)
Generated csv file chess-table-summary.