Load the libraries to use:
library(readr)
library(stringr)
# Read the whole tournament report into a single string.
tournament_str <- read_file("tournamentinfo.txt")

# Break the report into lines. trimws() strips surrounding spaces, tabs and
# any "\r" left behind by Windows line endings.
tournament_split <- strsplit(tournament_str, "\n")[[1]] |> trimws()

# Drop the horizontal rules between records. A line is treated as a rule when
# it consists solely of dashes, so the filter does not depend on the exact
# width of the report. Blank lines carry no fields and are dropped as well.
sep_pattern <- "^-*$"
without_sep <- tournament_split[!grepl(sep_pattern, tournament_split)]

# Split every remaining line into its "|"-delimited fields, stack the
# resulting character vectors (one row per line) and convert the matrix to a
# data frame.
tournament_df <- without_sep |>
  strsplit(split = "\\|") |>
  do.call(what = rbind) |>
  as.data.frame()

# Name the columns: three identifying fields followed by the seven rounds.
colnames(tournament_df) <- c(
  "pair_num", "player_name", "total_pts", paste0("round", 1:7)
)

# The first two rows are not needed (presumably the report's own header
# lines); drop them and renumber the remaining rows from 1.
tournament_df <- tournament_df[-c(1:2), ]
rownames(tournament_df) <- NULL

# Take a look at the first 10 entries.
head(tournament_df, 10)
## pair_num player_name total_pts round1 round2 round3
## 1 1 GARY HUA 6.0 W 39 W 21 W 18
## 2 ON 15445895 / R: 1794 ->1817 N:2 W B W
## 3 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4
## 4 MI 14598900 / R: 1553 ->1663 N:2 B W B
## 5 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25
## 6 MI 14959604 / R: 1384 ->1640 N:2 W B W
## 7 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2
## 8 MI 12616049 / R: 1716 ->1744 N:2 W B W
## 9 5 HANSHI ZUO 5.5 W 45 W 37 D 12
## 10 MI 14601533 / R: 1655 ->1690 N:2 B W B
## round4 round5 round6 round7
## 1 W 14 W 7 D 12 D 4
## 2 B W B W
## 3 W 17 W 16 W 20 W 7
## 4 W B W B
## 5 W 21 W 11 W 13 W 12
## 6 B W B W
## 7 W 26 D 5 W 19 D 1
## 8 B W B B
## 9 D 13 D 4 W 14 W 17
## 10 W B W B
# note: player names are on odd numbered rows, additional info is on even rows
Create a skeleton of the required dataframe:
# Zero-row template fixing the column names and types of the final table:
# two character columns followed by three numeric (double) columns.
required_info <- data.frame(
  player_name = character(0),
  player_state = character(0),
  total_pts = double(0),
  player_prerating = double(0),
  avg_prechess_rating_of_opponents = double(0)
)
Loop through odd row numbers of tournament_df and fill
required_info as needed:
# Each player occupies two consecutive rows of tournament_df: the odd row
# holds the pair number, name, total points and round results; the even row
# below it holds the state and the rating text.
# seq(..., length.out = ) returns an empty index for an empty table, whereas
# seq(from = 1, to = 0, by = 2) would stop with "wrong sign in 'by'".
odd_rows <- seq(
  from = 1, by = 2, length.out = ceiling(nrow(tournament_df) / 2)
)

# Captures the pre-tournament rating that follows "R:".
# eg. string <- "15445895 / R: 1794 ->1817"
# the regex pattern would match 1794
# string2 <- "15495066 / R: 1563P22->1562"
# the regex pattern would match 1563
pattern <- "\\bR:\\s*(\\d+)(?:\\D|$)"

# Parse every player's pair number and pre-tournament rating once, up front,
# so opponents can be looked up without re-scanning the table per player.
pair_nums <- trimws(tournament_df$pair_num[odd_rows])
rating_text <- trimws(tournament_df$player_name[odd_rows + 1])
pre_ratings <- as.numeric(str_match(rating_text, pattern)[, 2])

# Columns 4 onwards hold the per-round results (e.g. "W  39").
round_cols <- 4:ncol(tournament_df)

# Rounded mean pre-tournament rating of the opponents faced by the player on
# row `i`. Opponents whose rating cannot be resolved are left out of the mean;
# a player with no resolvable opponent (e.g. only byes) gets NA instead of
# triggering an error.
avg_opponent_rating <- function(i) {
  rounds <- unlist(tournament_df[i, round_cols], use.names = FALSE)
  opponent_ids <- unlist(str_extract_all(rounds, "\\d+"))
  opponent_ratings <- pre_ratings[match(opponent_ids, pair_nums)]
  opponent_ratings <- opponent_ratings[!is.na(opponent_ratings)]
  if (length(opponent_ratings) == 0L) {
    return(NA_real_)
  }
  round(mean(opponent_ratings))
}

# Assemble the result in one step instead of growing it with rbind() inside a
# loop, which copies the whole data frame on every iteration.
required_info <- data.frame(
  player_name = trimws(tournament_df$player_name[odd_rows]),
  player_state = trimws(tournament_df$pair_num[odd_rows + 1]),
  total_pts = as.numeric(trimws(tournament_df$total_pts[odd_rows])),
  player_prerating = pre_ratings,
  avg_prechess_rating_of_opponents = vapply(
    odd_rows, avg_opponent_rating, numeric(1)
  )
)
Take a look at the first 10 rows of required_info:
# Preview the ten leading rows of the assembled table.
required_info |> head(n = 10)
## player_name player_state total_pts player_prerating
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## 7 GARY DEE SWATHELL MI 5.0 1649
## 8 EZEKIEL HOUGHTON MI 5.0 1641
## 9 STEFANO LEE ON 5.0 1411
## 10 ANVIT RAO MI 5.0 1365
## avg_prechess_rating_of_opponents
## 1 1605
## 2 1469
## 3 1564
## 4 1574
## 5 1501
## 6 1519
## 7 1372
## 8 1468
## 9 1523
## 10 1554
Generate a csv file:
# Export the table to "required_info.csv", leaving out the row-name column.
required_info |>
  write.csv(file = "required_info.csv", row.names = FALSE)