library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Read the file and store the data
# Read the raw tournament report into a character vector, one element per line.
file <- readLines("tournamentinfo.txt")
## Warning in readLines("tournamentinfo.txt"): incomplete final line found on
## 'tournamentinfo.txt'
# readLines() warns about an incomplete final line; the given file is left
# unmodified. readLines() returns a character vector, so its line count is
# length(file) (nrow() is NULL for a plain vector). Using length(file) instead
# of a hard-coded 196 keeps this working if the number of players changes.
# The first player is on line 5, and the remaining players are on every third
# line after that. The game results are on the same row.
player_rows <- file[seq(5, length(file), 3)]
# There should be 64 chess players
player_rows
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
## [7] " 7 | GARY DEE SWATHELL |5.0 |W 57|W 46|W 13|W 11|L 1|W 9|L 2|"
## [8] " 8 | EZEKIEL HOUGHTON |5.0 |W 3|W 32|L 14|L 9|W 47|W 28|W 19|"
## [9] " 9 | STEFANO LEE |5.0 |W 25|L 18|W 59|W 8|W 26|L 7|W 20|"
## [10] " 10 | ANVIT RAO |5.0 |D 16|L 19|W 55|W 31|D 6|W 25|W 18|"
## [11] " 11 | CAMERON WILLIAM MC LEMAN |4.5 |D 38|W 56|W 6|L 7|L 3|W 34|W 26|"
## [12] " 12 | KENNETH J TACK |4.5 |W 42|W 33|D 5|W 38|H |D 1|L 3|"
## [13] " 13 | TORRANCE HENRY JR |4.5 |W 36|W 27|L 7|D 5|W 33|L 3|W 32|"
## [14] " 14 | BRADLEY SHAW |4.5 |W 54|W 44|W 8|L 1|D 27|L 5|W 31|"
## [15] " 15 | ZACHARY JAMES HOUGHTON |4.5 |D 19|L 16|W 30|L 22|W 54|W 33|W 38|"
## [16] " 16 | MIKE NIKITIN |4.0 |D 10|W 15|H |W 39|L 2|W 36|U |"
## [17] " 17 | RONALD GRZEGORCZYK |4.0 |W 48|W 41|L 26|L 2|W 23|W 22|L 5|"
## [18] " 18 | DAVID SUNDEEN |4.0 |W 47|W 9|L 1|W 32|L 19|W 38|L 10|"
## [19] " 19 | DIPANKAR ROY |4.0 |D 15|W 10|W 52|D 28|W 18|L 4|L 8|"
## [20] " 20 | JASON ZHENG |4.0 |L 40|W 49|W 23|W 41|W 28|L 2|L 9|"
## [21] " 21 | DINH DANG BUI |4.0 |W 43|L 1|W 47|L 3|W 40|W 39|L 6|"
## [22] " 22 | EUGENE L MCCLURE |4.0 |W 64|D 52|L 28|W 15|H |L 17|W 40|"
## [23] " 23 | ALAN BUI |4.0 |L 4|W 43|L 20|W 58|L 17|W 37|W 46|"
## [24] " 24 | MICHAEL R ALDRICH |4.0 |L 28|L 47|W 43|L 25|W 60|W 44|W 39|"
## [25] " 25 | LOREN SCHWIEBERT |3.5 |L 9|W 53|L 3|W 24|D 34|L 10|W 47|"
## [26] " 26 | MAX ZHU |3.5 |W 49|W 40|W 17|L 4|L 9|D 32|L 11|"
## [27] " 27 | GAURAV GIDWANI |3.5 |W 51|L 13|W 46|W 37|D 14|L 6|U |"
## [28] " 28 | SOFIA ADINA STANESCU-BELLU |3.5 |W 24|D 4|W 22|D 19|L 20|L 8|D 36|"
## [29] " 29 | CHIEDOZIE OKORIE |3.5 |W 50|D 6|L 38|L 34|W 52|W 48|U |"
## [30] " 30 | GEORGE AVERY JONES |3.5 |L 52|D 64|L 15|W 55|L 31|W 61|W 50|"
## [31] " 31 | RISHI SHETTY |3.5 |L 58|D 55|W 64|L 10|W 30|W 50|L 14|"
## [32] " 32 | JOSHUA PHILIP MATHEWS |3.5 |W 61|L 8|W 44|L 18|W 51|D 26|L 13|"
## [33] " 33 | JADE GE |3.5 |W 60|L 12|W 50|D 36|L 13|L 15|W 51|"
## [34] " 34 | MICHAEL JEFFERY THOMAS |3.5 |L 6|W 60|L 37|W 29|D 25|L 11|W 52|"
## [35] " 35 | JOSHUA DAVID LEE |3.5 |L 46|L 38|W 56|L 6|W 57|D 52|W 48|"
## [36] " 36 | SIDDHARTH JHA |3.5 |L 13|W 57|W 51|D 33|H |L 16|D 28|"
## [37] " 37 | AMIYATOSH PWNANANDAM |3.5 |B |L 5|W 34|L 27|H |L 23|W 61|"
## [38] " 38 | BRIAN LIU |3.0 |D 11|W 35|W 29|L 12|H |L 18|L 15|"
## [39] " 39 | JOEL R HENDON |3.0 |L 1|W 54|W 40|L 16|W 44|L 21|L 24|"
## [40] " 40 | FOREST ZHANG |3.0 |W 20|L 26|L 39|W 59|L 21|W 56|L 22|"
## [41] " 41 | KYLE WILLIAM MURPHY |3.0 |W 59|L 17|W 58|L 20|X |U |U |"
## [42] " 42 | JARED GE |3.0 |L 12|L 50|L 57|D 60|D 61|W 64|W 56|"
## [43] " 43 | ROBERT GLEN VASEY |3.0 |L 21|L 23|L 24|W 63|W 59|L 46|W 55|"
## [44] " 44 | JUSTIN D SCHILLING |3.0 |B |L 14|L 32|W 53|L 39|L 24|W 59|"
## [45] " 45 | DEREK YAN |3.0 |L 5|L 51|D 60|L 56|W 63|D 55|W 58|"
## [46] " 46 | JACOB ALEXANDER LAVALLEY |3.0 |W 35|L 7|L 27|L 50|W 64|W 43|L 23|"
## [47] " 47 | ERIC WRIGHT |2.5 |L 18|W 24|L 21|W 61|L 8|D 51|L 25|"
## [48] " 48 | DANIEL KHAIN |2.5 |L 17|W 63|H |D 52|H |L 29|L 35|"
## [49] " 49 | MICHAEL J MARTIN |2.5 |L 26|L 20|D 63|D 64|W 58|H |U |"
## [50] " 50 | SHIVAM JHA |2.5 |L 29|W 42|L 33|W 46|H |L 31|L 30|"
## [51] " 51 | TEJAS AYYAGARI |2.5 |L 27|W 45|L 36|W 57|L 32|D 47|L 33|"
## [52] " 52 | ETHAN GUO |2.5 |W 30|D 22|L 19|D 48|L 29|D 35|L 34|"
## [53] " 53 | JOSE C YBARRA |2.0 |H |L 25|H |L 44|U |W 57|U |"
## [54] " 54 | LARRY HODGE |2.0 |L 14|L 39|L 61|B |L 15|L 59|W 64|"
## [55] " 55 | ALEX KONG |2.0 |L 62|D 31|L 10|L 30|B |D 45|L 43|"
## [56] " 56 | MARISA RICCI |2.0 |H |L 11|L 35|W 45|H |L 40|L 42|"
## [57] " 57 | MICHAEL LU |2.0 |L 7|L 36|W 42|L 51|L 35|L 53|B |"
## [58] " 58 | VIRAJ MOHILE |2.0 |W 31|L 2|L 41|L 23|L 49|B |L 45|"
## [59] " 59 | SEAN M MC CORMICK |2.0 |L 41|B |L 9|L 40|L 43|W 54|L 44|"
## [60] " 60 | JULIA SHEN |1.5 |L 33|L 34|D 45|D 42|L 24|H |U |"
## [61] " 61 | JEZZEL FARKAS |1.5 |L 32|L 3|W 54|L 47|D 42|L 30|L 37|"
## [62] " 62 | ASHWIN BALAJI |1.0 |W 55|U |U |U |U |U |U |"
## [63] " 63 | THOMAS JOSEPH HOSMER |1.0 |L 2|L 48|D 49|L 43|L 45|H |U |"
## [64] " 64 | BEN LI |1.0 |L 22|D 30|L 31|D 49|L 46|L 42|L 54|"
# The first row holding a player's state is line 6, and these rows repeat
# every third line. length(file) replaces the hard-coded line count of 196.
rating_rows <- file[seq(6, length(file), 3)]
# There should be one rating row per player (64)
length(rating_rows)
## [1] 64
The Regex Part
# Credit to ChatGPT for help with regex, I did the parts that did not involve regular expressions on my own
# The name is the second "|"-delimited field. Capture everything up to the next
# "|" rather than only [A-Z ], so names containing other characters (for
# example the hyphen in "STANESCU-BELLU") are extracted instead of the whole
# row being returned unchanged.
player_names <- trimws(sub("^\\s*\\d+\\s*\\|([^|]*)\\|.*$", "\\1", player_rows))
print(player_names)
## [1] "GARY HUA"
## [2] "DAKSHESH DARURI"
## [3] "ADITYA BAJAJ"
## [4] "PATRICK H SCHILLING"
## [5] "HANSHI ZUO"
## [6] "HANSEN SONG"
## [7] "GARY DEE SWATHELL"
## [8] "EZEKIEL HOUGHTON"
## [9] "STEFANO LEE"
## [10] "ANVIT RAO"
## [11] "CAMERON WILLIAM MC LEMAN"
## [12] "KENNETH J TACK"
## [13] "TORRANCE HENRY JR"
## [14] "BRADLEY SHAW"
## [15] "ZACHARY JAMES HOUGHTON"
## [16] "MIKE NIKITIN"
## [17] "RONALD GRZEGORCZYK"
## [18] "DAVID SUNDEEN"
## [19] "DIPANKAR ROY"
## [20] "JASON ZHENG"
## [21] "DINH DANG BUI"
## [22] "EUGENE L MCCLURE"
## [23] "ALAN BUI"
## [24] "MICHAEL R ALDRICH"
## [25] "LOREN SCHWIEBERT"
## [26] "MAX ZHU"
## [27] "GAURAV GIDWANI"
## [28] "28 | SOFIA ADINA STANESCU-BELLU |3.5 |W 24|D 4|W 22|D 19|L 20|L 8|D 36|"
## [29] "CHIEDOZIE OKORIE"
## [30] "GEORGE AVERY JONES"
## [31] "RISHI SHETTY"
## [32] "JOSHUA PHILIP MATHEWS"
## [33] "JADE GE"
## [34] "MICHAEL JEFFERY THOMAS"
## [35] "JOSHUA DAVID LEE"
## [36] "SIDDHARTH JHA"
## [37] "AMIYATOSH PWNANANDAM"
## [38] "BRIAN LIU"
## [39] "JOEL R HENDON"
## [40] "FOREST ZHANG"
## [41] "KYLE WILLIAM MURPHY"
## [42] "JARED GE"
## [43] "ROBERT GLEN VASEY"
## [44] "JUSTIN D SCHILLING"
## [45] "DEREK YAN"
## [46] "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT"
## [48] "DANIEL KHAIN"
## [49] "MICHAEL J MARTIN"
## [50] "SHIVAM JHA"
## [51] "TEJAS AYYAGARI"
## [52] "ETHAN GUO"
## [53] "JOSE C YBARRA"
## [54] "LARRY HODGE"
## [55] "ALEX KONG"
## [56] "MARISA RICCI"
## [57] "MICHAEL LU"
## [58] "VIRAJ MOHILE"
## [59] "SEAN M MC CORMICK"
## [60] "JULIA SHEN"
## [61] "JEZZEL FARKAS"
## [62] "ASHWIN BALAJI"
## [63] "THOMAS JOSEPH HOSMER"
## [64] "BEN LI"
# Pull the first decimal number on each player row and convert it to numeric;
# this is the player's total points
total_points <- player_rows %>%
  stringr::str_extract("\\d+\\.\\d+") %>%
  as.numeric()
print(total_points)
## [1] 6.0 6.0 6.0 5.5 5.5 5.0 5.0 5.0 5.0 5.0 4.5 4.5 4.5 4.5 4.5 4.0 4.0 4.0 4.0
## [20] 4.0 4.0 4.0 4.0 4.0 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.0
## [39] 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.5 2.5 2.5 2.5 2.5 2.5 2.0 2.0 2.0 2.0 2.0
## [58] 2.0 2.0 1.5 1.5 1.0 1.0 1.0
# Confirm that one total was extracted per player row
length(total_points)
## [1] 64
# The state code is the leading run of capital letters before the first "|"
state <- rating_rows %>%
  gsub(pattern = "^(\\s*)([A-Z]+)\\s*\\|.*", replacement = "\\2") %>%
  trimws()
print(state)
## [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI"
## [16] "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI" "MI" "ON"
## [31] "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [46] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [61] "ON" "MI" "MI" "MI"
# There should be 64 states, one for each player, so the length should be 64
length(state)
## [1] 64
# First find a 3- or 4-digit number with a non-digit on each side, then keep
# only its digits and convert them to an integer
pre_rating <- rating_rows %>%
  str_extract("[^\\d]\\d{3,4}[^\\d]") %>%
  str_extract("\\d+") %>%
  as.integer()
pre_rating
## [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610 1220
## [16] 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507 1602 1522
## [31] 1494 1441 1449 1399 1438 1355 980 1423 1436 1348 1403 1332 1283 1199 1242
## [46] 377 1362 1382 1291 1056 1011 935 1393 1270 1186 1153 1092 917 853 967
## [61] 955 1530 1175 1163
# Pre-allocate a list with one slot per player row to hold that player's opponents
opponents <- vector("list", length(player_rows))
# Return the opponent numbers found in the given player row(s): every
# W/L/D result that is followed by a number, in order of appearance
extract_opponents <- function(line) {
  opponent_pattern <- "\\b(W|L|D) +(\\d+)\\b"
  # All "W 39"-style tokens across the input, flattened into one vector
  result_tokens <- unlist(regmatches(line, gregexpr(opponent_pattern, line)))
  # Keep only the number from each token
  as.numeric(sub(opponent_pattern, "\\2", result_tokens))
}
# Extract the opponents for every player row. lapply() returns one list
# element per row and, unlike a 1:length(player_rows) loop, does nothing
# when there are no rows.
opponents <- lapply(player_rows, extract_opponents)
# This part is done by me because it doesn't involve regex
# Average pre-rating of a player's opponents, rounded to a whole number.
#   players: numeric vector of opponent numbers, used as indices into `ratings`
#   ratings: vector of pre-ratings indexed by player number; defaults to the
#            global `pre_rating` so existing one-argument calls still work
# Returns NA when `players` is empty (no games played) instead of a
# zero-length result that cannot be stored in a single vector element.
calc_avg_rating <- function(players, ratings = pre_rating) {
  num_of_opponents <- length(players)
  if (num_of_opponents == 0) {
    return(NA_real_)
  }
  round(sum(ratings[players]) / num_of_opponents)
}
# One average per player. vapply() sizes the result from `opponents` rather
# than a hard-coded 64 and checks that every average is a single number.
average_opponent_ratings <- vapply(opponents, calc_avg_rating, numeric(1))
Creating the Data Frame
chess_players <- data.frame(
  player_names = player_names,
  state = state,
  total_points = total_points,
  pre_rating = pre_rating,
  average_opponent_ratings = average_opponent_ratings
)
# Mean pre-rating across all players
print(mean(chess_players[["pre_rating"]]))
## [1] 1378.5
# Median pre-rating across all players
print(median(chess_players[["pre_rating"]]))
## [1] 1407
# Scatter plot of each player's pre-rating, grouped by state
ggplot(chess_players, aes(x = state, y = pre_rating)) +
  geom_point()
head(chess_players)
## player_names state total_points pre_rating average_opponent_ratings
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519
Renaming and Generating the CSV File
# Give the columns descriptive headers, then export without row names
chess_players <- setNames(
  chess_players,
  c("Player's Name", "Player's State", "Total Number of Points", "Player's Pre-Rating", "Average Pre Chess Rating Of Opponents")
)
write.csv(chess_players, file = "chess_stats.csv", row.names = FALSE)
Viewing the CSV File
# check.names = FALSE keeps the headers exactly as written; by default
# read.csv() rewrites names such as "Player's Name" into syntactic R names.
chess_data <- read.csv("chess_stats.csv", check.names = FALSE)
# View() opens an interactive data viewer, so only call it in an interactive
# session; otherwise print the first rows.
if (interactive()) {
  View(chess_data)
} else {
  print(head(chess_data))
}