Project 1

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Read the file and store the data

# Read the raw tournament table into a character vector, one element per line.
# readLines() warns that the final line has no trailing newline; the line is
# still read, so the input file is left unmodified.
file <- readLines(con = "tournamentinfo.txt")

## Warning in readLines("tournamentinfo.txt"): incomplete final line found on
## 'tournamentinfo.txt'

# readLines() warns about an incomplete final line; the given file is left
# unmodified. Because readLines() returns a plain character vector, nrow(file)
# is NULL -- the line count is length(file) (196 for the given file).

# The first player is on line 5 and each following player is three lines
# further down. The game results are on that same line.
stopifnot(length(file) >= 5)
player_rows <- file[seq(5, length(file), by = 3)]
# Expect 64 elements, one per chess player
player_rows

##  [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
##  [2] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
##  [3] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
##  [4] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
##  [5] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"
##  [6] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"
##  [7] "    7 | GARY DEE SWATHELL               |5.0  |W  57|W  46|W  13|W  11|L   1|W   9|L   2|"
##  [8] "    8 | EZEKIEL HOUGHTON                |5.0  |W   3|W  32|L  14|L   9|W  47|W  28|W  19|"
##  [9] "    9 | STEFANO LEE                     |5.0  |W  25|L  18|W  59|W   8|W  26|L   7|W  20|"
## [10] "   10 | ANVIT RAO                       |5.0  |D  16|L  19|W  55|W  31|D   6|W  25|W  18|"
## [11] "   11 | CAMERON WILLIAM MC LEMAN        |4.5  |D  38|W  56|W   6|L   7|L   3|W  34|W  26|"
## [12] "   12 | KENNETH J TACK                  |4.5  |W  42|W  33|D   5|W  38|H    |D   1|L   3|"
## [13] "   13 | TORRANCE HENRY JR               |4.5  |W  36|W  27|L   7|D   5|W  33|L   3|W  32|"
## [14] "   14 | BRADLEY SHAW                    |4.5  |W  54|W  44|W   8|L   1|D  27|L   5|W  31|"
## [15] "   15 | ZACHARY JAMES HOUGHTON          |4.5  |D  19|L  16|W  30|L  22|W  54|W  33|W  38|"
## [16] "   16 | MIKE NIKITIN                    |4.0  |D  10|W  15|H    |W  39|L   2|W  36|U    |"
## [17] "   17 | RONALD GRZEGORCZYK              |4.0  |W  48|W  41|L  26|L   2|W  23|W  22|L   5|"
## [18] "   18 | DAVID SUNDEEN                   |4.0  |W  47|W   9|L   1|W  32|L  19|W  38|L  10|"
## [19] "   19 | DIPANKAR ROY                    |4.0  |D  15|W  10|W  52|D  28|W  18|L   4|L   8|"
## [20] "   20 | JASON ZHENG                     |4.0  |L  40|W  49|W  23|W  41|W  28|L   2|L   9|"
## [21] "   21 | DINH DANG BUI                   |4.0  |W  43|L   1|W  47|L   3|W  40|W  39|L   6|"
## [22] "   22 | EUGENE L MCCLURE                |4.0  |W  64|D  52|L  28|W  15|H    |L  17|W  40|"
## [23] "   23 | ALAN BUI                        |4.0  |L   4|W  43|L  20|W  58|L  17|W  37|W  46|"
## [24] "   24 | MICHAEL R ALDRICH               |4.0  |L  28|L  47|W  43|L  25|W  60|W  44|W  39|"
## [25] "   25 | LOREN SCHWIEBERT                |3.5  |L   9|W  53|L   3|W  24|D  34|L  10|W  47|"
## [26] "   26 | MAX ZHU                         |3.5  |W  49|W  40|W  17|L   4|L   9|D  32|L  11|"
## [27] "   27 | GAURAV GIDWANI                  |3.5  |W  51|L  13|W  46|W  37|D  14|L   6|U    |"
## [28] "   28 | SOFIA ADINA STANESCU-BELLU      |3.5  |W  24|D   4|W  22|D  19|L  20|L   8|D  36|"
## [29] "   29 | CHIEDOZIE OKORIE                |3.5  |W  50|D   6|L  38|L  34|W  52|W  48|U    |"
## [30] "   30 | GEORGE AVERY JONES              |3.5  |L  52|D  64|L  15|W  55|L  31|W  61|W  50|"
## [31] "   31 | RISHI SHETTY                    |3.5  |L  58|D  55|W  64|L  10|W  30|W  50|L  14|"
## [32] "   32 | JOSHUA PHILIP MATHEWS           |3.5  |W  61|L   8|W  44|L  18|W  51|D  26|L  13|"
## [33] "   33 | JADE GE                         |3.5  |W  60|L  12|W  50|D  36|L  13|L  15|W  51|"
## [34] "   34 | MICHAEL JEFFERY THOMAS          |3.5  |L   6|W  60|L  37|W  29|D  25|L  11|W  52|"
## [35] "   35 | JOSHUA DAVID LEE                |3.5  |L  46|L  38|W  56|L   6|W  57|D  52|W  48|"
## [36] "   36 | SIDDHARTH JHA                   |3.5  |L  13|W  57|W  51|D  33|H    |L  16|D  28|"
## [37] "   37 | AMIYATOSH PWNANANDAM            |3.5  |B    |L   5|W  34|L  27|H    |L  23|W  61|"
## [38] "   38 | BRIAN LIU                       |3.0  |D  11|W  35|W  29|L  12|H    |L  18|L  15|"
## [39] "   39 | JOEL R HENDON                   |3.0  |L   1|W  54|W  40|L  16|W  44|L  21|L  24|"
## [40] "   40 | FOREST ZHANG                    |3.0  |W  20|L  26|L  39|W  59|L  21|W  56|L  22|"
## [41] "   41 | KYLE WILLIAM MURPHY             |3.0  |W  59|L  17|W  58|L  20|X    |U    |U    |"
## [42] "   42 | JARED GE                        |3.0  |L  12|L  50|L  57|D  60|D  61|W  64|W  56|"
## [43] "   43 | ROBERT GLEN VASEY               |3.0  |L  21|L  23|L  24|W  63|W  59|L  46|W  55|"
## [44] "   44 | JUSTIN D SCHILLING              |3.0  |B    |L  14|L  32|W  53|L  39|L  24|W  59|"
## [45] "   45 | DEREK YAN                       |3.0  |L   5|L  51|D  60|L  56|W  63|D  55|W  58|"
## [46] "   46 | JACOB ALEXANDER LAVALLEY        |3.0  |W  35|L   7|L  27|L  50|W  64|W  43|L  23|"
## [47] "   47 | ERIC WRIGHT                     |2.5  |L  18|W  24|L  21|W  61|L   8|D  51|L  25|"
## [48] "   48 | DANIEL KHAIN                    |2.5  |L  17|W  63|H    |D  52|H    |L  29|L  35|"
## [49] "   49 | MICHAEL J MARTIN                |2.5  |L  26|L  20|D  63|D  64|W  58|H    |U    |"
## [50] "   50 | SHIVAM JHA                      |2.5  |L  29|W  42|L  33|W  46|H    |L  31|L  30|"
## [51] "   51 | TEJAS AYYAGARI                  |2.5  |L  27|W  45|L  36|W  57|L  32|D  47|L  33|"
## [52] "   52 | ETHAN GUO                       |2.5  |W  30|D  22|L  19|D  48|L  29|D  35|L  34|"
## [53] "   53 | JOSE C YBARRA                   |2.0  |H    |L  25|H    |L  44|U    |W  57|U    |"
## [54] "   54 | LARRY HODGE                     |2.0  |L  14|L  39|L  61|B    |L  15|L  59|W  64|"
## [55] "   55 | ALEX KONG                       |2.0  |L  62|D  31|L  10|L  30|B    |D  45|L  43|"
## [56] "   56 | MARISA RICCI                    |2.0  |H    |L  11|L  35|W  45|H    |L  40|L  42|"
## [57] "   57 | MICHAEL LU                      |2.0  |L   7|L  36|W  42|L  51|L  35|L  53|B    |"
## [58] "   58 | VIRAJ MOHILE                    |2.0  |W  31|L   2|L  41|L  23|L  49|B    |L  45|"
## [59] "   59 | SEAN M MC CORMICK               |2.0  |L  41|B    |L   9|L  40|L  43|W  54|L  44|"
## [60] "   60 | JULIA SHEN                      |1.5  |L  33|L  34|D  45|D  42|L  24|H    |U    |"
## [61] "   61 | JEZZEL FARKAS                   |1.5  |L  32|L   3|W  54|L  47|D  42|L  30|L  37|"
## [62] "   62 | ASHWIN BALAJI                   |1.0  |W  55|U    |U    |U    |U    |U    |U    |"
## [63] "   63 | THOMAS JOSEPH HOSMER            |1.0  |L   2|L  48|D  49|L  43|L  45|H    |U    |"
## [64] "   64 | BEN LI                          |1.0  |L  22|D  30|L  31|D  49|L  46|L  42|L  54|"

# The line holding a player's state and ratings first appears on line 6 and
# then repeats every three lines. The upper bound is taken from the data
# rather than hard-coding the file's line count.
stopifnot(length(file) >= 6)
rating_rows <- file[seq(6, length(file), by = 3)]
# Expect 64 here as well, one rating line per player
length(rating_rows)

## [1] 64

The Regex Part

# Credit to ChatGPT for help with regex, I did the parts that did not involve regular expressions on my own

# Pull the player name out of the second "|"-delimited field of each row.
# The capture group takes everything between the first two pipes, so names
# containing characters other than capital letters and spaces (for example the
# hyphen in "SOFIA ADINA STANESCU-BELLU") are extracted too. The earlier
# [A-Z ]+ character class did not match that row, which left the entire line
# in the result instead of the name.
player_names <- trimws(
  sub("^\\s*\\d+\\s*\\|([^|]*)\\|.*$", "\\1", player_rows)
)
print(player_names)

##  [1] "GARY HUA"                                                                              
##  [2] "DAKSHESH DARURI"                                                                       
##  [3] "ADITYA BAJAJ"                                                                          
##  [4] "PATRICK H SCHILLING"                                                                   
##  [5] "HANSHI ZUO"                                                                            
##  [6] "HANSEN SONG"                                                                           
##  [7] "GARY DEE SWATHELL"                                                                     
##  [8] "EZEKIEL HOUGHTON"                                                                      
##  [9] "STEFANO LEE"                                                                           
## [10] "ANVIT RAO"                                                                             
## [11] "CAMERON WILLIAM MC LEMAN"                                                              
## [12] "KENNETH J TACK"                                                                        
## [13] "TORRANCE HENRY JR"                                                                     
## [14] "BRADLEY SHAW"                                                                          
## [15] "ZACHARY JAMES HOUGHTON"                                                                
## [16] "MIKE NIKITIN"                                                                          
## [17] "RONALD GRZEGORCZYK"                                                                    
## [18] "DAVID SUNDEEN"                                                                         
## [19] "DIPANKAR ROY"                                                                          
## [20] "JASON ZHENG"                                                                           
## [21] "DINH DANG BUI"                                                                         
## [22] "EUGENE L MCCLURE"                                                                      
## [23] "ALAN BUI"                                                                              
## [24] "MICHAEL R ALDRICH"                                                                     
## [25] "LOREN SCHWIEBERT"                                                                      
## [26] "MAX ZHU"                                                                               
## [27] "GAURAV GIDWANI"                                                                        
## [28] "28 | SOFIA ADINA STANESCU-BELLU      |3.5  |W  24|D   4|W  22|D  19|L  20|L   8|D  36|"
## [29] "CHIEDOZIE OKORIE"                                                                      
## [30] "GEORGE AVERY JONES"                                                                    
## [31] "RISHI SHETTY"                                                                          
## [32] "JOSHUA PHILIP MATHEWS"                                                                 
## [33] "JADE GE"                                                                               
## [34] "MICHAEL JEFFERY THOMAS"                                                                
## [35] "JOSHUA DAVID LEE"                                                                      
## [36] "SIDDHARTH JHA"                                                                         
## [37] "AMIYATOSH PWNANANDAM"                                                                  
## [38] "BRIAN LIU"                                                                             
## [39] "JOEL R HENDON"                                                                         
## [40] "FOREST ZHANG"                                                                          
## [41] "KYLE WILLIAM MURPHY"                                                                   
## [42] "JARED GE"                                                                              
## [43] "ROBERT GLEN VASEY"                                                                     
## [44] "JUSTIN D SCHILLING"                                                                    
## [45] "DEREK YAN"                                                                             
## [46] "JACOB ALEXANDER LAVALLEY"                                                              
## [47] "ERIC WRIGHT"                                                                           
## [48] "DANIEL KHAIN"                                                                          
## [49] "MICHAEL J MARTIN"                                                                      
## [50] "SHIVAM JHA"                                                                            
## [51] "TEJAS AYYAGARI"                                                                        
## [52] "ETHAN GUO"                                                                             
## [53] "JOSE C YBARRA"                                                                         
## [54] "LARRY HODGE"                                                                           
## [55] "ALEX KONG"                                                                             
## [56] "MARISA RICCI"                                                                          
## [57] "MICHAEL LU"                                                                            
## [58] "VIRAJ MOHILE"                                                                          
## [59] "SEAN M MC CORMICK"                                                                     
## [60] "JULIA SHEN"                                                                            
## [61] "JEZZEL FARKAS"                                                                         
## [62] "ASHWIN BALAJI"                                                                         
## [63] "THOMAS JOSEPH HOSMER"                                                                  
## [64] "BEN LI"

# Total points: the first "digits.digits" token on each player row,
# converted from text to a number
total_points <- player_rows %>%
  stringr::str_extract("\\d+\\.\\d+") %>%
  as.numeric()
print(total_points)

##  [1] 6.0 6.0 6.0 5.5 5.5 5.0 5.0 5.0 5.0 5.0 4.5 4.5 4.5 4.5 4.5 4.0 4.0 4.0 4.0
## [20] 4.0 4.0 4.0 4.0 4.0 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.0
## [39] 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.5 2.5 2.5 2.5 2.5 2.5 2.0 2.0 2.0 2.0 2.0
## [58] 2.0 2.0 1.5 1.5 1.0 1.0 1.0

# Sanity check: one total-points value should have been parsed per player row
length(total_points)

## [1] 64

# State code: the leading run of capital letters before the first "|" on each
# rating row; everything else on the line is discarded
state <- rating_rows %>%
  gsub("^(\\s*)([A-Z]+)\\s*\\|.*", "\\2", x = .) %>%
  trimws()
print(state)

##  [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI"
## [16] "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI" "MI" "ON"
## [31] "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [46] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [61] "ON" "MI" "MI" "MI"

# Sanity check: there should be 64 state entries, one for each player
length(state)

## [1] 64

# Pre-rating: take the first run of 3-4 digits that has a non-digit on both
# sides of it, strip those surrounding characters, and convert to integer
pre_rating <- rating_rows %>%
  str_extract("[^\\d]\\d{3,4}[^\\d]") %>%
  str_extract("\\d+") %>%
  as.integer()
pre_rating

##  [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610 1220
## [16] 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507 1602 1522
## [31] 1494 1441 1449 1399 1438 1355  980 1423 1436 1348 1403 1332 1283 1199 1242
## [46]  377 1362 1382 1291 1056 1011  935 1393 1270 1186 1153 1092  917  853  967
## [61]  955 1530 1175 1163

# Preallocate a list with one slot per player to hold that player's opponents
opponents <- vector(mode = "list", length = length(player_rows))

# Collect the opponent numbers found in one or more result lines.
#
# line: character vector of player rows.
# Returns a numeric vector with the number that follows every W, L or D result
# cell, in order of appearance. Cells without an opponent number do not match
# the pattern and are skipped; if nothing matches, the result is numeric(0).
extract_opponents <- function(line) {
  result_pattern <- "\\b(W|L|D) +(\\d+)\\b"
  # Every matched cell (letter, spaces, number) across all supplied lines
  played_cells <- unlist(regmatches(line, gregexpr(result_pattern, line)))
  # Keep only the second capture group, i.e. the opponent number
  as.numeric(sub(result_pattern, "\\2", played_cells))
}

# Extract the opponents for every player row. lapply() returns one list
# element per row in the original order and, unlike 1:length(player_rows),
# produces an empty list instead of iterating over c(1, 0) when there are no
# rows.
opponents <- lapply(player_rows, extract_opponents)

# This part is done by me because it doesn't involve regex

# Average pre-rating of a player's opponents, rounded to a whole number.
#
# players: numeric vector of opponent numbers, used as indices into `ratings`.
# ratings: vector of pre-ratings indexed by player number. Defaults to the
#          global `pre_rating`, so existing one-argument calls keep working.
# Returns a single number. When `players` is empty the result is NA; the
# earlier loop over 1:num_of_opponents iterated over c(1, 0) in that case and
# returned a zero-length value. An opponent number with no entry in `ratings`
# makes the result NA, as before.
calc_avg_rating <- function(players, ratings = pre_rating) {
  if (length(players) == 0) {
    return(NA_real_)
  }
  round(sum(ratings[players]) / length(players))
}

# One rounded average per player, in the same order as `opponents`.
# vapply() sizes the result from the data instead of assuming exactly 64
# players, avoids growing a vector element by element, and checks that every
# call returns exactly one number.
average_opponent_ratings <- vapply(opponents, calc_avg_rating, numeric(1))

Creating the Data Frame

# Assemble one row per player from the parsed vectors
chess_players <- data.frame(
  player_names = player_names,
  state = state,
  total_points = total_points,
  pre_rating = pre_rating,
  average_opponent_ratings = average_opponent_ratings
)

# Mean pre_rating across all players
print(mean(chess_players[["pre_rating"]]))

## [1] 1378.5

# Median pre_rating across all players
print(median(chess_players[["pre_rating"]]))

## [1] 1407

# Scatter plot of each player's pre_rating, grouped along the x axis by state
ggplot(chess_players, aes(x = state, y = pre_rating)) +
  geom_point()

# Preview the first six rows of the assembled data frame
head(chess_players, n = 6L)

##          player_names state total_points pre_rating average_opponent_ratings
## 1            GARY HUA    ON          6.0       1794                     1605
## 2     DAKSHESH DARURI    MI          6.0       1553                     1469
## 3        ADITYA BAJAJ    MI          6.0       1384                     1564
## 4 PATRICK H SCHILLING    MI          5.5       1716                     1574
## 5          HANSHI ZUO    MI          5.5       1655                     1501
## 6         HANSEN SONG    OH          5.0       1686                     1519

Renaming and Generating the CSV File

# Swap the working column names for the report headers, then export to CSV
# without the row-number column
chess_players <- setNames(
  chess_players,
  c("Player's Name", "Player's State", "Total Number of Points", "Player's Pre-Rating", "Average Pre Chess Rating Of Opponents")
)

write.csv(x = chess_players, file = "chess_stats.csv", row.names = FALSE)

Viewing the CSV File

# Read the exported file back in to check it.
# NOTE(review): read.csv() uses check.names = TRUE by default, so the headers
# containing spaces and apostrophes come back as syntactic names; pass
# check.names = FALSE if the original headers are needed here.
chess_data <- read.csv("chess_stats.csv")
# View() opens a data viewer, so only call it in an interactive session; this
# keeps the script runnable from Rscript or when knitting.
if (interactive()) {
  View(chess_data)
}

Project 1

Kelly Eng

2023-09-20