Import libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)

Load the data

# Read the raw tournament cross-table directly from GitHub. `read_lines()`
# returns a character vector with one element per line of the file.
tournament_url <- "https://raw.githubusercontent.com/jewelercart/R/main/tournamentinfo.txt"
text_data <- read_lines(file = tournament_url)
head(text_data, n = 6)
## [1] "-----------------------------------------------------------------------------------------" 
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [4] "-----------------------------------------------------------------------------------------" 
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

Preprocessing the data

Getting names of all the players

# A player row looks like
#   "    1 | GARY HUA                        |6.0  |W  39|..."
# i.e. a pair number, a pipe, the padded player name, another pipe, and then
# the remaining cells. The single capture group is the name without its
# surrounding padding.
pattern <- "^\\s*\\d+\\s+\\|\\s+(.+?)\\s+\\|.*$"

# Pick out every player row in one vectorised pass rather than growing the
# result vectors element by element inside a loop.
player_rows <- text_data[grepl(pattern, text_data)]

# Column 2 of the match matrix holds the first capture group (the name).
player_names <- str_match(player_rows, pattern)[, 2]

# The total score is the first "<digits>.<digit>" token on the row, e.g. "6.0".
total_point <- as.numeric(
  str_extract(player_rows, "[[:digit:]]+\\.[[:digit:]]")
)

# Print the extracted player names
print("Players are : ")
## [1] "Players are : "
print(player_names)
##  [1] "GARY HUA"                   "DAKSHESH DARURI"           
##  [3] "ADITYA BAJAJ"               "PATRICK H SCHILLING"       
##  [5] "HANSHI ZUO"                 "HANSEN SONG"               
##  [7] "GARY DEE SWATHELL"          "EZEKIEL HOUGHTON"          
##  [9] "STEFANO LEE"                "ANVIT RAO"                 
## [11] "CAMERON WILLIAM MC LEMAN"   "KENNETH J TACK"            
## [13] "TORRANCE HENRY JR"          "BRADLEY SHAW"              
## [15] "ZACHARY JAMES HOUGHTON"     "MIKE NIKITIN"              
## [17] "RONALD GRZEGORCZYK"         "DAVID SUNDEEN"             
## [19] "DIPANKAR ROY"               "JASON ZHENG"               
## [21] "DINH DANG BUI"              "EUGENE L MCCLURE"          
## [23] "ALAN BUI"                   "MICHAEL R ALDRICH"         
## [25] "LOREN SCHWIEBERT"           "MAX ZHU"                   
## [27] "GAURAV GIDWANI"             "SOFIA ADINA STANESCU-BELLU"
## [29] "CHIEDOZIE OKORIE"           "GEORGE AVERY JONES"        
## [31] "RISHI SHETTY"               "JOSHUA PHILIP MATHEWS"     
## [33] "JADE GE"                    "MICHAEL JEFFERY THOMAS"    
## [35] "JOSHUA DAVID LEE"           "SIDDHARTH JHA"             
## [37] "AMIYATOSH PWNANANDAM"       "BRIAN LIU"                 
## [39] "JOEL R HENDON"              "FOREST ZHANG"              
## [41] "KYLE WILLIAM MURPHY"        "JARED GE"                  
## [43] "ROBERT GLEN VASEY"          "JUSTIN D SCHILLING"        
## [45] "DEREK YAN"                  "JACOB ALEXANDER LAVALLEY"  
## [47] "ERIC WRIGHT"                "DANIEL KHAIN"              
## [49] "MICHAEL J MARTIN"           "SHIVAM JHA"                
## [51] "TEJAS AYYAGARI"             "ETHAN GUO"                 
## [53] "JOSE C YBARRA"              "LARRY HODGE"               
## [55] "ALEX KONG"                  "MARISA RICCI"              
## [57] "MICHAEL LU"                 "VIRAJ MOHILE"              
## [59] "SEAN M MC CORMICK"          "JULIA SHEN"                
## [61] "JEZZEL FARKAS"              "ASHWIN BALAJI"             
## [63] "THOMAS JOSEPH HOSMER"       "BEN LI"
print("Total points are: ")
## [1] "Total points are: "
print(total_point)
##  [1] 6.0 6.0 6.0 5.5 5.5 5.0 5.0 5.0 5.0 5.0 4.5 4.5 4.5 4.5 4.5 4.0 4.0 4.0 4.0
## [20] 4.0 4.0 4.0 4.0 4.0 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.0
## [39] 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.5 2.5 2.5 2.5 2.5 2.5 2.0 2.0 2.0 2.0 2.0
## [58] 2.0 2.0 1.5 1.5 1.0 1.0 1.0
# The second row of each player record starts with a two-letter state or
# province code followed by a pipe, e.g.
#   "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |..."
# Anchoring the pattern to the start of the row means a stray "ON"/"MI"/"OH"
# elsewhere in a row cannot be mistaken for a state, and no list of states has
# to be hard-coded.
state_pattern <- "^\\s*([[:upper:]]{2})\\s*\\|"

# First, keep only the rows that carry a player's state.
states_data <- grep(state_pattern, text_data, value = TRUE)

# Then extract the state code from all of those rows at once; column 2 of the
# match matrix is the capture group.
player_states <- str_match(states_data, state_pattern)[, 2]
print(player_states)
##  [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI"
## [16] "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI" "MI" "ON"
## [31] "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [46] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [61] "ON" "MI" "MI" "MI"

We can also extract part of a string without using a loop, as follows:

# The pre-tournament rating follows "R:" on each state row, e.g.
# "R: 1794   ->1817". Capture only the run of digits right after the label
# (allowing any amount of padding), so whatever follows the number is ignored.
# `str_match()` returns exactly one row per input line, with NA where the
# label is missing, so the result stays aligned with `states_data`.
rating <- str_match(states_data, "R:\\s*([[:digit:]]+)")[, 2]
pre_rating <- as.numeric(rating)
print(pre_rating)
##  [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610 1220
## [16] 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507 1602 1522
## [31] 1494 1441 1449 1399 1438 1355  980 1423 1436 1348 1403 1332 1283 1199 1242
## [46]  377 1362 1382 1291 1056 1011  935 1393 1270 1186 1153 1092  917  853  967
## [61]  955 1530 1175 1163
# Drop the four header rows and any empty rows. What remains is a repeating
# group of three rows per player: results row, state row, dashed separator.
# NOTE: this overwrites `text_data`, so this chunk must not be re-run without
# reloading the file first.
text_data <- text_data[-seq_len(4)]
text_data <- text_data[nzchar(text_data)]

# The first row of every group of three is the one listing the round results.
text_data_od <- text_data[seq(1, length(text_data), by = 3)]

# A played round is written as a result letter (W, L or D) followed by the
# opponent's pair number, e.g. "|W  39". Matching that shape explicitly, rather
# than taking every digit run on the row and discarding the first three by
# position, keeps the pair number, the total points and any digits in a name
# out of the result, and yields an empty vector for a player with no games.
opponent_player <- str_match_all(
  text_data_od,
  "\\|[WLD]\\s+([[:digit:]]+)"
)

# One character vector of opponent pair numbers per player (column 2 of each
# match matrix is the capture group). Values are kept as strings here and are
# converted to numbers where they are used.
opp_numeric <- lapply(opponent_player, function(round_match) round_match[, 2])

print(head(opp_numeric))
## [[1]]
## [1] "39" "21" "18" "14" "7"  "12" "4" 
## 
## [[2]]
## [1] "63" "58" "4"  "17" "16" "20" "7" 
## 
## [[3]]
## [1] "8"  "61" "25" "21" "11" "13" "12"
## 
## [[4]]
## [1] "23" "28" "2"  "26" "5"  "19" "1" 
## 
## [[5]]
## [1] "45" "37" "12" "13" "4"  "14" "17"
## 
## [[6]]
## [1] "34" "29" "11" "35" "10" "27" "21"
# Average pre-tournament rating of the opponents each player actually faced.
#
# `opp_numeric` holds opponent *pair numbers*, so they must be translated into
# ratings before averaging; averaging the pair numbers themselves is
# meaningless. The lookup assumes `pre_rating` is ordered by pair number, i.e.
# that players appear in the file in pair-number order (TODO confirm if the
# input format ever changes).
opponent_avg_rating <- vapply(
  opp_numeric,
  function(opponent_ids) {
    opponent_ids <- as.integer(opponent_ids)
    # A player with no played games has no opponents to average over.
    if (length(opponent_ids) == 0) {
      return(NA_real_)
    }
    round(mean(pre_rating[opponent_ids]), 2)
  },
  numeric(1)
)
opponent_avg_rating
##  [1] 16.43 26.43 21.57 14.86 20.29 23.86 19.86 21.71 23.29 24.29 24.29 20.33
## [13] 20.43 24.29 30.29 20.40 23.86 22.29 19.29 27.43 25.57 36.00 32.14 40.86
## [25] 25.71 23.14 27.83 19.00 38.00 46.86 40.14 31.57 33.86 31.43 43.29 33.00
## [37] 30.00 20.00 28.57 34.71 38.50 51.43 41.57 36.83 49.71 35.57 29.71 39.20
## [49] 46.20 35.17 39.57 31.00 42.00 42.00 36.83 34.60 37.33 31.83 38.50 35.60
## [61] 35.00 55.00 37.40 39.14
# All five vectors must describe the same players in the same order. Check the
# lengths explicitly, because `data.frame()` silently recycles shorter vectors
# whenever the lengths happen to divide evenly.
stopifnot(
  length(player_states) == length(player_names),
  length(total_point) == length(player_names),
  length(pre_rating) == length(player_names),
  length(opponent_avg_rating) == length(player_names)
)

# `check.names = FALSE` keeps the human-readable headers (spaces, apostrophes)
# exactly as written instead of mangling them into syntactic names.
df <- data.frame(
  "Player's name" = player_names,
  "Player's state" = player_states,
  "Total number of points" = total_point,
  "Player's Pre-Rating" = pre_rating,
  "Opponent's Average Pre-Rating" = opponent_avg_rating,
  check.names = FALSE
)
head(df)

# Write only the five data columns; without `row.names = FALSE` the file gains
# an extra unnamed column of row numbers.
write.csv(df, "chess_rating.csv", row.names = FALSE)