In this project, a text file with chess tournament results is given. The information in the file has some structure.
tournamentinfo.txt - File structure
This RMarkdown doc shows how to generate a .CSV file with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents. For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605
1605 was calculated by summing the pre-tournament ratings of the player’s opponents (1436, 1563, 1600, 1610, 1649, 1663, 1716) and dividing by the total number of games played (7), then rounding to the nearest whole number.
library(stringr)
# Read the raw tournament report, one character string per line of the file.
# `warn = FALSE` silences the warning readLines() gives for a missing final
# end-of-line; it is spelled out because `F` is an ordinary variable that can
# be reassigned, whereas `FALSE` is a reserved word.
txt <- readLines("tournamentinfo.txt", warn = FALSE)
head(txt)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
# Inspect the last lines of the file as well.
tail(txt)
## [1] " 63 | THOMAS JOSEPH HOSMER |1.0 |L 2|L 48|D 49|L 43|L 45|H |U |"
## [2] " MI | 15057092 / R: 1175 ->1125 | |W |B |W |B |B | | |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 64 | BEN LI |1.0 |L 22|D 30|L 31|D 49|L 46|L 42|L 54|"
## [5] " MI | 15006561 / R: 1163 ->1112 | |B |W |W |B |W |B |B |"
## [6] "-----------------------------------------------------------------------------------------"
# Player names sit on the rows that begin with the pair number, e.g.
# "1 | GARY HUA |". Step 1 grabs the "<number> | <name> |" fragment, step 2
# keeps only the run of letters, hyphens and spaces inside it, and step 3
# strips the padding around the name.
name_cells <- unlist(
  str_extract_all(txt, "\\d+ \\| [[:alpha:]- ]{2,}\\s+\\|")
)
name_runs <- unlist(str_extract_all(name_cells, "[[:alpha:]- ]{2,}"))
names <- str_trim(name_runs)
names[27:28]
## [1] "GAURAV GIDWANI" "SOFIA ADINA STANESCU-BELLU"
# Pre-tournament ratings follow the "R:" marker; the capture group isolates
# the digits so they can be pulled out with a backreference afterwards.
pattern <- "R:\\s+(\\d+)"
pre_rates <- unlist(str_extract_all(txt, pattern))
head(pre_rates)
## [1] "R: 1794" "R: 1553" "R: 1384" "R: 1716" "R: 1655" "R: 1686"
# Replace each "R: <digits>" match by its captured digits, then convert.
pre_rates <- as.integer(gsub(pattern, "\\1", pre_rates))
pre_rates
## [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610
## [15] 1220 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507
## [29] 1602 1522 1494 1441 1449 1399 1438 1355 980 1423 1436 1348 1403 1332
## [43] 1283 1199 1242 377 1362 1382 1291 1056 1011 935 1393 1270 1186 1153
## [57] 1092 917 853 967 955 1530 1175 1163
# The state code is the first field of each player's second row, e.g.
# " ON | 15445895 / R: 1794 ->1817 |...". Anchoring the pattern to the start
# of the line keeps a two-letter upper-case token elsewhere in a row (such as
# a short surname directly before a "|") from being picked up as a state,
# which would misalign `states` with the other per-player vectors.
states <- unlist(str_extract_all(txt, "^\\s*[[:upper:]]{2}\\s*\\|"))
# Drop the surrounding padding and the trailing "|".
states <- unlist(str_extract_all(states, "[[:upper:]]{2}"))
states
## [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"
# Pattern for the score part of a player's first row: the total points
# followed by seven "<result letter> <opponent number>|" round cells.
# Just the first 9 backreferences are included: group 1 is the total and
# groups 2-9 are the result/opponent pairs of rounds 1-4; rounds 5-7 are
# matched without being captured.
# The pattern is assembled from one sub-pattern per kind of round cell
# instead of spelling the same cell out seven times in a single literal.
round_captured <- "([[:upper:]])\\s+(\\d{0,2}?)\\|"
round_plain <- "[[:upper:]]\\s+\\d{0,2}?\\|"
pattern1 <- paste0(
  "(\\d\\.\\d)\\s+\\|",
  strrep(round_captured, 4),
  strrep(round_plain, 3)
)
# Pull the "<points> |<round 1>|...|<round 7>|" fragment out of every row
# that contains one.
scores1 <- unlist(str_extract_all(txt, pattern1))
scores1[1:5]
## [1] "6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] "6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] "6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] "5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] "5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
# Group 1 of pattern1 is the total number of points.
pts <- as.numeric(gsub(pattern1, "\\1", scores1))
pts[1:5]
## [1] 6.0 6.0 6.0 5.5 5.5
# Rounds 1-4. In pattern1 the even-numbered groups (2, 4, 6, 8) hold the
# result letter and the following odd-numbered groups (3, 5, 7, 9) hold the
# opponent's pair number. A round cell without a number leaves an empty
# string, which as.integer() converts to NA.

# Round 1
r1_status <- gsub(pattern1, "\\2", scores1)
r1_status[1:5]
## [1] "W" "W" "L" "W" "W"
r1_opponent <- as.integer(gsub(pattern1, "\\3", scores1))
r1_opponent[1:5]
## [1] 39 63 8 23 45

# Round 2
r2_status <- gsub(pattern1, "\\4", scores1)
r2_opponent <- as.integer(gsub(pattern1, "\\5", scores1))
tail(r2_opponent)
## [1] NA 34 3 NA 48 30

# Round 3
r3_status <- gsub(pattern1, "\\6", scores1)
r3_opponent <- as.integer(gsub(pattern1, "\\7", scores1))

# Round 4
r4_status <- gsub(pattern1, "\\8", scores1)
r4_opponent <- as.integer(gsub(pattern1, "\\9", scores1))
Because we can only use backreferences \1 through \9 in the replacement text, a new pattern is used to include backreferences for rounds 5 to 7 in each string.
# Same row fragment as before, but with the capture groups moved to the end
# of the string: the total and rounds 1-4 are matched without capturing, and
# groups 1-6 are the result/opponent pairs of rounds 5-7.
# The pattern is assembled from one sub-pattern per kind of round cell
# instead of spelling the same cell out seven times in a single literal.
round_plain <- "[[:upper:]]\\s+\\d{0,2}?\\|"
round_captured <- "([[:upper:]])\\s+(\\d{0,2}?)\\|"
pattern2 <- paste0(
  "\\d\\.\\d\\s+\\|",
  strrep(round_plain, 4),
  strrep(round_captured, 3)
)
# Extract the same row fragments again, this time with pattern2 so that the
# last three round cells are the captured ones.
scores2 <- unlist(str_extract_all(txt, pattern2))

# Round 5: groups 1 (result) and 2 (opponent number).
r5_status <- gsub(pattern2, "\\1", scores2)
r5_opponent <- as.integer(gsub(pattern2, "\\2", scores2))
head(r5_status)
## [1] "W" "W" "W" "D" "D" "D"

# Round 6: groups 3 and 4.
r6_status <- gsub(pattern2, "\\3", scores2)
r6_opponent <- as.integer(gsub(pattern2, "\\4", scores2))

# Round 7: groups 5 and 6. Cells without an opponent number become NA.
r7_status <- gsub(pattern2, "\\5", scores2)
r7_opponent <- as.integer(gsub(pattern2, "\\6", scores2))
tail(r7_opponent)
## [1] 44 NA 37 NA NA 54
# Assemble one row per player. Columns 4-10 hold the opponents' pair numbers
# for rounds 1-7 (NA where there is none) and column 11 holds the pre-rating;
# avrg() indexes the columns by these positions, so keep the order.
# data.frame() turns the name "Pre rating" into the syntactic `Pre.rating`.
# `FALSE` is spelled out because `F` is a reassignable variable.
df <- data.frame(
  "Name" = names,
  "State" = states,
  "Total.Pts" = pts,
  "R1.Player" = r1_opponent,
  "R2.Player" = r2_opponent,
  "R3.Player" = r3_opponent,
  "R4.Player" = r4_opponent,
  "R5.Player" = r5_opponent,
  "R6.Player" = r6_opponent,
  "R7.Player" = r7_opponent,
  "Pre rating" = pre_rates,
  stringsAsFactors = FALSE
)
head(df)
## Name State Total.Pts R1.Player R2.Player R3.Player
## 1 GARY HUA ON 6.0 39 21 18
## 2 DAKSHESH DARURI MI 6.0 63 58 4
## 3 ADITYA BAJAJ MI 6.0 8 61 25
## 4 PATRICK H SCHILLING MI 5.5 23 28 2
## 5 HANSHI ZUO MI 5.5 45 37 12
## 6 HANSEN SONG OH 5.0 34 29 11
## R4.Player R5.Player R6.Player R7.Player Pre.rating
## 1 14 7 12 4 1794
## 2 17 16 20 7 1553
## 3 21 11 13 12 1384
## 4 26 5 19 1 1716
## 5 13 4 14 17 1655
## 6 35 10 27 21 1686
# Check the type of every column: names and states are character, points are
# numeric, opponent numbers and pre-ratings are integer.
sapply(df, class)
## Name State Total.Pts R1.Player R2.Player R3.Player
## "character" "character" "numeric" "integer" "integer" "integer"
## R4.Player R5.Player R6.Player R7.Player Pre.rating
## "integer" "integer" "integer" "integer" "integer"
# Check the number of rows (players) and columns.
dim(df)
## [1] 64 11
The following function collects the non-NA opponent numbers for each player and computes the average of those opponents' pre-ratings.
#' Average pre-tournament rating of each player's opponents
#'
#' For every row of `df`, takes the non-NA opponent numbers in that row, looks
#' up the pre-ratings of those opponents and returns their mean rounded to
#' the nearest whole number.
#'
#' @param df Data frame with one row per player. Opponent numbers are used
#'   as row indices into the rating column, so row `i` must describe the
#'   player whose pair number is `i`.
#' @param opponent_cols Columns of `df` holding the opponents' pair numbers
#'   (positions or names). Defaults to columns 4 to 10.
#' @param rating_col Column of `df` holding each player's pre-rating
#'   (position or name). Defaults to column 11.
#' @return Numeric vector with one element per row of `df`. A row with no
#'   opponents yields `NaN`.
avrg <- function(df, opponent_cols = 4:10, rating_col = 11) {
  ratings <- df[[rating_col]]
  # seq_len(nrow(df)) instead of a fixed 1:64 so that the result always has
  # exactly one element per player, whatever the size of the tournament.
  vapply(
    seq_len(nrow(df)),
    function(i) {
      played <- unlist(df[i, opponent_cols, drop = FALSE], use.names = FALSE)
      opponents <- as.integer(played[!is.na(played)])
      round(mean(ratings[opponents]), 0)
    },
    numeric(1)
  )
}
Now applying the function above to the data frame:
# Compute every player's average opponent pre-rating, wrap the result in a
# one-column data frame named "Average" and append it to `df`.
averages <- avrg(df)
ExtraCol <- data.frame(Average = averages)
df <- cbind(df, ExtraCol)
# Spot-check a few players, including row 12, which has an NA opponent.
df[c(1:4, 12), ]
## Name State Total.Pts R1.Player R2.Player R3.Player
## 1 GARY HUA ON 6.0 39 21 18
## 2 DAKSHESH DARURI MI 6.0 63 58 4
## 3 ADITYA BAJAJ MI 6.0 8 61 25
## 4 PATRICK H SCHILLING MI 5.5 23 28 2
## 12 KENNETH J TACK MI 4.5 42 33 5
## R4.Player R5.Player R6.Player R7.Player Pre.rating Average
## 1 14 7 12 4 1794 1605
## 2 17 16 20 7 1553 1469
## 3 21 11 13 12 1384 1564
## 4 26 5 19 1 1716 1574
## 12 38 NA 1 3 1663 1506
# Export only the five fields the report is meant to contain (name, state,
# total points, pre-rating, average opponent pre-rating); the per-round
# opponent columns are working data. row.names = FALSE keeps write.csv() from
# adding an unnamed leading column of row numbers.
report_cols <- c("Name", "State", "Total.Pts", "Pre.rating", "Average")
write.csv(df[, report_cols], "ChessTournment.csv", row.names = FALSE)