Input:

a text file with chess tournament results where the information has some structure.

Output:

an R Markdown file that generates a .CSV file (that could, for example, be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents. For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605. The value 1605 was calculated by summing the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, and 1716, and dividing by the total number of games played.

library(stringr)
# Read the raw tournament report into a character vector, one element per line.
all_data <- readLines("tournamentinfo.txt")
## Warning in readLines("tournamentinfo.txt"): incomplete final line found on
## 'tournamentinfo.txt'
# Preview the first lines to see the layout of the report.
head(x = all_data)
## [1] "-----------------------------------------------------------------------------------------" 
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [4] "-----------------------------------------------------------------------------------------" 
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
# Report how many lines were read and their storage type.
summary(object = all_data)
##    Length     Class      Mode 
##       196 character character

We first pre-process the data by removing the separator lines, the header lines, and the extra whitespace.

# The report opens with a row of dashes; the same row separates the players.
separator_line <- all_data[1]
print(separator_line)
## [1] "-----------------------------------------------------------------------------------------"
# Positions of every line matching the dashed separator row.
separator_rows <- grep(separator_line, all_data)
# Keep everything except the separator rows.
without_separators <- all_data[-separator_rows]

print(without_separators[1:2])
## [1] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [2] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
# The first two remaining lines are column headers, not player data.
player_lines <- without_separators[-(1:2)]

# Collapse each run of whitespace to a single space, then drop the leading one.
all_data3 <- gsub("^\\s+", "", gsub("\\s+", " ", player_lines))
head(all_data3)
## [1] "1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"       
## [2] "ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |" 
## [3] "2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [4] "MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |" 
## [5] "3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"  
## [6] "MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"

In the input file, the information about each player is spread over two lines. In the section below, we combine them into a single line per player.

# Each player spans two consecutive lines: the odd-numbered line carries the
# pair number, name, points and round results; the even-numbered line carries
# the state and ratings. Paste each pair of lines into one string.
i <- seq(1, length(all_data3) - 1, 2)
j <- i + 1
newdf <- as.data.frame(paste(all_data3[i], all_data3[j]))
head(newdf)
##                                                                                           paste(all_data3[i], all_data3[j])
## 1           1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4| ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## 2    2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7| MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## 3      3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12| MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |
## 4 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1| MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |
## 5        5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17| MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |
## 6      6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21| OH | 15055204 / R: 1686 ->1687 |N:2 |W |B |W |B |B |W |B |
names(newdf) <- "y"
asvector <- as.vector(newdf$y)
# In the section below, we split every combined line on the "|" delimiter.
i <- seq_along(asvector)
fields <- strsplit(asvector, "\\|")
# Later steps address columns V1..V20 by position, so fail loudly if any line
# does not split into exactly 20 fields rather than letting matrix() recycle
# or misalign the values.
stopifnot(lengths(fields) == 20)
# One row per player; the row count follows the data instead of being fixed.
df <- as.data.frame(
  matrix(unlist(fields), byrow = TRUE, ncol = 20, nrow = length(asvector)),
  stringsAsFactors = FALSE
)

In the following portion of code, we extract each player's pre-rating.

# Column V12 before processing: "<USCF id> / R: <pre> -><post>".
head(df$V12)
## [1] " 15445895 / R: 1794 ->1817 " " 14598900 / R: 1553 ->1663 "
## [3] " 14959604 / R: 1384 ->1640 " " 12616049 / R: 1716 ->1744 "
## [5] " 14601533 / R: 1655 ->1690 " " 15055204 / R: 1686 ->1687 "
i <- seq_along(asvector)
# Remove all whitespace so the pre-rating sits directly after the colon.
compact_rating <- gsub("^\\s+|\\s+", "", df[i, "V12"])
# Grab ":<3-4 digits>" together with any trailing "P" characters.
pre_rating_tag <- str_extract(compact_rating, ":\\d{3,4}P*")
# Strip the ":" and "P" characters, leaving only the digits (still as text).
df[i, "V12"] <- str_replace_all(pre_rating_tag, ":|P", "")
# Column V12 afterwards.
head(df$V12)
## [1] "1794" "1553" "1384" "1716" "1655" "1686"

In the following section of code, we compute the average pre-rating of each player's opponents.

col <- c("V4","V5","V6","V7","V8","V9","V10") # round-result columns, e.g. "W 39"
# Pre-ratings as numbers; an opponent's number is used as a row index into this
# vector, so the lookup relies on rows being ordered by pair number.
pre_ratings <- as.numeric(df[, 12])
# Keep only the digits of each round cell: "W 39" becomes 39, while cells with
# no opponent number become NA.
round_cells <- as.matrix(df[, col])
opponent_ids <- matrix(
  as.integer(gsub("\\D", "", round_cells)),
  nrow = nrow(round_cells)
)
# Always build a list with one vector of opponent ratings per player. apply()
# would return a matrix instead whenever every player had the same number of
# opponents, and the per-player mean below would then be computed per cell.
df1 <- lapply(seq_len(nrow(opponent_ids)), function(r) {
  played <- opponent_ids[r, ]
  pre_ratings[played[!is.na(played)]]
})

df2 <- vapply(df1, mean, numeric(1))
print(df2) #we need to round our results in order to comply with the given output format.
##  [1] 1605.286 1469.286 1563.571 1573.571 1500.857 1518.714 1372.143
##  [8] 1468.429 1523.143 1554.143 1467.571 1506.167 1497.857 1515.000
## [15] 1483.857 1385.800 1498.571 1480.000 1426.286 1410.857 1470.429
## [22] 1300.333 1213.857 1357.000 1363.286 1506.857 1221.667 1522.143
## [29] 1313.500 1144.143 1259.857 1378.714 1276.857 1375.286 1149.714
## [36] 1388.167 1384.800 1539.167 1429.571 1390.571 1248.500 1149.857
## [43] 1106.571 1327.000 1152.000 1357.714 1392.000 1355.800 1285.800
## [50] 1296.000 1356.143 1494.571 1345.333 1206.167 1406.000 1414.400
## [57] 1363.000 1391.000 1319.000 1330.200 1327.286 1186.000 1350.200
## [64] 1263.000
# Round the averages to whole numbers, the format expected in the output.
df3 <- round(x = df2, digits = 0)
print(df3)
##  [1] 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1506 1498 1515
## [15] 1484 1386 1499 1480 1426 1411 1470 1300 1214 1357 1363 1507 1222 1522
## [29] 1314 1144 1260 1379 1277 1375 1150 1388 1385 1539 1430 1391 1248 1150
## [43] 1107 1327 1152 1358 1392 1356 1286 1296 1356 1495 1345 1206 1406 1414
## [57] 1363 1391 1319 1330 1327 1186 1350 1263
df4 <- as.data.frame(df3)
# Assemble one row per player: name, state, total points, pre-rating and the
# average opponent pre-rating. Splitting on "|" left padding spaces around each
# value, so text fields are trimmed. Numeric fields are converted to numbers so
# they are not written to the CSV as quoted strings.
results <- data.frame(
  V1 = trimws(df$V2),
  V2 = trimws(df$V11),
  V3 = as.numeric(trimws(df$V3)),
  V4 = as.numeric(trimws(df$V12)),
  V5 = df4$df3,
  stringsAsFactors = FALSE
)
head(results)
##                      V1   V2   V3   V4   V5
## 1             GARY HUA   ON  6.0  1794 1605
## 2      DAKSHESH DARURI   MI  6.0  1553 1469
## 3         ADITYA BAJAJ   MI  6.0  1384 1564
## 4  PATRICK H SCHILLING   MI  5.5  1716 1574
## 5           HANSHI ZUO   MI  5.5  1655 1501
## 6          HANSEN SONG   OH  5.0  1686 1519
# Give the five result columns their descriptive headers.
thecolnames <- c(
  "Name", "State", "Number-Points", "Pre-Rating", "AVG-Opponent- Pre-rating"
)
results <- setNames(results, thecolnames)
head(results)
##                    Name State Number-Points Pre-Rating
## 1             GARY HUA    ON           6.0        1794
## 2      DAKSHESH DARURI    MI           6.0        1553
## 3         ADITYA BAJAJ    MI           6.0        1384
## 4  PATRICK H SCHILLING    MI           5.5        1716
## 5           HANSHI ZUO    MI           5.5        1655
## 6          HANSEN SONG    OH           5.0        1686
##   AVG-Opponent- Pre-rating
## 1                     1605
## 2                     1469
## 3                     1564
## 4                     1574
## 5                     1501
## 6                     1519
# Write the results to a CSV file. Row names are omitted so the file contains
# only the five requested columns, without an extra unnamed index column.
write.csv(results, file = "results_tournament.csv", row.names = FALSE)