Reading data

library(stringr)
library(knitr)
# Download the raw cross table. read.delim() treats the first line of the file
# as the header, so the result is a one-column data frame of text lines.
tournament <- read.delim(
  file = "https://raw.githubusercontent.com/hrensimin05/Data_607/master/tournamentinfo.txt",
  header = TRUE
)
# Preview the first six rows.
head(tournament, n = 6L)
##   X.........................................................................................
## 1  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 2  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 3  -----------------------------------------------------------------------------------------
## 4      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 5     ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 6  -----------------------------------------------------------------------------------------
# Drop the first three rows (the two column-title lines and the dashed rule).
# Row-subsetting a one-column data frame returns a plain vector, so from here
# on `tournament` is a vector of text lines.
tournament <- tournament[-seq_len(3), ]
head(tournament)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [5] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "-----------------------------------------------------------------------------------------"
# Removing unnecessary rows, step 1: delete every "-" character from each line.
# The dashed separator lines become empty strings, which the second step drops.
# NOTE(review): this strips hyphens everywhere, not only in separator lines --
# the "->" between pre- and post-rating is reduced to ">", and a hyphen inside
# a player name would be removed as well.
# data.frame() names its single column after the deparsed call, so the
# expression is intentionally left exactly as written.
tournament_new <- data.frame(str_replace_all(tournament,"-",""))
head(tournament_new)
##                                                        str_replace_all.tournament..........
## 1     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 2     ON | 15445895 / R: 1794   >1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 3                                                                                          
## 4     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## 5     MI | 14598900 / R: 1553   >1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## 6
# Removing unnecessary rows, step 2: drop the rows that are empty strings.
# `tournament_new == ""` is a one-column logical matrix; apply(..., 1, all)
# flags the rows whose only cell is empty and the negation keeps the others.
# Row-subsetting the one-column data frame yields a plain vector, which
# data.frame() wraps again into a single column named after the deparsed call.
tournament_new <- data.frame(tournament_new[!apply(tournament_new == "", 1, all),])
head(tournament_new)
##                                      tournament_new..apply.tournament_new........1..all....
## 1     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 2     ON | 15445895 / R: 1794   >1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 3     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## 4     MI | 14598900 / R: 1553   >1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## 5     3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|
## 6     MI | 14959604 / R: 1384   >1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |

Extracting data

# Extracting the per-player fields

# Pre-tournament rating: 3-4 digits that are preceded by whitespace (or by
# whitespace and a colon), are not preceded by ">" plus whitespace (that would
# be a post-rating), and are followed by whitespace or a "P".
pre <- "(?<!\\>\\s)(?<=\\s{1,2}|\\s\\:)(\\d{3,4}(?=\\s|P))"
# Pair number: 1-2 digits directly before " |".
id_data <- "\\d{1,2}(?=\\s\\|)"
# Player name: the text between the first two "|" of a player row.
name_data <- "(?<=\\|)[^|]+(?=\\|)"
# State: two capital letters plus one whitespace character directly before "|".
state_data <- "([A-Z]){2}\\s(?=\\|)"
# Total points: digit, dot, digit.
points_data <- "\\d\\.\\d"

# Flatten the cleaned one-column data frame once instead of once per field.
clean_lines <- unlist(tournament_new)

# Names are read from the raw lines in `tournament`, not from the cleaned
# text: the cleaning step deletes every "-", which would corrupt hyphenated
# names. Player rows are the lines that start with a pair number and a "|".
player_rows <- tournament[str_detect(tournament, "^\\s*\\d+\\s*\\|")]

pre_points <- as.integer(unlist(str_extract_all(clean_lines, pre)))
player_id <- as.integer(unlist(str_extract_all(clean_lines, id_data)))
# str_trim() removes the column padding so the values are clean in the CSV.
players_names <- str_trim(str_extract(player_rows, name_data))
players_states <- str_trim(unlist(str_extract_all(clean_lines, state_data)))
total_points <- as.numeric(unlist(str_extract_all(clean_lines, points_data)))

# data.frame() silently recycles vectors whose lengths divide evenly, so a
# pattern that missed (or double-matched) some rows could misalign the columns
# without any error. Fail loudly instead.
field_lengths <- lengths(list(
  player_id, players_names, players_states, total_points, pre_points
))
if (length(unique(field_lengths)) != 1L) {
  stop(
    "Extracted fields differ in length: ",
    paste(field_lengths, collapse = ", "),
    call. = FALSE
  )
}

# Assemble the player table; column names are taken from the variable names.
tournament_df <- data.frame(
  player_id, players_names, players_states, total_points, pre_points
)

# Extracting opponent IDs
# Only player rows (a pair number followed by "|") carry round results, so
# select them explicitly. This keeps exactly one entry per player, in table
# order, even for a player with no played games (that entry is "").
player_rows <- tournament[str_detect(tournament, "^\\s*\\d+\\s*\\|")]

# A played game is a result letter (W, D or L), padding, and the opponent's
# 1-2 digit pair number; other result codes have no opponent and are skipped.
played_games <- str_extract_all(player_rows, "[WDL]\\s+\\d{1,2}")

# Collapse each player's opponents into one space-separated string, e.g.
# "39 21 18". Working on the list elements directly avoids passing a list to
# stringr, which only works through a coercion warning.
df <- vapply(
  played_games,
  function(games) paste(str_extract(games, "\\d+"), collapse = " "),
  character(1)
)
head(df)
## [1] "c(\"39\", \"21\", \"18\", \"14\", \"7\", \"12\", \"4\")"  
## [2] "c(\"63\", \"58\", \"4\", \"17\", \"16\", \"20\", \"7\")"  
## [3] "c(\"8\", \"61\", \"25\", \"21\", \"11\", \"13\", \"12\")" 
## [4] "c(\"23\", \"28\", \"2\", \"26\", \"5\", \"19\", \"1\")"   
## [5] "c(\"45\", \"37\", \"12\", \"13\", \"4\", \"14\", \"17\")" 
## [6] "c(\"34\", \"29\", \"11\", \"35\", \"10\", \"27\", \"21\")"

Calculating the average pre-tournament rating of each player's opponents

# Average pre-tournament rating of each player's opponents.
# One entry of `df` per row of `tournament_df`; seq_len() keeps this correct
# for an empty table, where 1:nrow() would iterate over c(1, 0).
opponent_ids <- str_extract_all(df[seq_len(nrow(tournament_df))], "\\d{1,2}")

average <- vapply(
  opponent_ids,
  function(ids) {
    # Look opponents up by player_id rather than assuming that row number and
    # pair number coincide. Unknown ids give NA, and no opponents gives NaN.
    opponent_rows <- match(as.integer(ids), tournament_df$player_id)
    mean(tournament_df$pre_points[opponent_rows])
  },
  numeric(1)
)
tournament_df$avg_rating <- average

head(tournament_df)
##   player_id        players_names players_states total_points pre_points
## 1         1            GARY HUA             ON           6.0       1794
## 2         2     DAKSHESH DARURI             MI           6.0       1553
## 3         3        ADITYA BAJAJ             MI           6.0       1384
## 4         4 PATRICK H SCHILLING             MI           5.5       1716
## 5         5          HANSHI ZUO             MI           5.5       1655
## 6         6         HANSEN SONG             OH           5.0       1686
##   avg_rating
## 1   1605.286
## 2   1469.286
## 3   1563.571
## 4   1573.571
## 5   1500.857
## 6   1518.714

Writing a new file

# Save the player table as comma-separated text, without a row-name column.
# The target is a relative path with no file extension.
write.csv(
  x = tournament_df,
  file = "Chess_tournamentWithAverage",
  row.names = FALSE
)