library(stringr)
library(knitr)
# ---- Load the raw tournament cross table ----
# The file is fixed-width text laid out with "|" and "-" characters, not a
# delimited table, so it is read line by line. readLines() returns a plain
# character vector and performs no header, quote or separator parsing.
tournament <- readLines(
  "https://raw.githubusercontent.com/hrensimin05/Data_607/master/tournamentinfo.txt"
)
# Ignore empty lines so that only real content is counted below.
tournament <- tournament[nzchar(tournament)]
head(tournament)
# The file opens with four header lines: a dashed rule, the two column-title
# lines ("Pair | Player Name |Total|Round|..." and
# "Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | ...") and a second dashed
# rule. Player data starts on the fifth line.
#removing the four header lines
tournament <- tournament[-seq_len(4)]
head(tournament)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [5] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [6] "-----------------------------------------------------------------------------------------"
#removing unnecessary rows
# After the header, the file repeats a three-line pattern for every player:
#   1. pair number | player name | total points | one cell per round
#   2. state | USCF ID / R: pre-rating -> post-rating | ...
#   3. a rule made up solely of dashes
# Only the dashed rules are discarded. Hyphens inside the data lines are left
# untouched so that hyphenated names and the "->" arrow keep their original
# text.
tournament_lines <- as.character(tournament)
is_separator <- str_detect(tournament_lines, "^-*$")
tournament_new <- data.frame(
  line = tournament_lines[!is_separator],
  stringsAsFactors = FALSE
)
head(tournament_new)

#Extracting data file
# The remaining lines alternate strictly: odd positions hold the name line,
# even positions hold the rating line of the same player.
name_rows <- tournament_new$line[c(TRUE, FALSE)]
rating_rows <- tournament_new$line[c(FALSE, TRUE)]
stopifnot(length(name_rows) == length(rating_rows))

# Pre-tournament rating: 3-4 digits preceded by whitespace (but not by "> ",
# which would be the post-rating) and followed by whitespace or a "P" suffix.
pre <- "(?<!\\>\\s)(?<=\\s{1,2}|\\s\\:)(\\d{3,4}(?=\\s|P))"
# Pair number: the digits directly in front of " |".
id_data <- "\\d+(?=\\s\\|)"
# Player name: two or more upper-case words; hyphens and apostrophes are
# accepted inside a word.
name_data <- "([A-Z'-]+\\s){2,}"
# State: two upper-case letters directly in front of " |".
state_data <- "([A-Z]){2}\\s(?=\\|)"
# Total points: one digit, a dot, one digit (e.g. "6.0").
points_data <- "\\d\\.\\d"

# Each field is taken from the line type that carries it, one value per
# player, so the five vectors are aligned by construction. The name and state
# patterns end in a whitespace character, which is trimmed from the value.
pre_points <- as.integer(str_extract(rating_rows, pre))
player_id <- as.integer(str_extract(name_rows, id_data))
players_names <- str_trim(str_extract(name_rows, name_data))
players_states <- str_trim(str_extract(rating_rows, state_data))
total_points <- as.numeric(str_extract(name_rows, points_data))

# str_extract() yields NA where a pattern does not match; stop here rather
# than carry an incomplete record into the results.
stopifnot(
  !anyNA(player_id),
  !anyNA(players_names),
  !anyNA(players_states),
  !anyNA(total_points),
  !anyNA(pre_points)
)

#creating a new data frame with headers
tournament_df <- data.frame(
  player_id,
  players_names,
  players_states,
  total_points,
  pre_points,
  stringsAsFactors = FALSE
)
#extracting opponents IDs
# Every player has one line that starts with the pair number and lists one
# cell per round, e.g. "|W 39|": a result letter followed by the opponent's
# pair number. Those lines are selected directly so that each player keeps
# exactly one entry, including a player without any counted game.
result_rows <- as.character(tournament)
result_rows <- result_rows[str_detect(result_rows, "^\\s*\\d+\\s*\\|")]
# The entries must line up one-to-one with the rows of tournament_df;
# otherwise the averages below would be attributed to the wrong players.
stopifnot(length(result_rows) == nrow(tournament_df))

# Only cells whose result letter is W, D or L are used; the captured group is
# the opponent's pair number.
round_cells <- str_match_all(result_rows, "\\|[WDL]\\s+(\\d+)")
# df is a list with one integer vector of opponent pair numbers per player,
# e.g. df[[1]] is c(39, 21, 18, 14, 7, 12, 4) for player 1.
df <- lapply(round_cells, function(cells) as.integer(cells[, 2]))
head(df)

# Mean pre-tournament rating of the opponents each player faced. Opponents
# are looked up by player_id rather than by row position, and a player with
# no counted game gets NA instead of an error or a shifted value.
average <- vapply(
  df,
  function(opponents) {
    if (length(opponents) == 0) {
      return(NA_real_)
    }
    opponent_rows <- match(opponents, tournament_df[["player_id"]])
    mean(tournament_df[["pre_points"]][opponent_rows])
  },
  numeric(1)
)
tournament_df$avg_rating <- average
head(tournament_df)
## player_id players_names players_states total_points pre_points
## 1 1 GARY HUA ON 6.0 1794
## 2 2 DAKSHESH DARURI MI 6.0 1553
## 3 3 ADITYA BAJAJ MI 6.0 1384
## 4 4 PATRICK H SCHILLING MI 5.5 1716
## 5 5 HANSHI ZUO MI 5.5 1655
## 6 6 HANSEN SONG OH 5.0 1686
## avg_rating
## 1 1605.286
## 2 1469.286
## 3 1563.571
## 4 1573.571
## 5 1500.857
## 6 1518.714
# Save the finished table to disk; row names are omitted so the file holds
# only the data columns.
write.csv(
  x = tournament_df,
  file = "Chess_tournamentWithAverage",
  row.names = FALSE
)