library(stringr)
library(ggplot2)
# Path of the raw tournament cross-table (plain text, one record per two lines).
player_names <- "tournamentinfo.txt"
# Read the file as one string per line. `warn = FALSE` silences the
# "incomplete final line" warning that readLines() raises when the file has no
# trailing newline; the last line is still returned.
data <- readLines(player_names, warn = FALSE)
# Preview the first 22 lines. head() returns fewer elements for a short file
# instead of padding with NA the way `data[1:22]` does.
head(data, 22)
##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------" 
## [11] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|" 
## [12] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
## [13] "-----------------------------------------------------------------------------------------" 
## [14] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|" 
## [15] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |" 
## [16] "-----------------------------------------------------------------------------------------" 
## [17] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|" 
## [18] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [19] "-----------------------------------------------------------------------------------------" 
## [20] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|" 
## [21] "   OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |" 
## [22] "-----------------------------------------------------------------------------------------"
# Flag the dashed separator rows so they can be removed.
line_vector <- str_detect(data, "\\-----")
chess_data <- data[!line_vector]
# Drop the two column-header rows. Negative indexing is used because
# `3:length(chess_data)` counts backwards (c(3, 2), ...) when fewer than three
# rows remain, which would return NA / wrong rows instead of an empty vector.
chess_data <- chess_data[-c(1, 2)]
# Split every remaining row on "|" into its fields: the result is a list with
# one character vector of fields per row.
chess_data <- str_split(chess_data, "\\|")
# Extract player names and state codes from the parsed rows.
# `chess_data` is flattened to a plain character vector of fields first:
# handing the list itself to str_extract_all() makes stringi coerce each row to
# a deparsed `c("...", "...")` string and emit an "argument is not an atomic
# vector" warning. Matching field by field yields the same tokens in the same
# order without depending on that coercion.
alpha_fields <- as.character(unlist(chess_data))
# Runs of two or more letters / spaces / "-" / "?" (covers names and state codes).
ext_data <- str_extract_all(alpha_fields, "[[:alpha:]-?[:alpha:]  ?]{2,}")
alpha_tokens <- unlist(ext_data)
# A token containing at least three consecutive letters is treated as a name.
names_loc <- str_detect(alpha_tokens, "[[:alpha:]]{3,}")
names <- alpha_tokens[names_loc]
head(names)
## [1] " GARY HUA                        " " DAKSHESH DARURI                 "
## [3] " ADITYA BAJAJ                    " " PATRICK H SCHILLING             "
## [5] " HANSHI ZUO                      " " HANSEN SONG                     "
# A token with two consecutive letters that is not a name is treated as a
# state code.
state_loc <- str_detect(alpha_tokens, "[[:alpha:]]{2}")
state <- alpha_tokens[state_loc & !names_loc]
head(state)
## [1] "   ON " "   MI " "   MI " "   MI " "   MI " "   OH "
# Extract each player's total points.
# The rows are flattened to a character vector of fields before matching:
# passing the list to str_extract_all() would coerce every row to a deparsed
# `c("...")` string and raise an "argument is not an atomic vector" warning.
numeric_fields <- as.character(unlist(chess_data))
# Every run of digits, optionally followed by "." and one more character.
num_data <- str_extract_all(numeric_fields, "\\d{1,}+\\.?.?")
numeric_tokens <- unlist(num_data)
# Only the total-points tokens have the digit.digit shape (e.g. "6.0").
pt_loc <- str_detect(numeric_tokens, "\\d\\.\\d")
Points <- numeric_tokens[pt_loc]
head(Points)
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0"
# Extract pre- and post-tournament ratings.
# The rows are flattened to a character vector of fields before matching, so
# str_extract_all() is not handed a list (which it would coerce to deparsed
# `c("...")` strings with an "argument is not an atomic vector" warning).
rating_fields <- as.character(unlist(chess_data))
# Digit runs, optionally prefixed by ">" (post-rating marker) or one other
# character, optionally followed by "P" and/or ".".
rtg_data <- str_extract_all(rating_fields, "(( \\:)|(\\>))?.?\\d{1,}P*\\.?")
rating_tokens <- unlist(rtg_data)
# Any stand-alone 3-4 digit number (optionally suffixed with "P") is a rating;
# those introduced by ">" are post-ratings, the rest are pre-ratings.
pre_loc <- str_detect(rating_tokens, "\\b\\d{3,4}P?\\b")
post_loc <- str_detect(rating_tokens, "\\>.?\\b\\d{3,4}P?\\b")
pre_rating <- rating_tokens[pre_loc & !post_loc]
# Strip the "P" suffix so that only the digits (and padding) remain.
pre_rating <- str_replace_all(pre_rating, "P", "")
post_rating <- rating_tokens[post_loc]
# Strip the ">" marker and the "P" suffix.
post_rating <- str_replace_all(post_rating, "([>P])", "")
head(pre_rating)
## [1] " 1794" " 1553" " 1384" " 1716" " 1655" " 1686"
head(post_rating)
## [1] "1817" "1663" "1640" "1744" "1690" "1687"
# Collect, for every player row, the pair numbers of the opponents played.
# Each row is processed field by field instead of passing lists to stringr:
# the list route coerces every row to a deparsed `c("...")` string (with a
# warning each time) and only drops non-player rows because the text
# "character(0)" happens to contain a "0".
opponents_by_row <- lapply(chess_data, function(row_fields) {
  # Game cells look like "W  39": a W/D/L result, three characters, then
  # one or two digits.
  game_cells <- unlist(str_extract_all(row_fields, "[WDL]...\\d{1,2}"))
  # Keep only the opponent number from each game cell.
  unlist(str_extract_all(game_cells, "\\.?\\d{1,2}"))
})
# Rows without any W/D/L game cell (the state / rating rows) are dropped.
played_rows <- lengths(opponents_by_row) > 0
# One space-separated string of opponent numbers per remaining row.
game_data <- vapply(
  opponents_by_row[played_rows],
  paste,
  character(1),
  collapse = " "
)
head(game_data)
## [1] "39 21 18 14 7 12 4"   "63 58 4 17 16 20 7"   "8 61 25 21 11 13 12" 
## [4] "23 28 2 26 5 19 1"    "45 37 12 13 4 14 17"  "34 29 11 35 10 27 21"
# Assemble the per-player table from the extracted vectors.
Player_names <- str_trim(names, "both")
State <- str_trim(state, "both")
pre_rating <- str_trim(pre_rating, "both")
post_rating <- str_trim(post_rating, "both")
# One position per extracted player, rather than a hard-coded 64.
Position <- seq_along(Player_names)
# Fail loudly if the extraction steps disagree on the number of players;
# otherwise the columns could be recycled and silently misaligned.
stopifnot(
  length(State) == length(Player_names),
  length(Points) == length(Player_names),
  length(pre_rating) == length(Player_names),
  length(post_rating) == length(Player_names)
)
# Build the data frame directly so the columns are not forced through a
# character matrix (cbind) first; Position therefore stays an integer column.
playerranks <- data.frame(
  Position,
  Player_names,
  State,
  Points,
  pre_rating,
  post_rating,
  stringsAsFactors = FALSE
)
# Points and ratings were extracted as text; convert them to numeric.
# `Points` keeps the original text and `pts` holds its numeric value.
playerranks$pts <- as.numeric(as.character(playerranks$Points))
playerranks$pre_rating <- as.numeric(as.character(playerranks$pre_rating))
playerranks$post_rating <- as.numeric(as.character(playerranks$post_rating))
head(playerranks)
##   Position        Player_names State Points pre_rating post_rating pts
## 1        1            GARY HUA    ON    6.0       1794        1817 6.0
## 2        2     DAKSHESH DARURI    MI    6.0       1553        1663 6.0
## 3        3        ADITYA BAJAJ    MI    6.0       1384        1640 6.0
## 4        4 PATRICK H SCHILLING    MI    5.5       1716        1744 5.5
## 5        5          HANSHI ZUO    MI    5.5       1655        1690 5.5
## 6        6         HANSEN SONG    OH    5.0       1686        1687 5.0
# Calculate the mean pre-tournament rating of each player's opponents.
# The numbers found in game_data[i] are used as row indices into playerranks
# for the player in row i. seq_len() keeps this safe for an empty table, where
# `1:nrow(playerranks)` would iterate over c(1, 0).
opponent_rows <- lapply(
  str_extract_all(game_data[seq_len(nrow(playerranks))], "\\d{1,2}"),
  as.integer
)
# One mean per player; an out-of-range or missing opponent index yields NA,
# and a player with no opponents yields NaN.
result <- vapply(
  opponent_rows,
  function(rows) mean(playerranks[["pre_rating"]][rows]),
  numeric(1)
)

playerranks$avg_rating <- result
head(playerranks)
##   Position        Player_names State Points pre_rating post_rating pts
## 1        1            GARY HUA    ON    6.0       1794        1817 6.0
## 2        2     DAKSHESH DARURI    MI    6.0       1553        1663 6.0
## 3        3        ADITYA BAJAJ    MI    6.0       1384        1640 6.0
## 4        4 PATRICK H SCHILLING    MI    5.5       1716        1744 5.5
## 5        5          HANSHI ZUO    MI    5.5       1655        1690 5.5
## 6        6         HANSEN SONG    OH    5.0       1686        1687 5.0
##   avg_rating
## 1   1605.286
## 2   1469.286
## 3   1563.571
## 4   1573.571
## 5   1500.857
## 6   1518.714
# Scatter plot of total points against the average opponent rating.
ggplot(playerranks, aes(x = avg_rating, y = pts)) +
  geom_point(size = 3) +
  labs(
    x = "Average Opponent Rating",
    y = "Total Points",
    title = "Chess Players and Opponents"
  )

# Save the finished table as CSV text, without a row-name column.
write.csv(playerranks, file = "playerrank_clean", row.names = FALSE)