We are given an assignment containing a text file with chess tournament results where the information has some structure. The goal is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database).
For each player we need to report the Player’s Name, Player’s State, Total Number of Points, and Player’s Pre-Rating, and to calculate the Average Pre Chess Rating of Opponents.
library(stringr)
library(ggplot2)
# Name of the raw tournament results file, read in one element per line.
fname <- "tournamentinfo.txt"
data <- readLines(fname)
# Preview the first ten raw lines.
data[1:10]
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
The output above shows the first ten lines of the raw text file that was given to us.
The data were cleaned using regular expressions, in particular through the stringr package in R.
The analysis consists of the steps listed below:
1. Removing the different separators
2. Extracting the names of the players
3. Extracting the state of origin
4. Extracting the total points
5. Extracting the pre- and post-tournament ratings
6. Calculating the average rating of the opponents played by each player
# Data cleaning: flag the dashed separator rows and keep everything else.
line_vector <- str_detect(data, "\\-----")
chess_data <- data[!line_vector]
# Skip the two header rows, then split every remaining row on "|" so that
# each row becomes a vector of its fields.
chess_data <- str_split(chess_data[3:length(chess_data)], "\\|")
# Extract player names.
# First collect every run of two or more letters, spaces, "-" or "?".
ext_data <- str_extract_all(chess_data, "[[:alpha:]-?[:alpha:] ?]{2,}")
alpha_tokens <- unlist(ext_data)
# A token counts as a name when it has at least three consecutive letters.
names_loc <- str_detect(alpha_tokens, "[[:alpha:]]{3,}")
names <- alpha_tokens[names_loc]
head(names)
## [1] " GARY HUA " " DAKSHESH DARURI "
## [3] " ADITYA BAJAJ " " PATRICK H SCHILLING "
## [5] " HANSHI ZUO " " HANSEN SONG "
# Extract the state of origin: tokens that contain two consecutive letters
# but were not already classified as player names.
state_tokens <- unlist(ext_data)
state_loc <- str_detect(state_tokens, "[[:alpha:]]{2}")
state <- state_tokens[state_loc & !names_loc]
head(state)
## [1] " ON " " MI " " MI " " MI " " MI " " OH "
# Extract total points.
# Collect every digit run together with an optional "." and one more character.
num_data <- str_extract_all(chess_data, "\\d{1,}+\\.?.?")
num_tokens <- unlist(num_data)
# Only the point totals have the digit.digit form (e.g. "6.0").
pt_loc <- str_detect(num_tokens, "\\d\\.\\d")
pts <- num_tokens[pt_loc]
head(pts)
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0"
# Extract pre- and post-tournament ratings.
# Candidate tokens are digit runs, optionally preceded by " :" or ">" and
# optionally followed by "P" and/or ".".
rtg_data <- str_extract_all(chess_data, "(( \\:)|(\\>))?.?\\d{1,}P*\\.?")
rtg_tokens <- unlist(rtg_data)
# Ratings are stand-alone 3-4 digit numbers (possibly suffixed with "P");
# the ones that follow ">" are post-ratings, the rest are pre-ratings.
pre_loc <- str_detect(rtg_tokens, "\\b\\d{3,4}P?\\b")
post_loc <- str_detect(rtg_tokens, "\\>.?\\b\\d{3,4}P?\\b")
# Strip the "P" marker (and the leading ">" for post-ratings).
pre_rating <- str_replace_all(rtg_tokens[pre_loc & !post_loc], "P", "")
post_rating <- str_replace_all(rtg_tokens[post_loc], "([>P])", "")
head(pre_rating)
## [1] " 1794" " 1553" " 1384" " 1716" " 1655" " 1686"
head(post_rating)
## [1] "1817" "1663" "1640" "1744" "1690" "1687"
# Extract, for every player, the numbers of the opponents they played.
# Step 1: keep only round results coded W, D or L, i.e. one of those letters
# followed by any three characters and a 1-2 digit number.
gm_data<-str_extract_all(chess_data,"[WDL]...\\d{1,2}")
# Step 2: pull the (optionally "."-prefixed) 1-2 digit numbers out of those
# matches. gm_data is a list here, so stringr appears to operate on its
# character representation (the printed result below shows strings of the
# form c("39", "21", ...)) -- NOTE(review): confirm this coercion on the
# stringr version in use.
gm_data<-str_extract_all(gm_data,"\\.?\\d{1,2}")
# Step 3: turn a stand-alone "0" into "." so it can serve as a drop marker.
# Presumably this targets rows that had no W/D/L match at all (e.g. the
# second line of each player record), whose empty result would render as
# "character(0)" -- TODO confirm.
gm_data<-str_replace_all(gm_data,"\\b[0]\\b",".")
# Step 4: drop every entry containing the "." marker; the loop that computes
# opponent averages later indexes the remaining entries by player row.
gm_zl<-str_detect(gm_data,fixed("."))
gm_data<-gm_data[!(gm_zl)]
head(gm_data)
## [1] "c(\"39\", \"21\", \"18\", \"14\", \"7\", \"12\", \"4\")"
## [2] "c(\"63\", \"58\", \"4\", \"17\", \"16\", \"20\", \"7\")"
## [3] "c(\"8\", \"61\", \"25\", \"21\", \"11\", \"13\", \"12\")"
## [4] "c(\"23\", \"28\", \"2\", \"26\", \"5\", \"19\", \"1\")"
## [5] "c(\"45\", \"37\", \"12\", \"13\", \"4\", \"14\", \"17\")"
## [6] "c(\"34\", \"29\", \"11\", \"35\", \"10\", \"27\", \"21\")"
The extracted data are then combined into a data frame:
# Trim the padding that the fixed-width layout leaves around each field.
names <- str_trim(names, "both")
state <- str_trim(state, "both")
pre_rating <- str_trim(pre_rating, "both")
post_rating <- str_trim(post_rating, "both")

# One id per extracted player, derived from the data rather than a
# hard-coded player count, so the script also works for other tournaments.
id <- seq_along(names)

# Every field must have exactly one value per player; fail loudly instead of
# letting mismatched vectors be recycled into misaligned rows.
stopifnot(
  length(state) == length(names),
  length(pts) == length(names),
  length(pre_rating) == length(names),
  length(post_rating) == length(names)
)

# Build the data frame directly with typed columns: ids as integers, text as
# character, and points/ratings as numeric (no matrix/factor round trip).
playerranks <- data.frame(
  id = id,
  names = names,
  state = state,
  pts = as.numeric(pts),
  pre_rating = as.numeric(pre_rating),
  post_rating = as.numeric(post_rating),
  stringsAsFactors = FALSE
)
head(playerranks)
## id names state pts pre_rating post_rating
## 1 1 GARY HUA ON 6.0 1794 1817
## 2 2 DAKSHESH DARURI MI 6.0 1553 1663
## 3 3 ADITYA BAJAJ MI 6.0 1384 1640
## 4 4 PATRICK H SCHILLING MI 5.5 1716 1744
## 5 5 HANSHI ZUO MI 5.5 1655 1690
## 6 6 HANSEN SONG OH 5.0 1686 1687
For each player, we look up the pre-tournament ratings of all the opponents they faced over the seven rounds of play. These ratings are then averaged to give the mean opponent rating.
# Calculate the mean pre-rating of the opponents each player faced.

# gm_data is matched to playerranks by position, so both must have one entry
# per player; stop rather than attach averages to the wrong players.
stopifnot(length(gm_data) == nrow(playerranks))

# Opponent numbers for every player, extracted in one vectorised call and
# converted to integers so they can be used as row indices.
opponent_ids <- lapply(str_extract_all(gm_data, "\\d{1,2}"), as.integer)

# Look up each opponent's pre-rating by row number and average them.
# vapply() guarantees one numeric value per player and handles zero rows.
result <- vapply(
  opponent_ids,
  function(ids) mean(playerranks$pre_rating[ids]),
  numeric(1)
)

playerranks$avg_rating <- result
head(playerranks)
## id names state pts pre_rating post_rating avg_rating
## 1 1 GARY HUA ON 6.0 1794 1817 1605.286
## 2 2 DAKSHESH DARURI MI 6.0 1553 1663 1469.286
## 3 3 ADITYA BAJAJ MI 6.0 1384 1640 1563.571
## 4 4 PATRICK H SCHILLING MI 5.5 1716 1744 1573.571
## 5 5 HANSHI ZUO MI 5.5 1655 1690 1500.857
## 6 6 HANSEN SONG OH 5.0 1686 1687 1518.714
# Save the final table as comma-separated values, without row names.
# NOTE(review): the output path has no ".csv" extension -- confirm this is
# intended before anything downstream relies on the file name.
write.csv(playerranks, file = "playerrank_clean", row.names = FALSE)
Finally, a plot to visualize the results:
# Scatter plot of each player's total points against the average pre-rating
# of their opponents.
ggplot(playerranks, aes(avg_rating, pts, color = "players")) +
  geom_point(size = 3) +
  xlab("Average Opponent Rating") +
  ylab("Total Points") +
  ggtitle("Chess Players and Opponents")