Hazal Gunduz
DATA607 - Project 1
In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents.
For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605.
1605 was calculated by summing the pre-tournament ratings of his opponents (1436, 1563, 1600, 1610, 1649, 1663, 1716) and dividing by the total number of games played.
If you have questions about the meaning of the data or the results, please post them on the discussion forum. Data science, like chess, is a game of back and forth…
The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments.
library(stringr)
library(ggplot2)
Importing Data into R
# Path to the raw tournament cross-table (expected in the working directory).
fname <- "tournamentinfo.txt"
# Read the file as one string per line. warn = FALSE silences the harmless
# "incomplete final line" warning raised because the file does not end with a
# newline; every line is still read.
data <- readLines(fname, warn = FALSE)
# Preview the first ten lines to see the layout of the table.
data[1:10]
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
Data Cleaning Using Regular Expressions
The analysis consists of the steps outlined below:
1. Removing the separator lines
2. Extracting the names of the players
3. Extracting the state of origin
4. Extracting the total points
5. Extracting the pre- and post-tournament ratings
6. Calculating the average pre-tournament rating of the opponents played by each player
# Flag the dashed rule lines so they can be dropped from the raw text.
line_vector<-str_detect(data,"\\-----")
chess_data<-data[!line_vector]
# Skip the first two remaining lines: the two column-header rows visible in
# the data[1:10] preview.
chess_data<-chess_data[3:length(chess_data)]
# Split every line on "|"; the result is a list with one character vector of
# fields per line.
chess_data<-str_split(chess_data,"\\|")
# chess_data is a list at this point, so stringr coerces each element to a
# single string before matching (hence the warning echoed below). The later
# steps rely on that coerced form.
ext_data<-str_extract_all(chess_data,"[[:alpha:]-?[:alpha:] ?]{2,}")
## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing
# Tokens containing a run of three or more letters are kept as player names.
# NOTE(review): assumes no other field contains three consecutive letters -
# confirm against the full file.
names_loc<-str_detect(unlist(ext_data),"[[:alpha:]]{3,}")
names<-unlist(ext_data)[names_loc]
head(names)
## [1] " GARY HUA " " DAKSHESH DARURI "
## [3] " ADITYA BAJAJ " " PATRICK H SCHILLING "
## [5] " HANSHI ZUO " " HANSEN SONG "
# State codes: tokens with two consecutive letters that were not already
# classified as player names.
state_loc <- str_detect(unlist(ext_data), "[[:alpha:]]{2}")
state <- unlist(ext_data)[state_loc & !names_loc]
head(state)
## [1] " ON " " MI " " MI " " MI " " MI " " OH "
# Collect every numeric token from the (list-coerced) lines.
num_data <- str_extract_all(chess_data, "\\d{1,}+\\.?.?")
## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing
# Keep the tokens shaped like digit-dot-digit, which is how the total points
# appear in the table (e.g. "6.0").
pt_loc <- str_detect(unlist(num_data), "\\d\\.\\d")
pts <- unlist(num_data)[pt_loc]
head(pts)
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0"
# Collect numeric tokens together with an optional leading ">" (the arrow in
# "->post") and an optional trailing "P" suffix.
rtg_data <- str_extract_all(chess_data, "(( \\:)|(\\>))?.?\\d{1,}P*\\.?")
## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing
# A rating is a stand-alone 3-4 digit number (optionally followed by "P");
# post-ratings are the ones preceded by ">".
pre_loc <- str_detect(unlist(rtg_data), "\\b\\d{3,4}P?\\b")
post_loc <- str_detect(unlist(rtg_data), "\\>.?\\b\\d{3,4}P?\\b")
# Pre-ratings: rating tokens that are not post-ratings, with the "P" removed.
pre_rating <- str_replace_all(unlist(rtg_data)[pre_loc & !post_loc], "P", "")
# Post-ratings: strip the leading ">" and any "P".
post_rating <- str_replace_all(unlist(rtg_data)[post_loc], "([>P])", "")
head(pre_rating)
## [1] " 1794" " 1553" " 1384" " 1716" " 1655" " 1686"
head(post_rating)
## [1] "1817" "1663" "1640" "1744" "1690" "1687"
# Pull the played games from each line: a result letter (W, D or L), three
# arbitrary characters, then a one- or two-digit opponent pair number.
gm_data<-str_extract_all(chess_data,"[WDL]...\\d{1,2}")
## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing
# Second pass keeps only the digits. gm_data is a list, so each element is
# again coerced to a single string before matching (see the warning).
gm_data<-str_extract_all(gm_data,"\\.?\\d{1,2}")
## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing
# Turn a stand-alone "0" into "." so those entries can be filtered out next.
# NOTE(review): the lone "0" presumably comes from lines with no game matches
# being coerced to "character(0)" - confirm.
gm_data<-str_replace_all(gm_data,"\\b[0]\\b",".")
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
# Drop every entry that contains a ".", leaving one string of opponent
# numbers per remaining entry (see the head() output).
gm_zl<-str_detect(gm_data,fixed("."))
gm_data<-gm_data[!(gm_zl)]
head(gm_data)
## [1] "c(\"39\", \"21\", \"18\", \"14\", \"7\", \"12\", \"4\")"
## [2] "c(\"63\", \"58\", \"4\", \"17\", \"16\", \"20\", \"7\")"
## [3] "c(\"8\", \"61\", \"25\", \"21\", \"11\", \"13\", \"12\")"
## [4] "c(\"23\", \"28\", \"2\", \"26\", \"5\", \"19\", \"1\")"
## [5] "c(\"45\", \"37\", \"12\", \"13\", \"4\", \"14\", \"17\")"
## [6] "c(\"34\", \"29\", \"11\", \"35\", \"10\", \"27\", \"21\")"
=> The extracted vectors are combined into a data frame:
# Assemble one row per player from the parallel vectors extracted above.
# Strip the padding left over from the fixed-width table fields first.
names <- str_trim(names, "both")
state <- str_trim(state, "both")
pre_rating <- str_trim(pre_rating, "both")
post_rating <- str_trim(post_rating, "both")

# The vectors are matched up purely by position, so stop if any extraction
# step produced a different number of entries instead of silently recycling.
stopifnot(
  length(state) == length(names),
  length(pts) == length(names),
  length(pre_rating) == length(names),
  length(post_rating) == length(names)
)

# Number the players from the data rather than hard-coding 64 players.
id <- seq_along(names)

# Build the data frame directly so each column gets its proper type: going
# through cbind() would first create an all-character matrix and leave `id`
# as text.
playerranks <- data.frame(
  id = id,
  names = names,
  state = state,
  pts = as.numeric(pts),
  pre_rating = as.numeric(pre_rating),
  post_rating = as.numeric(post_rating),
  stringsAsFactors = FALSE
)
head(playerranks)
## id names state pts pre_rating post_rating
## 1 1 GARY HUA ON 6.0 1794 1817
## 2 2 DAKSHESH DARURI MI 6.0 1553 1663
## 3 3 ADITYA BAJAJ MI 6.0 1384 1640
## 4 4 PATRICK H SCHILLING MI 5.5 1716 1744
## 5 5 HANSHI ZUO MI 5.5 1655 1690
## 6 6 HANSEN SONG OH 5.0 1686 1687
=> Using a loop, we look up the pre-tournament rating of every opponent each player faced over the seven rounds. These ratings are then averaged to get the mean opponent rating.
# Average pre-tournament rating of the opponents each player actually faced.
# gm_data[i] holds the opponent pair numbers for the player in row i, and rows
# of playerranks are in pair-number order, so a pair number doubles as a row
# index. Stop if the two are not the same length, because a misalignment
# would produce wrong averages without any error.
stopifnot(length(gm_data) == nrow(playerranks))

result <- numeric(nrow(playerranks))
# seq_len() stays empty for a zero-row data frame, unlike 1:nrow().
for (i in seq_len(nrow(playerranks))) {
  # Opponent pair numbers for player i, as numeric row indices.
  match_res <- as.numeric(unlist(str_extract_all(gm_data[i], "\\d{1,2}")))
  result[i] <- mean(playerranks$pre_rating[match_res])
}
playerranks$avg_rating <- result
head(playerranks)
## id names state pts pre_rating post_rating avg_rating
## 1 1 GARY HUA ON 6.0 1794 1817 1605.286
## 2 2 DAKSHESH DARURI MI 6.0 1553 1663 1469.286
## 3 3 ADITYA BAJAJ MI 6.0 1384 1640 1563.571
## 4 4 PATRICK H SCHILLING MI 5.5 1716 1744 1573.571
## 5 5 HANSHI ZUO MI 5.5 1655 1690 1500.857
## 6 6 HANSEN SONG OH 5.0 1686 1687 1518.714
# Export the cleaned table. The file name carries the .csv extension so that
# other tools (e.g. a SQL import) recognise it as a CSV file; row names are
# omitted because `id` already identifies each player.
write.csv(playerranks, "playerrank_clean.csv", row.names = FALSE)
Plotting total points against the average opponent rating:
# Scatter plot of total points against average opponent pre-rating.
# The constant "players" inside aes() gives all points one colour and a
# single legend entry with that label.
ggplot(playerranks, aes(avg_rating, pts, color = "players")) +
  geom_point(size = 3) +
  labs(
    x = "Average Opponent Rating",
    y = "Total Points",
    title = "Chess Players and Opponents"
  )
Rpubs => https://rpubs.com/gunduzhazal/837564