Gehad Gad
February 23, 2020
DATA 607 - Project 1
In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could, for example, be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents. For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605. The value 1605 was calculated by summing the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, and 1716, and dividing by the total number of games played.
#Import libraries and/or Packages
library(stringr)
library (ggplot2)
# The raw tournament file is hosted in the project's GitHub repository.
file <- "https://github.com/GehadGad/DATA607-Project-1/raw/master/tournamentinfo.txt"
# The whole data set can also be inspected through the same link, e.g.:
#Data <- read.table (url("https://github.com/GehadGad/DATA607-Project-1/raw/master/tournamentinfo.txt"), sep = ",")
# Read the file as one character string per line, then preview the first lines.
data <- readLines(con = file, warn = FALSE)
head(data)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
# Data cleaning: flag the dashed separator rows and drop them.
separator_rows <- str_detect(data, "\\-----")
data <- data[!separator_rows]
# The first two rows that remain are the column headers; skip them and split
# every other row into its "|"-delimited fields (one list element per row).
player_rows <- data[3:length(data)]
data <- str_split(player_rows, "\\|")
#Extract names of all players
# `data` is a list (one element per row), so stringr coerces it before
# matching; that is what triggers the warning shown below.
extracted <- str_extract_all(data, "[[:alpha:]-?[:alpha:] ?]{2,}")
## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing
# Flatten the per-row matches once and keep only the pieces that contain a
# run of at least three letters.
name_candidates <- unlist(extracted)
looks_like_name <- str_detect(name_candidates, "[[:alpha:]]{3,}")
names <- name_candidates[looks_like_name]
head(names)
## [1] " GARY HUA " " DAKSHESH DARURI "
## [3] " ADITYA BAJAJ " " PATRICK H SCHILLING "
## [5] " HANSHI ZUO " " HANSEN SONG "
#Extract state of origin
# Reuse the text pieces gathered in `extracted`: a state code is a piece that
# has two adjacent letters but no run of three or more letters.
text_pieces <- unlist(extracted)
has_letter_pair <- str_detect(text_pieces, "[[:alpha:]]{2}")
has_longer_word <- str_detect(text_pieces, "[[:alpha:]]{3,}")
state <- text_pieces[has_letter_pair & !has_longer_word]
head(state)
## [1] " ON " " MI " " MI " " MI " " MI " " OH "
#Extract points
# Collect every numeric token from each row (the list input is coerced, hence
# the warning shown below).
num_data <- str_extract_all(data, "\\d{1,}+\\.?.?")
## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing
# Only the tokens of the form digit.digit are kept as the points totals.
number_tokens <- unlist(num_data)
has_decimal_point <- str_detect(number_tokens, "\\d\\.\\d")
pts <- number_tokens[has_decimal_point]
head(pts)
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0"
#Extract pre->post
# Collect the numeric tokens together with an optional leading marker (the
# list input is coerced, hence the warning shown below).
rtg <- str_extract_all(data, "(( \\:)|(\\>))?.?\\d{1,}P*\\.?")
## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing
rating_tokens <- unlist(rtg)
# A rating is a stand-alone 3-4 digit number with an optional trailing "P";
# post-ratings are the ones that also match the ">"-prefixed pattern.
is_rating <- str_detect(rating_tokens, "\\b\\d{3,4}P?\\b")
is_post <- str_detect(rating_tokens, "\\>.?\\b\\d{3,4}P?\\b")
pre_rating <- rating_tokens[is_rating & !is_post]
post_rating <- rating_tokens[is_post]
# Marker-free copies of both rating vectors.
pre <- str_replace_all(pre_rating, "P", "")
post <- str_replace_all(post_rating, "([>P])", "")
head(pre_rating)
## [1] " 1794" " 1553" " 1384" " 1716" " 1655" " 1686"
#Extract games played by player
# Step 1: per row, grab each result cell - a W, D or L, three arbitrary
# characters, then a one- or two-digit number.
result_cells <- str_extract_all(data, "[WDL]...\\d{1,2}")
# Step 2: reduce every cell to its number (the opponent's pair number).
opponent_numbers <- str_extract_all(result_cells, "\\.?\\d{1,2}")
# Step 3: turn any stand-alone 0 into "." so those rows can be recognised.
# NOTE(review): presumably rows without W/D/L results coerce to
# "character(0)", whose lone 0 becomes the "." marker - confirm.
games_data <- str_replace_all(opponent_numbers, "\\b[0]\\b", ".")
## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing
## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing
# Drop every entry that carries the "." marker.
games <- str_detect(games_data, fixed("."))
games_data <- games_data[!games]
head(games_data)
## [1] "c(\"39\", \"21\", \"18\", \"14\", \"7\", \"12\", \"4\")"
## [2] "c(\"63\", \"58\", \"4\", \"17\", \"16\", \"20\", \"7\")"
## [3] "c(\"8\", \"61\", \"25\", \"21\", \"11\", \"13\", \"12\")"
## [4] "c(\"23\", \"28\", \"2\", \"26\", \"5\", \"19\", \"1\")"
## [5] "c(\"45\", \"37\", \"12\", \"13\", \"4\", \"14\", \"17\")"
## [6] "c(\"34\", \"29\", \"11\", \"35\", \"10\", \"27\", \"21\")"
# Strip the padding that the fixed-width layout leaves around each value.
names <- str_trim(names, "both")
state <- str_trim(state, "both")
pre_rating <- str_trim(pre_rating, "both")
post_rating <- str_trim(post_rating, "both")
# Every field must have been extracted exactly once per player; cbind() would
# otherwise recycle the shorter vectors and misalign the rows.
stopifnot(
  length(state) == length(names),
  length(pts) == length(names),
  length(pre_rating) == length(names),
  length(post_rating) == length(names)
)
# Number the players from the data instead of hard-coding the player count.
id <- seq_along(names)
# cbind() yields a character matrix, so every column starts out as text;
# stringsAsFactors = FALSE keeps it as text on R versions before 4.0 as well.
player_ranks <- as.data.frame(
  cbind(id, names, state, pts, pre_rating, post_rating),
  stringsAsFactors = FALSE
)
#Calculate the mean of opponent rankings
# One slot per player for the average opponent pre-rating.
results <- array(0, dim = nrow(player_ranks))
# Make every column plain text first (they may be factors on R < 4.0).
player_ranks[] <- lapply(player_ranks, as.character)
# Strip the ">" and "P" markers from the rating columns ONLY. Applying the
# substitution to every column also removed the letter "P" from player names
# and state codes (e.g. "PATRICK" became "ATRICK").
rating_cols <- c("pre_rating", "post_rating")
player_ranks[rating_cols] <- lapply(
  player_ranks[rating_cols],
  gsub,
  pattern = "[>P]",
  replacement = ""
)
# Convert the numeric fields from text to numbers.
player_ranks$pts <- as.numeric(player_ranks$pts)
player_ranks$pre_rating <- as.numeric(player_ranks$pre_rating)
player_ranks$post_rating <- as.numeric(player_ranks$post_rating)
#loop through all the players
# games_data[i] is paired with row i of player_ranks, so the two must line up;
# fail loudly instead of silently averaging the wrong opponents.
stopifnot(length(games_data) == nrow(player_ranks))
# seq_len() yields an empty sequence for an empty table, unlike 1:nrow().
for (i in seq_len(nrow(player_ranks))) {
  # extract the opponents' pair numbers recorded for player i
  match_res <- as.numeric(unlist(str_extract_all(games_data[i], "\\d{1,2}")))
  # calculate the average of the opponents' pre-ratings; the pair numbers are
  # used directly as row indices into player_ranks
  results[i] <- mean(player_ranks[match_res, "pre_rating"])
}
player_ranks$avg_rating <- results
head(player_ranks)
## id names state pts pre_rating post_rating avg_rating
## 1 1 GARY HUA ON 6.0 1794 1817 1605.286
## 2 2 DAKSHESH DARURI MI 6.0 1553 1663 1469.286
## 3 3 ADITYA BAJAJ MI 6.0 1384 1640 1563.571
## 4 4 ATRICK H SCHILLING MI 5.5 1716 1744 1573.571
## 5 5 HANSHI ZUO MI 5.5 1655 1690 1500.857
## 6 6 HANSEN SONG OH 5.0 1686 1687 1518.714
# Export the assembled player table as comma-separated values, keeping the
# row names as the first column.
# NOTE(review): the output file name has no ".csv" extension - confirm that
# this is intended.
write.csv(x = player_ranks, file = "player_rank", row.names = TRUE)