pro-1---607.knit

Hazal Gunduz

DATA607 - Project 1

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:

Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents.

For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605.

1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.

If you have questions about the meaning of the data or the results, please post them on the discussion forum. Data science, like chess, is a game of back and forth…

The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments.

library(stringr)
library(ggplot2)

Importing Data into R

# Load the raw tournament report; readLines() yields one string per text line.
fname <- "tournamentinfo.txt"
data <- readLines(fname)

## Warning in readLines(fname): incomplete final line found on 'tournamentinfo.txt'

# Peek at the first ten raw lines to see how the report is laid out.
data[1:10]

##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------"

Data Cleaning Using Regular Expressions

The analysis consists of the steps outlined below:

1. Removing the different separators

2. Extracting the names of the players

3. Extracting each player's state of origin

4. Extracting the total points

5. Extracting the pre- and post-tournament ratings

6. Calculating the average pre-tournament rating of the opponents each player faced

# Flag the dashed rule lines so they can be dropped from the report.
line_vector <- str_detect(string = data, pattern = "\\-----")
chess_data <- data[!line_vector]

# Skip the first two remaining rows (the column headers), then break every
# row into its fields on the "|" delimiter; the result is a list.
chess_data <- chess_data[3:length(chess_data)]
chess_data <- str_split(string = chess_data, pattern = "\\|")

# Pull out every run of two or more characters drawn from the bracketed class
# (letters and spaces, plus the "-" and "?" written inside it).
# chess_data is a list at this point, so stringr coerces it to character
# before matching, which is what produces the warning that follows.
# NOTE(review): how the regex engine reads the "-?" inside the brackets is not
# obvious (it looks intended as an optional hyphen) -- confirm.
ext_data<-str_extract_all(chess_data,"[[:alpha:]-?[:alpha:]  ?]{2,}")

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

# Player names are the extracted runs holding three or more letters in a row.
names_loc <- str_detect(
  string = unlist(ext_data),
  pattern = "[[:alpha:]]{3,}"
)
names <- unlist(ext_data)[names_loc]
head(names)

## [1] " GARY HUA                        " " DAKSHESH DARURI                 "
## [3] " ADITYA BAJAJ                    " " PATRICK H SCHILLING             "
## [5] " HANSHI ZUO                      " " HANSEN SONG                     "

# State codes are the runs with two consecutive letters that were not already
# classified as player names.
state_loc <- str_detect(
  string = unlist(ext_data),
  pattern = "[[:alpha:]]{2}"
)
state <- unlist(ext_data)[state_loc & !names_loc]
head(state)

## [1] "   ON " "   MI " "   MI " "   MI " "   MI " "   OH "

# Collect every numeric token: one or more digits, optionally followed by a
# period and at most one further character (so "6.0" is captured whole).
# chess_data is still a list, so stringr coerces it to character first; that
# coercion is the source of the warning that follows.
num_data<-str_extract_all(chess_data,"\\d{1,}+\\.?.?")

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

# Total points are the numeric tokens shaped digit-dot-digit, e.g. "6.0".
pt_loc <- str_detect(string = unlist(num_data), pattern = "\\d\\.\\d")
pts <- unlist(num_data)[pt_loc]
head(pts)

## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0"

# Collect rating candidates: an optional " :" or ">" prefix, an optional single
# character, one or more digits, any number of trailing "P"s and an optional
# period. Keeping the ">" prefix is what later tells post-ratings apart.
# chess_data is a list, so stringr coerces it to character first (see the
# warning that follows).
rtg_data<-str_extract_all(chess_data,"(( \\:)|(\\>))?.?\\d{1,}P*\\.?")

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

# A token is a rating when it holds a word-bounded 3-4 digit number, optionally
# suffixed with "P"; a rating introduced by ">" is a post-tournament rating.
pre_loc <- str_detect(
  string = unlist(rtg_data),
  pattern = "\\b\\d{3,4}P?\\b"
)
post_loc <- str_detect(
  string = unlist(rtg_data),
  pattern = "\\>.?\\b\\d{3,4}P?\\b"
)

# Post-ratings: keep the ">"-prefixed tokens and strip the ">" and "P" marks.
post_rating <- unlist(rtg_data)[post_loc]
post_rating <- str_replace_all(post_rating, "([>P])", "")

# Pre-ratings: every other rating token, with the "P" mark stripped.
pre_rating <- unlist(rtg_data)[pre_loc & !post_loc]
pre_rating <- str_replace_all(pre_rating, "P", "")
head(pre_rating)

## [1] " 1794" " 1553" " 1384" " 1716" " 1655" " 1686"

# Preview the first few post-tournament ratings after cleaning.
head(post_rating)

## [1] "1817" "1663" "1640" "1744" "1690" "1687"

# Step 1: pull each game token -- a W, D or L, any three characters, then a
# one- or two-digit number. chess_data is a list, so stringr coerces it to
# character first, which triggers the warning below.
gm_data<-str_extract_all(chess_data,"[WDL]...\\d{1,2}")

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

# Step 2: keep only the numeric part of every token (presumably the opponent's
# pair number). The input is again a list, hence the repeated warning.
gm_data<-str_extract_all(gm_data,"\\.?\\d{1,2}")

## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

# Step 3: turn any standalone "0" into "." so those entries can be recognised
# and dropped below. After this call gm_data is a character vector.
# NOTE(review): the lone "0" presumably comes from rows without games being
# coerced to the text "character(0)" in step 2 -- confirm.
gm_data<-str_replace_all(gm_data,"\\b[0]\\b",".")

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : argument is not an atomic vector; coercing

# Step 4: drop every entry containing a literal ".", keeping one string of
# opponent numbers per remaining row.
gm_zl<-str_detect(gm_data,fixed("."))
gm_data<-gm_data[!(gm_zl)]
head(gm_data)

## [1] "c(\"39\", \"21\", \"18\", \"14\", \"7\", \"12\", \"4\")"  
## [2] "c(\"63\", \"58\", \"4\", \"17\", \"16\", \"20\", \"7\")"  
## [3] "c(\"8\", \"61\", \"25\", \"21\", \"11\", \"13\", \"12\")" 
## [4] "c(\"23\", \"28\", \"2\", \"26\", \"5\", \"19\", \"1\")"   
## [5] "c(\"45\", \"37\", \"12\", \"13\", \"4\", \"14\", \"17\")" 
## [6] "c(\"34\", \"29\", \"11\", \"35\", \"10\", \"27\", \"21\")"

The extracted fields are combined into a data frame:

# Assemble one row per player from the vectors extracted above.

# Strip the padding that the fixed-width report left around each value.
names <- str_trim(names, "both")
state <- str_trim(state, "both")
pre_rating <- str_trim(pre_rating, "both")
post_rating <- str_trim(post_rating, "both")

# Number the players from the data itself rather than hard-coding 64, so the
# script keeps working when the tournament has a different number of entrants.
id <- seq_along(names)

# Every field must have been extracted exactly once per player; fail loudly
# here instead of letting shorter vectors be recycled into the table.
stopifnot(
  length(state) == length(names),
  length(pts) == length(names),
  length(pre_rating) == length(names),
  length(post_rating) == length(names)
)

# Build the table directly with typed columns. cbind() would first flatten
# everything into a character matrix, forcing each numeric column to be
# converted back afterwards.
playerranks <- data.frame(
  id = id,
  names = names,
  state = state,
  pts = as.numeric(pts),
  pre_rating = as.numeric(pre_rating),
  post_rating = as.numeric(post_rating),
  stringsAsFactors = FALSE
)
head(playerranks)

##   id               names state pts pre_rating post_rating
## 1  1            GARY HUA    ON 6.0       1794        1817
## 2  2     DAKSHESH DARURI    MI 6.0       1553        1663
## 3  3        ADITYA BAJAJ    MI 6.0       1384        1640
## 4  4 PATRICK H SCHILLING    MI 5.5       1716        1744
## 5  5          HANSHI ZUO    MI 5.5       1655        1690
## 6  6         HANSEN SONG    OH 5.0       1686        1687

For each player, we look up the pre-tournament ratings of all the opponents they faced during the seven rounds of play. These ratings are then averaged to get the player's mean opponent rating.

# Average pre-tournament rating of the opponents each player actually faced.
# A plain numeric vector is used so the new column carries no dim attribute.
result <- numeric(nrow(playerranks))

# seq_len() makes the loop a no-op for an empty table, where 1:nrow() would
# iterate over c(1, 0).
for (i in seq_len(nrow(playerranks))) {
  # gm_data[i] holds player i's opponent numbers as text; extract them and
  # convert to numeric so they can be used as row indices.
  # Assumes rows of playerranks are in pair-number order -- verify if the
  # table is ever re-sorted before this point.
  match_res <- as.numeric(unlist(str_extract_all(gm_data[i], "\\d{1,2}")))

  # mean() divides by the number of games played, since unplayed rounds
  # contribute no opponent number.
  result[i] <- mean(playerranks$pre_rating[match_res])
}

playerranks$avg_rating <- result
head(playerranks)

##   id               names state pts pre_rating post_rating avg_rating
## 1  1            GARY HUA    ON 6.0       1794        1817   1605.286
## 2  2     DAKSHESH DARURI    MI 6.0       1553        1663   1469.286
## 3  3        ADITYA BAJAJ    MI 6.0       1384        1640   1563.571
## 4  4 PATRICK H SCHILLING    MI 5.5       1716        1744   1573.571
## 5  5          HANSHI ZUO    MI 5.5       1655        1690   1500.857
## 6  6         HANSEN SONG    OH 5.0       1686        1687   1518.714

# Export the assembled table as comma-separated text, without row names.
# NOTE(review): the target file name carries no ".csv" extension -- confirm
# that this is intended.
write.csv(x = playerranks, file = "playerrank_clean", row.names = FALSE)

To view the analysis, we plot each player's total points against the average rating of their opponents:

# Scatter plot of total points against average opponent rating, one point per
# player; the constant colour mapping yields a single "players" legend entry.
ggplot(
  data = playerranks,
  mapping = aes(x = avg_rating, y = pts, color = "players")
) +
  geom_point(size = 3) +
  xlab("Average Opponent Rating") +
  ylab("Total Points") +
  ggtitle("Chess Players and Opponents")

RPubs: https://rpubs.com/gunduzhazal/837564

GitHub: https://github.com/Gunduzhazal/tournament