Data 607 Project 1

Overview

We are given a text file of chess tournament results in which the information follows a regular, fixed-width structure. The goal is to create an R Markdown file that generates a .CSV file (which could, for example, be imported into a SQL database).
For each player we need to extract the Player’s Name, Player’s State, Total Number of Points, and Player’s Pre-Rating, and calculate the Average Pre Chess Rating of Opponents.

library(stringr)
library(ggplot2)

# Name of the raw tournament results file
fname <- "tournamentinfo.txt"
# Read the file into a character vector, one element per line
data <- readLines(con = fname)
# Show the first ten lines of the raw input
data[seq_len(10)]

##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------"

The output above shows the first ten lines of the raw text file that was given to us.

Data Cleaning Using Regular Expressions

The data were cleaned using regular expressions, primarily through the stringr package in R. The steps used in the analysis are listed below.

The analysis consists of the following steps:
1. Removing the different separators
2. Extracting the names of the players
3. Extracting the state of origin
4. Extracting the total points
5. Extracting the pre- and post-tournament ratings
6. Calculating the average rating of the opponents played by each player

# Data cleaning: drop the dashed separator rows, drop the two header rows,
# then split every remaining row into its "|"-delimited fields.
line_vector <- str_detect(data, "\\-----")
chess_data <- data[!line_vector]
# The first two remaining rows are the column headers. Negative indexing
# drops them and simply yields an empty vector when nothing follows, whereas
# 3:length(chess_data) would count backwards and return NA/stray rows.
chess_data <- chess_data[-seq_len(2)]
# Result: a list with one character vector of fields per remaining row
chess_data <- str_split(chess_data, "\\|")

# Extract the names of all players.
# Step 1: collect every run of two or more characters drawn from the bracket
# class below (letters, spaces and the other symbols written in it).
# NOTE(review): chess_data is a list at this point, so stringr appears to work
# on a text rendering of each element -- confirm this is intended.
ext_data <- str_extract_all(
  chess_data,
  "[[:alpha:]-?[:alpha:]  ?]{2,}"
)
# Step 2: flatten all matches and keep those with at least three consecutive
# letters; these are treated as the player names.
ext_flat <- unlist(ext_data)
names_loc <- str_detect(ext_flat, "[[:alpha:]]{3,}")
names <- ext_flat[names_loc]
head(names)

## [1] " GARY HUA                        " " DAKSHESH DARURI                 "
## [3] " ADITYA BAJAJ                    " " PATRICK H SCHILLING             "
## [5] " HANSHI ZUO                      " " HANSEN SONG                     "

# Extract the state of origin.
# A state is any extracted run that has two consecutive letters but was not
# already classified as a player name (names_loc marks the name entries).
ext_values <- unlist(ext_data)
state_loc <- str_detect(ext_values, "[[:alpha:]]{2}")
is_state <- state_loc & !names_loc
state <- ext_values[is_state]
head(state)

## [1] "   ON " "   MI " "   MI " "   MI " "   MI " "   OH "

# Extract the total points.
# First gather every numeric token (digits, an optional dot, and one optional
# trailing character), then keep only the tokens shaped like "digit.digit".
num_data <- str_extract_all(chess_data, "\\d{1,}+\\.?.?")
num_flat <- unlist(num_data)
pt_loc <- str_detect(num_flat, "\\d\\.\\d")
pts <- num_flat[pt_loc]
head(pts)

## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0"

# Extract the pre- and post-tournament ratings.
# Collect numeric tokens together with an optional leading " :" or ">" marker
# and an optional trailing "P".
rtg_data <- str_extract_all(chess_data, "(( \\:)|(\\>))?.?\\d{1,}P*\\.?")
rtg_flat <- unlist(rtg_data)
# Candidate ratings are standalone 3-4 digit numbers (optionally ending in "P")
pre_loc <- str_detect(rtg_flat, "\\b\\d{3,4}P?\\b")
# Post ratings are the candidates introduced by ">" (from the "->" arrow)
post_loc <- str_detect(rtg_flat, "\\>.?\\b\\d{3,4}P?\\b")
# Pre ratings are the candidates that are not post ratings
is_pre <- pre_loc & !post_loc
# Strip the "P" suffix and the ">" marker so only the number (plus padding,
# trimmed later) remains
pre_rating <- str_replace_all(rtg_flat[is_pre], "P", "")
post_rating <- str_replace_all(rtg_flat[post_loc], "([>P])", "")
head(pre_rating)

## [1] " 1794" " 1553" " 1384" " 1716" " 1655" " 1686"

# Preview the first few extracted post-tournament ratings
head(post_rating)

## [1] "1817" "1663" "1640" "1744" "1690" "1687"

# Extract the opponents faced by each player.
# Step 1: per row, find the round results: a W, D or L, three arbitrary
# characters, then one or two digits.
round_results <- str_extract_all(chess_data, "[WDL]...\\d{1,2}")
# Step 2: reduce every row to the numbers it contains. The list from step 1 is
# handled as text here, which is why the values print as 'c("39", ...)'.
opponent_ids <- str_extract_all(round_results, "\\.?\\d{1,2}")
# Step 3: rows that produced no round results end up as a lone "0"
# (presumably from the text "character(0)" -- confirm); flag them with "."
flagged <- str_replace_all(opponent_ids, "\\b[0]\\b", ".")
# Step 4: discard the flagged rows so only rows with opponents remain
gm_zl <- str_detect(flagged, fixed("."))
gm_data <- flagged[!gm_zl]
head(gm_data)

## [1] "c(\"39\", \"21\", \"18\", \"14\", \"7\", \"12\", \"4\")"  
## [2] "c(\"63\", \"58\", \"4\", \"17\", \"16\", \"20\", \"7\")"  
## [3] "c(\"8\", \"61\", \"25\", \"21\", \"11\", \"13\", \"12\")" 
## [4] "c(\"23\", \"28\", \"2\", \"26\", \"5\", \"19\", \"1\")"   
## [5] "c(\"45\", \"37\", \"12\", \"13\", \"4\", \"14\", \"17\")" 
## [6] "c(\"34\", \"29\", \"11\", \"35\", \"10\", \"27\", \"21\")"

The extracted data are then combined into a data frame:

# Combine the extracted fields into one data frame with one row per player.

# Remove the padding left over from the fixed-width layout
names <- str_trim(names, "both")
state <- str_trim(state, "both")
pre_rating <- str_trim(pre_rating, "both")
post_rating <- str_trim(post_rating, "both")

# Player ids run 1..n in file order; derive n from the extracted names instead
# of hard-coding the number of players.
id <- seq_along(names)

# Every extraction step must have produced exactly one value per player;
# stop here rather than silently recycling or misaligning columns.
stopifnot(
  length(state) == length(names),
  length(pts) == length(names),
  length(pre_rating) == length(names),
  length(post_rating) == length(names)
)

# Build the data frame directly with typed columns. (cbind() would first
# coerce everything, including id, to a character matrix.)
playerranks <- data.frame(
  id = id,
  names = names,
  state = state,
  pts = as.numeric(pts),
  pre_rating = as.numeric(pre_rating),
  post_rating = as.numeric(post_rating),
  stringsAsFactors = FALSE
)
head(playerranks)

##   id               names state pts pre_rating post_rating
## 1  1            GARY HUA    ON 6.0       1794        1817
## 2  2     DAKSHESH DARURI    MI 6.0       1553        1663
## 3  3        ADITYA BAJAJ    MI 6.0       1384        1640
## 4  4 PATRICK H SCHILLING    MI 5.5       1716        1744
## 5  5          HANSHI ZUO    MI 5.5       1655        1690
## 6  6         HANSEN SONG    OH 5.0       1686        1687

Using a loop, we look up the pre-tournament rating of every opponent each player faced during the seven rounds of play. These ratings are then averaged to get the mean opponent rating.

# Calculate the mean pre-tournament rating of each player's opponents.
# gm_data[i] holds the opponent numbers of player i as text; each number is
# used as a row index into playerranks to look up that opponent's pre_rating.
# seq_len() keeps this safe when playerranks has no rows (1:0 would iterate).
result <- vapply(
  seq_len(nrow(playerranks)),
  function(i) {
    # Pull the one- or two-digit opponent numbers out of the text
    match_res <- as.numeric(unlist(str_extract_all(gm_data[i], "\\d{1,2}")))
    # Average the opponents' pre-tournament ratings
    mean(playerranks$pre_rating[match_res])
  },
  numeric(1)
)

playerranks$avg_rating <- result
head(playerranks)

##   id               names state pts pre_rating post_rating avg_rating
## 1  1            GARY HUA    ON 6.0       1794        1817   1605.286
## 2  2     DAKSHESH DARURI    MI 6.0       1553        1663   1469.286
## 3  3        ADITYA BAJAJ    MI 6.0       1384        1640   1563.571
## 4  4 PATRICK H SCHILLING    MI 5.5       1716        1744   1573.571
## 5  5          HANSHI ZUO    MI 5.5       1655        1690   1500.857
## 6  6         HANSEN SONG    OH 5.0       1686        1687   1518.714

# Save the final table as comma-separated text, without the row-name column.
# NOTE(review): the output file name has no ".csv" extension -- confirm that
# this exact name is what downstream consumers expect.
write.csv(
  x = playerranks,
  file = "playerrank_clean",
  row.names = FALSE
)

Finally, a plot to visualize the results:

# Scatter plot: total points against the average opponent rating.
# The constant color label inside aes() produces a single "players" legend key.
ggplot(playerranks, aes(x = avg_rating, y = pts, color = "players")) +
  geom_point(size = 3) +
  xlab("Average Opponent Rating") +
  ylab("Total Points") +
  ggtitle("Chess Players and Opponents")

Data 607 Project 1

Talha Muhammad

September 25, 2016

Overview

Data Cleaning Using Regular Expressions