Project description

The project description can be found on github.com.

Environment setup

#Loading packages
library(stringr)
library(knitr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)

Importing data

# Pull the raw tournament cross-table straight from GitHub; read.csv() consumes
# the file's first line as the column header
tournamentUrl <- "https://raw.githubusercontent.com/ahmshahparan/DATA607_PROJECT01/master/tournamentinfo.txt"
chessText <- read.csv(file = tournamentUrl)
head(x = chessText)
##   X.........................................................................................
## 1  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 2  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 3  -----------------------------------------------------------------------------------------
## 4      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 5     ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 6  -----------------------------------------------------------------------------------------

Cleaning data

# Drop the two column-header rows; row-indexing the single-column data frame
# collapses it to a vector of text lines
cleanData <- chessText[-(1:2), ]
# Player-info lines: every third entry, starting at the second
playerRows <- seq(from = 2, to = length(cleanData), by = 3)
name <- cleanData[playerRows]
head(name, n = 3)
## [1]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## [2]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## [3]     3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|
## 131 Levels:     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4| ...
# Rating-info lines: every third entry, starting at the third
ratingRows <- seq(from = 3, to = length(cleanData), by = 3)
rating <- cleanData[ratingRows]
head(rating, n = 3)
## [1]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [2]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## [3]    MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 131 Levels:     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4| ...

Relevant data extraction

# Applying data transformation: pull the individual fields out of the
# fixed-width player lines (`name`) and rating lines (`rating`).

# One id per player line, in file order; derived from the data rather than
# hard-coded so a different number of players still works.
playerId <- seq_along(name)

# The player's name is the whole second pipe-delimited field. Taking the field
# itself (instead of "2-3 words") keeps hyphenated and four-word names intact.
playerName <- str_trim(str_extract(name, "(?<=\\|)[^|]+"))

# Total points look like "6.0"; the dot is escaped so only a literal decimal
# point matches.
totalPoint <- as.numeric(str_extract(name, "\\d+\\.\\d"))

# The pre-tournament rating is the run of digits that follows the "R:" label;
# anchoring on the label avoids matching other digit runs on the line.
preRating <- as.integer(str_match(rating, "R:\\s*(\\d+)")[, 2])

# The state/province code is the first two word characters of the rating line.
playerState <- str_extract(rating, "\\w\\w")

# Opponent pair numbers are the digit runs immediately followed by "|".
# Result: a list with one character vector of opponent ids per player
# (empty when no round entry carries a number).
opponentId <- str_extract_all(name, "\\d+(?=\\|)")

Average pre-tournament rating of each player’s opponents

# Average pre-tournament rating of each player's opponents, rounded to the
# nearest whole number. vapply() builds the full result vector in one pass and
# returns an empty numeric vector when there are no players.
oppoPreRatingAvg <- vapply(
  seq_along(name),
  function(i) {
    # Opponent ids are stored as text; convert them to indices into preRating
    opponents <- as.numeric(opponentId[[playerId[i]]])
    round(mean(preRating[opponents]), digits = 0)
  },
  numeric(1)
)

Constructing dataset

# Assemble the per-player results into a tibble; tibble() replaces the
# deprecated data_frame() and is re-exported by dplyr
df <- tibble(playerName, playerState, totalPoint, preRating, oppoPreRatingAvg)
# Human-readable column headers for the report and the exported file
colnames(df) <- c(
  "Player's Name",
  "Player's State",
  "Total Number of Points",
  "Player's Pre-Rating",
  "Opponents' Average Ratings"
)
kable(head(df))
Player’s Name Player’s State Total Number of Points Player’s Pre-Rating Opponents’ Average Ratings
GARY HUA ON 6.0 1794 1605
DAKSHESH DARURI MI 6.0 1553 1469
ADITYA BAJAJ MI 6.0 1384 1564
PATRICK H SCHILLING MI 5.5 1716 1574
HANSHI ZUO MI 5.5 1655 1501
HANSEN SONG OH 5.0 1686 1519

Exporting to .csv

# Write the cleaned table to the working directory as a comma-separated file:
# header row included, no row names, missing values left blank
write.table(
  df,
  file = "tournamentInfo_cleaned.csv",
  sep = ",",
  na = "",
  row.names = FALSE,
  col.names = TRUE
)

A copy of this cleaned-up .csv file is available at github.com.