DATA 607 PROJECT 01

Project description

The project description can be found on github.com.

Environment setup

#Loading packages
library(stringr)
library(knitr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readr)

Importing data

# Fetch the raw tournament report from GitHub.
# read.csv() consumes the first line of the file as the column header.
tournamentUrl <- "https://raw.githubusercontent.com/ahmshahparan/DATA607_PROJECT01/master/tournamentinfo.txt"
chessText <- read.csv(file = tournamentUrl)
head(chessText)

##   X.........................................................................................
## 1  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 2  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 3  -----------------------------------------------------------------------------------------
## 4      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 5     ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 6  -----------------------------------------------------------------------------------------

Cleaning data

# Drop the two column-header rows. Indexing a single-column data frame by row
# collapses the result to a plain vector of report lines.
cleanData <- chessText[-(1:2), ]

# From the second line onward, every third line is a player line
# (pair number, name, total points and round results).
name <- cleanData[seq(from = 2, to = length(cleanData), by = 3)]
head(name, n = 3)

## [1]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## [2]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## [3]     3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|
## 131 Levels:     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4| ...

# From the third line onward, every third line is a rating line
# (state, USCF id, pre- and post-tournament rating).
rating <- cleanData[seq(from = 3, to = length(cleanData), by = 3)]
head(rating, n = 3)

## [1]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [2]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## [3]    MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 131 Levels:     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4| ...

Relevant data extraction

# Applying data transformation

# One sequential id per player line, so the ids follow the data instead of a
# hard-coded player count.
playerId <- seq_along(name)

# The name is the whole second "|"-delimited field. Taking the full field
# (rather than counting words) keeps names with four or more words and
# hyphenated surnames intact.
playerName <- str_trim(str_extract(name, "(?<=\\|)[^|]+"))

# Total points: first decimal number on the player line (dot escaped so it
# only matches a literal decimal point).
totalPoint <- as.numeric(str_extract(name, "\\d+\\.\\d"))

# Pre-tournament rating: the digits that directly follow "R:"; anchoring on
# the label avoids depending on the length of the USCF id that precedes it.
preRating <- as.integer(str_extract(str_extract(rating, "R:\\s*\\d+"), "\\d+"))

# State: the first two word characters on the rating line.
playerState <- str_extract(rating, "\\w\\w")

# Opponent pair numbers: every run of digits immediately followed by "|".
# The lookahead returns the ids directly as a list of character vectors, one
# element per player, without re-parsing a deparsed list.
opponentId <- str_extract_all(name, "\\d+(?=\\|)")

Average of pre-tournament opponents’ ratings

# Average pre-tournament rating of every player's opponents, rounded to the
# nearest whole number. vapply() builds a numeric vector with one value per
# player and yields an empty vector (instead of misbehaving) when there are
# no players.
oppoPreRatingAvg <- vapply(
  seq_along(name),
  function(i) {
    # Opponent pair numbers double as indices into preRating
    opponents <- as.numeric(unlist(opponentId[playerId[i]]))
    round(mean(preRating[opponents]), digits = 0)
  },
  numeric(1)
)

Constructing dataset

# Assemble the summary table. tibble() replaces the deprecated data_frame()
# and lets the display names be set at construction time.
df <- tibble(
  `Player's Name` = playerName,
  `Player's State` = playerState,
  `Total Number of Points` = totalPoint,
  `Player's Pre-Rating` = preRating,
  `Opponents' Average Ratings` = oppoPreRatingAvg
)
kable(head(df))

Player’s Name	Player’s State	Total Number of Points	Player’s Pre-Rating	Opponents’ Average Ratings
GARY HUA	ON	6.0	1794	1605
DAKSHESH DARURI	MI	6.0	1553	1469
ADITYA BAJAJ	MI	6.0	1384	1564
PATRICK H SCHILLING	MI	5.5	1716	1574
HANSHI ZUO	MI	5.5	1655	1501
HANSEN SONG	OH	5.0	1686	1519

Exporting to .csv

# Save the summary table as a comma-separated file in the working directory;
# missing values are written as empty fields and row names are omitted.
write.table(
  df,
  file = "tournamentInfo_cleaned.csv",
  sep = ",",
  na = "",
  row.names = FALSE,
  col.names = TRUE
)

A copy of this cleaned-up .csv file is available on GitHub.

DATA 607 PROJECT 01

A H M Shahparan

2/24/2018

Project description

Environment setup

Importing data

Cleaning data

Relevant data extraction

Average of pre-tournament opponents’ ratings

Constructing dataset

Exporting to .csv