Project 1 Chess Rating

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents.

Install packages for this project

if (!require('DT')) install.packages('DT')
## Loading required package: DT
if (!require('stringr')) install.packages('stringr')
## Loading required package: stringr
if (!require('ggplot2')) install.packages('ggplot2')
## Loading required package: ggplot2
library(DT)
library(stringr)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(tidyr)
library(knitr)

Load chess tournament results

# Read the tournament report, one character element per line of the file.
raw <- readLines("C:/Users/tbao/Desktop/CUNY MSDS notes/607/project 1 wk4/tournamentinfo.txt")
## Warning in readLines("C:/Users/tbao/Desktop/CUNY MSDS notes/607/project 1
## wk4/tournamentinfo.txt"): incomplete final line found on 'C:/Users/tbao/
## Desktop/CUNY MSDS notes/607/project 1 wk4/tournamentinfo.txt'
# Blank out only the dashed separator rows. Removing every "-" would also
# delete hyphens inside player names and the "->" rating arrow, which shifts
# the fixed-width columns on those lines by one character.
raw <- str_replace_all(raw, "^\\s*-+\\s*$", "")
print(raw[1:9])
## [1] ""
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [4] ""
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [7] ""
## [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"

Data grouping and splitting: find the table breaks

# Column boundaries of the fixed-width table, measured on the first player row
# (line 5): the start offsets of its first three "|" delimiters.
b0 <- 0
pipe_hits <- str_locate_all(raw[5], pattern = "\\|")[[1]]
b1 <- unname(pipe_hits[1, "start"])
b2 <- unname(pipe_hits[2, "start"])
b3 <- unname(pipe_hits[3, "start"])
# Right-hand edge: the width of the longest line in the report.
b4 <- max(nchar(raw))

Data separating

# Every player occupies two consecutive lines of the report. The first line of
# a record is recognised by its leading pair number followed by "|", so the
# split no longer depends on the file having exactly 196 lines (64 players).
# Group1 = Num, Name, Points, Opponent IDs
g1row <- grep("^\\s*\\d+\\s*\\|", raw)
# Group2 = State, Rating (the line directly below each Group1 line)
g2row <- g1row + 1
# subset for easy searching
group1 <- raw[g1row]
group2 <- raw[g2row]

Data Field Creation and Extraction Player Name

# Name cell: the characters after the first "|" up to two before the second,
# with the padding trimmed off, then converted to title case.
namesub <- str_trim(substr(group1, start = b1 + 1, stop = b2 - 2))
PlayerName <- str_to_title(namesub)
head(PlayerName)
## [1] "Gary Hua"            "Dakshesh Daruri"     "Aditya Bajaj"       
## [4] "Patrick H Schilling" "Hanshi Zuo"          "Hansen Song"

Player State

# State code: everything left of the first "|" on the second line of a record.
statesub <- substr(group2, start = b0, stop = b1 - 1)
State <- str_trim(string = statesub)
head(State)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
# Start the result table with the two text columns.
chess <- data.frame(PlayerName = PlayerName, State = State)

Total Points

# Points cell as laid out in the table (shown for inspection only).
pointsub <- substr(group1, b2 + 1, b3 - 1)
head(pointsub)
## [1] "6.0  " "6.0  " "6.0  " "5.5  " "5.5  " "5.0  "
# Take the first decimal number on each player row. Working row by row keeps
# the result aligned with `chess` (a row without a score becomes NA instead of
# shifting every later value), and `\\d+` keeps scores of 10.0 or more intact.
chess$TotalPoints <- as.numeric(str_extract(group1, "\\d+\\.\\d"))

Player Pre-Rating

# Pre-rating: from the second cell of each record's second line, keep the
# ":" followed by a run of at least two digits, then keep just the digits.
presub <- str_extract(substr(group2, start = b1 + 1, stop = b2 - 1), ": *\\d{2,}")
head(presub)
## [1] ": 1794" ": 1553" ": 1384" ": 1716" ": 1655" ": 1686"
chess$PreRating <- as.integer(str_extract(string = presub, pattern = "\\d{2,}"))

Avg Opponent Pre-Rating

# Opponent pair numbers: every run of digits found in the round-result cells
# (everything right of the third "|"), one character vector per player,
# stored as a one-column list matrix.
oppsub <- as.matrix(
  str_extract_all(substr(group1, start = b3 + 1, stop = b4), "\\b\\d{1,}")
)

# Average pre-rating of one player's opponents, rounded to a whole number.
#
# l: a list (for example one row of the `oppsub` list matrix) whose elements
#    are vectors of opponent pair numbers, as character or numeric.
# p: optional index into `l`; when omitted the whole of `l` is used.
#
# Returns a single numeric value. If the selection holds several elements,
# only the last one is averaged, as before. Returns NaN when that element has
# no opponents, and NA when the selection itself is empty (this previously
# failed with "object 'rating' not found").
#
# Reads the ratings from the global data frame `chess` (column `PreRating`),
# indexed by pair number, so `chess$PreRating` must exist before this is called.
calculation <- function(l, p) {
  selected <- if (missing(p)) l else l[p]
  if (length(selected) == 0) {
    return(NA_real_)
  }
  opponent_ids <- as.numeric(selected[[length(selected)]])
  opponent_ratings <- chess$PreRating[opponent_ids]
  round(sum(opponent_ratings) / length(opponent_ratings))
}

# One average per player: hand each row of the opponent list matrix to
# calculation() and collect the results as a numeric vector.
chess$AvgOppPreRating <- vapply(
  seq_len(nrow(oppsub)),
  function(row) calculation(oppsub[row, ]),
  numeric(1)
)

Review Final Dataset

# Render the assembled table as an interactive DT widget.
datatable(data = chess)

Export in .csv

# Write the table to the working directory, without a row-name column.
write.csv(x = chess, file = "chessData.csv", row.names = FALSE)

Data visualization

# Scatter plot of each player's pre-rating against the average pre-rating of
# their opponents, with points coloured by total points.
x <- ggplot(data = chess, mapping = aes(x = PreRating, y = AvgOppPreRating)) +
  geom_point(mapping = aes(color = TotalPoints)) +
  ggtitle("Pre-Rating VS Avg Opponent Pre-Rating by Total Points Gained")
x