In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents.
Install packages for this project
if (!require('DT')) install.packages('DT')
## Loading required package: DT
if (!require('stringr')) install.packages('stringr')
## Loading required package: stringr
if (!require('ggplot2')) install.packages('ggplot2')
## Loading required package: ggplot2
library(DT)
library(stringr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(tidyr)
library(knitr)
load chess tournament results
# Read the raw tournament report, one character string per line of the file.
raw <- readLines("C:/Users/tbao/Desktop/CUNY MSDS notes/607/project 1 wk4/tournamentinfo.txt")
## Warning in readLines("C:/Users/tbao/Desktop/CUNY MSDS notes/607/project 1
## wk4/tournamentinfo.txt"): incomplete final line found on 'C:/Users/tbao/
## Desktop/CUNY MSDS notes/607/project 1 wk4/tournamentinfo.txt'
# Blank out only the dashed separator rows. Removing every "-" from the text
# would also delete hyphens inside data fields (hyphenated names, the "->"
# rating arrow), which alters the names and shifts the "|" column positions
# on those rows.
raw[str_detect(raw, "^\\s*-+\\s*$")] <- ""
print(raw[1:9])
## [1] ""
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre>Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] ""
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 >1817 |N:2 |W |B |W |B |W |B |W |"
## [7] ""
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 >1663 |N:2 |B |W |B |W |B |W |B |"
Data grouping and splitting: find the table breaks
# Column boundaries used by the substr() calls further down.
# b1..b3 are the positions of the first three "|" characters on the first
# player row (raw[5]); b0 is the left edge and b4 the longest line length.
b0 <- 0
b1 <- str_locate_all(raw[5], "\\|")[[1]][[1, "start"]]
b2 <- str_locate_all(raw[5], "\\|")[[1]][[2, "start"]]
b3 <- str_locate_all(raw[5], "\\|")[[1]][[3, "start"]]
b4 <- max(nchar(raw))
Data separating
# Player records start on line 5 and repeat every 3 lines: two data rows
# followed by a separator row. The end of the sequence is taken from the
# length of the input instead of a fixed line number, so a report with a
# different number of players is still split correctly.
# Group1 = Num, Name, Points, Opponent IDs
g1row <- seq(5, length(raw) - 1, by = 3)
# Group2 = State, Rating (always the row right after its Group1 row, so the
# two groups stay paired one-to-one)
g2row <- g1row + 1
# subset for easy searching
group1 <- raw[g1row]
group2 <- raw[g2row]
Data field creation and extraction: player name
# Player name: the text between the first and second "|" on each group-1
# row, with the padding trimmed and the casing changed to Title Case.
namesub <- str_trim(substr(group1, start = b1 + 1, stop = b2 - 2))
PlayerName <- str_to_title(string = namesub)
head(PlayerName)
## [1] "Gary Hua" "Dakshesh Daruri" "Aditya Bajaj"
## [4] "Patrick H Schilling" "Hanshi Zuo" "Hansen Song"
Player State
# State code: the text left of the first "|" on each group-2 row.
statesub <- substr(group2, start = b0, stop = b1 - 1)
State <- str_trim(string = statesub)
head(State)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
# Start the result table with one row per player.
chess <- data.frame(PlayerName = PlayerName, State = State)
Total Points
# Fixed-width view of the points column (printed for inspection only).
pointsub <- substr(group1, b2 + 1, b3 - 1)
head(pointsub)
## [1] "6.0 " "6.0 " "6.0 " "5.5 " "5.5 " "5.0 "
# Take the first "d.d" token from each player row. Searching only group1 and
# using str_extract() (exactly one result per row, NA when absent) keeps the
# values aligned with the rows of `chess`; scanning every line of `raw` and
# flattening the matches would misalign or fail if any line had zero or
# several matches.
chess$TotalPoints <- as.numeric(str_extract(group1, "(\\d)\\.(\\d)"))
Player Pre-Rating
# Pre-tournament rating: locate the ": <digits>" token inside the second
# field of each group-2 row, then keep only the digits as an integer.
presub <- str_extract(substr(group2, b1 + 1, b2 - 1), ": *\\d{2,}")
head(presub)
## [1] ": 1794" ": 1553" ": 1384" ": 1716" ": 1655" ": 1686"
chess[["PreRating"]] <- as.integer(str_extract(presub, "\\d{2,}"))
Avg Opponent Pre-Rating
# Opponent pair numbers: every run of digits found to the right of the third
# "|" on each group-1 row. as.matrix() turns the resulting list into a
# one-column list-matrix with one row per player.
oppsub <- as.matrix(
  str_extract_all(substr(group1, b3 + 1, b4), "\\b\\d{1,}")
)
# Average pre-rating of one player's opponents, rounded to a whole number.
#
# Arguments:
#   l       A list whose elements are vectors of opponent pair numbers
#           (character or numeric). apply() over the rows of `oppsub` passes
#           a list of length one.
#   p       Optional index into `l`. When omitted, all of `l` is used.
#   ratings Vector of pre-ratings indexed by pair number. Defaults to
#           chess$PreRating, which is looked up when the function is called.
#
# Returns a single number. If several vectors are selected, only the last one
# is used, matching how this function behaved before. Returns NA when nothing
# is selected and NaN when the selected vector holds no opponents.
calculation <- function(l, p, ratings = chess$PreRating) {
  selected <- if (missing(p)) l else l[p]
  if (length(selected) == 0) {
    return(NA_real_)
  }
  ids <- as.numeric(selected[[length(selected)]])
  # sum / count (rather than mean()) keeps the 0 / 0 = NaN result for a
  # player with no recorded opponents.
  round(sum(ratings[ids]) / length(ids))
}
# One average per row of oppsub (MARGIN = 1), i.e. one per player.
chess[["AvgOppPreRating"]] <- apply(X = oppsub, MARGIN = 1, FUN = calculation)
Review Final Dataset
# Show the assembled table as a DT widget.
datatable(data = chess)
Export in .csv
# Write the table to chessData.csv in the working directory, without the
# row-name column.
write.csv(x = chess, file = "chessData.csv", row.names = FALSE)
Data visualization
# Scatter plot of each player's pre-rating against the average pre-rating of
# their opponents, with points coloured by total points.
x <- ggplot(data = chess, mapping = aes(x = PreRating, y = AvgOppPreRating)) +
  geom_point(mapping = aes(color = TotalPoints)) +
  ggtitle("Pre-Rating VS Avg Opponent Pre-Rating by Total Points Gained")
x