D607 Project 1: Create Table Using Regex

The Task

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:

Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605

Load Packages

knitr::opts_chunk$set(#echo=FALSE, 
                      warning=FALSE, 
                      message=FALSE,
                      tidy=TRUE,
                      #comment = "",
                      dev="png", 
                      dev.args=list(type="cairo"))

#https://cran.r-project.org/web/packages/prettydoc/vignettes/
#https://www.rstudio.com/wp-content/uploads/2015/03/rmarkdown-reference.pdf

load.packages <- c("RCurl", "knitr","stringr","prettydoc")


ipak <- function(pkg){
    #FUNCTION SOURCE: https://gist.github.com/stevenworthington/3178163
    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
    if (length(new.pkg)) 
        install.packages(new.pkg, dependencies = TRUE)
    sapply(pkg, require, character.only = TRUE)
}
ipak(load.packages)

## Loading required package: RCurl

## Loading required package: bitops

## Loading required package: knitr

## Loading required package: stringr

## Loading required package: prettydoc

Load the data & take a look at it

my.data <- getURLContent("https://raw.githubusercontent.com/kylegilde/D607-Data-Acquistion/master/data-sets/tournamentinfo.txt")
writeLines(str_sub(my.data, 1, 811))

## -----------------------------------------------------------------------------------------
##  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
##  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## -----------------------------------------------------------------------------------------
##     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
##    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## -----------------------------------------------------------------------------------------
##     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
##    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |

Or click here

Extract the first 7 required attributes

num <- str_trim(unlist(str_extract_all(my.data, " {3,4}\\d{1,2} ")))
name <- data.frame(str_trim(unlist(str_extract_all(my.data, " [[:alpha:] -]{2,30} {6}"))), 
    stringsAsFactors = F)
name <- name[2:65, 1]
state <- str_trim(unlist(str_extract_all(my.data, " {3}[[:alpha:]]{2} ")))
total_points <- as.numeric(str_trim(unlist(str_extract_all(my.data, "\\d\\.\\d"))))
pre_rating <- as.numeric(str_trim(unlist(str_extract_all(str_extract_all(my.data, 
    ": {1,2}\\d{3,4}( {3}|P)"), "\\d+"))))


# to extract the opponent numbers, create a list seperated by newline
# characters
my.rows <- unlist(str_extract_all(my.data, ".+\\n"))
# subset to the rows that have names in them since these rows have the
# opponent numbers in them
my.rows <- str_subset(my.rows, " [[:alpha:] -]{2,30} {6}")
# extract only the digits of opponent numbers
my.rows <- str_extract_all(my.rows, "((W|L|D) {2,3}\\d{1,2})")
# just the digits
my.rows <- str_extract_all(my.rows, "\\d{1,2}")
opponents <- data.frame(sapply(my.rows, str_c, collapse = ","), stringsAsFactors = F)
# remove superfluous row
opponents <- opponents[2:65, 1]

Let’s take a look at the DF so far

# create my df so far
chess.df <- data.frame(num, name, state, total_points, pre_rating, opponents, 
    stringsAsFactors = F)

kable(head(chess.df, 10))

num	name	state	total_points	pre_rating	opponents
1	GARY HUA	ON	6.0	1794	39,21,18,14,7,12,4
2	DAKSHESH DARURI	MI	6.0	1553	63,58,4,17,16,20,7
3	ADITYA BAJAJ	MI	6.0	1384	8,61,25,21,11,13,12
4	PATRICK H SCHILLING	MI	5.5	1716	23,28,2,26,5,19,1
5	HANSHI ZUO	MI	5.5	1655	45,37,12,13,4,14,17
6	HANSEN SONG	OH	5.0	1686	34,29,11,35,10,27,21
7	GARY DEE SWATHELL	MI	5.0	1649	57,46,13,11,1,9,2
8	EZEKIEL HOUGHTON	MI	5.0	1641	3,32,14,9,47,28,19
9	STEFANO LEE	ON	5.0	1411	25,18,59,8,26,7,20
10	ANVIT RAO	MI	5.0	1365	16,19,55,31,6,25,18

Calculate the opponents’ mean pre-rating

opponent_mean <- function(opp.list, opp.scores) {
    # function to calculate the opponent pre-rating means
    list_len <- length(opp.list)
    mean.vec <- rep(NA, list_len)
    for (i in 1:list_len) {
        opps <- c(str_split(opp.list, ",")[[i]])
        opp.mean <- mean(subset(opp.scores[, 2], opp.scores[, 1] %in% opps))
        mean.vec[i] <- opp.mean
    }
    return(mean.vec)
}
chess.df$opponent_pre_rating_mean <- round(opponent_mean(chess.df[, 6], chess.df[, 
    c(1, 5)]))

Let’s take a look at the final data frame & create the CSV

final_chess_df <- chess.df[, c(2:5, 7)]
write.table(final_chess_df, file = "kgilde_chess_df.csv", sep = ",", row.names = F)

kable(head(final_chess_df, 10))

name	state	total_points	pre_rating	opponent_pre_rating_mean
GARY HUA	ON	6.0	1794	1605
DAKSHESH DARURI	MI	6.0	1553	1469
ADITYA BAJAJ	MI	6.0	1384	1564
PATRICK H SCHILLING	MI	5.5	1716	1574
HANSHI ZUO	MI	5.5	1655	1501
HANSEN SONG	OH	5.0	1686	1519
GARY DEE SWATHELL	MI	5.0	1649	1372
EZEKIEL HOUGHTON	MI	5.0	1641	1468
STEFANO LEE	ON	5.0	1411	1523
ANVIT RAO	MI	5.0	1365	1554

The CSV file can be found here

Can we see any correlation between the players’ pre-rating and the mean opponents’ prerating?

There may be a little bit, but it’s not too strong.

pre_rating_opp_pre_rating <- lm(final_chess_df$opponent_pre_rating_mean ~ final_chess_df$pre_rating)
plot(final_chess_df$opponent_pre_rating_mean ~ final_chess_df$pre_rating)
abline(pre_rating_opp_pre_rating)

Was there any correlation between the players’ pre-rating and their total points from the tournament?

Yes, the variables appear correlated.

pre_rating_points <- lm(final_chess_df$total_points ~ final_chess_df$pre_rating)
plot(final_chess_df$total_points ~ final_chess_df$pre_rating)
abline(pre_rating_points)