D607 Project 1: Create Table Using Regex

Kyle Gilde

Feb. 19, 2017

The Task

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:

Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605

Load Packages

knitr::opts_chunk$set(#echo=FALSE, 
                      warning=FALSE, 
                      message=FALSE,
                      tidy=TRUE,
                      #comment = "",
                      dev="png", 
                      dev.args=list(type="cairo"))

#https://cran.r-project.org/web/packages/prettydoc/vignettes/
#https://www.rstudio.com/wp-content/uploads/2015/03/rmarkdown-reference.pdf

load.packages <- c("RCurl", "knitr","stringr","prettydoc")


ipak <- function(pkg){
    #FUNCTION SOURCE: https://gist.github.com/stevenworthington/3178163
    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
    if (length(new.pkg)) 
        install.packages(new.pkg, dependencies = TRUE)
    sapply(pkg, require, character.only = TRUE)
}
ipak(load.packages)
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: knitr
## Loading required package: stringr
## Loading required package: prettydoc

Load the data & take a look at it

my.data <- getURLContent("https://raw.githubusercontent.com/kylegilde/D607-Data-Acquistion/master/data-sets/tournamentinfo.txt")
writeLines(str_sub(my.data, 1, 811))
## -----------------------------------------------------------------------------------------
##  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
##  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## -----------------------------------------------------------------------------------------
##     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
##    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## -----------------------------------------------------------------------------------------
##     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
##    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |

Or click here

Extract the first 7 required attributes

num <- str_trim(unlist(str_extract_all(my.data, " {3,4}\\d{1,2} ")))
name <- data.frame(str_trim(unlist(str_extract_all(my.data, " [[:alpha:] -]{2,30} {6}"))), 
    stringsAsFactors = F)
name <- name[2:65, 1]
state <- str_trim(unlist(str_extract_all(my.data, " {3}[[:alpha:]]{2} ")))
total_points <- as.numeric(str_trim(unlist(str_extract_all(my.data, "\\d\\.\\d"))))
pre_rating <- as.numeric(str_trim(unlist(str_extract_all(str_extract_all(my.data, 
    ": {1,2}\\d{3,4}( {3}|P)"), "\\d+"))))


# to extract the opponent numbers, create a list seperated by newline
# characters
my.rows <- unlist(str_extract_all(my.data, ".+\\n"))
# subset to the rows that have names in them since these rows have the
# opponent numbers in them
my.rows <- str_subset(my.rows, " [[:alpha:] -]{2,30} {6}")
# extract only the digits of opponent numbers
my.rows <- str_extract_all(my.rows, "((W|L|D) {2,3}\\d{1,2})")
# just the digits
my.rows <- str_extract_all(my.rows, "\\d{1,2}")
opponents <- data.frame(sapply(my.rows, str_c, collapse = ","), stringsAsFactors = F)
# remove superfluous row
opponents <- opponents[2:65, 1]

Let’s take a look at the DF so far

# create my df so far
chess.df <- data.frame(num, name, state, total_points, pre_rating, opponents, 
    stringsAsFactors = F)

kable(head(chess.df, 10))
num name state total_points pre_rating opponents
1 GARY HUA ON 6.0 1794 39,21,18,14,7,12,4
2 DAKSHESH DARURI MI 6.0 1553 63,58,4,17,16,20,7
3 ADITYA BAJAJ MI 6.0 1384 8,61,25,21,11,13,12
4 PATRICK H SCHILLING MI 5.5 1716 23,28,2,26,5,19,1
5 HANSHI ZUO MI 5.5 1655 45,37,12,13,4,14,17
6 HANSEN SONG OH 5.0 1686 34,29,11,35,10,27,21
7 GARY DEE SWATHELL MI 5.0 1649 57,46,13,11,1,9,2
8 EZEKIEL HOUGHTON MI 5.0 1641 3,32,14,9,47,28,19
9 STEFANO LEE ON 5.0 1411 25,18,59,8,26,7,20
10 ANVIT RAO MI 5.0 1365 16,19,55,31,6,25,18

Calculate the opponents’ mean pre-rating

opponent_mean <- function(opp.list, opp.scores) {
    # function to calculate the opponent pre-rating means
    list_len <- length(opp.list)
    mean.vec <- rep(NA, list_len)
    for (i in 1:list_len) {
        opps <- c(str_split(opp.list, ",")[[i]])
        opp.mean <- mean(subset(opp.scores[, 2], opp.scores[, 1] %in% opps))
        mean.vec[i] <- opp.mean
    }
    return(mean.vec)
}
chess.df$opponent_pre_rating_mean <- round(opponent_mean(chess.df[, 6], chess.df[, 
    c(1, 5)]))

Let’s take a look at the final data frame & create the CSV

final_chess_df <- chess.df[, c(2:5, 7)]
write.table(final_chess_df, file = "kgilde_chess_df.csv", sep = ",", row.names = F)

kable(head(final_chess_df, 10))
name state total_points pre_rating opponent_pre_rating_mean
GARY HUA ON 6.0 1794 1605
DAKSHESH DARURI MI 6.0 1553 1469
ADITYA BAJAJ MI 6.0 1384 1564
PATRICK H SCHILLING MI 5.5 1716 1574
HANSHI ZUO MI 5.5 1655 1501
HANSEN SONG OH 5.0 1686 1519
GARY DEE SWATHELL MI 5.0 1649 1372
EZEKIEL HOUGHTON MI 5.0 1641 1468
STEFANO LEE ON 5.0 1411 1523
ANVIT RAO MI 5.0 1365 1554

The CSV file can be found here

Can we see any correlation between the players’ pre-rating and the mean opponents’ prerating?

  • There may be a little bit, but it’s not too strong.
pre_rating_opp_pre_rating <- lm(final_chess_df$opponent_pre_rating_mean ~ final_chess_df$pre_rating)
plot(final_chess_df$opponent_pre_rating_mean ~ final_chess_df$pre_rating)
abline(pre_rating_opp_pre_rating)

Was there any correlation between the players’ pre-rating and their total points from the tournament?

  • Yes, the variables appear correlated.
pre_rating_points <- lm(final_chess_df$total_points ~ final_chess_df$pre_rating)
plot(final_chess_df$total_points ~ final_chess_df$pre_rating)
abline(pre_rating_points)