The Task
In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605
Load Packages
# Global chunk options applied to every code chunk of the knitted report.
# Options that are commented out are intentionally left disabled.
knitr::opts_chunk$set(
  # echo = FALSE,
  warning = FALSE,
  message = FALSE,
  tidy = TRUE,
  # comment = "",
  dev = "png",
  dev.args = list(type = "cairo")
)
#https://cran.r-project.org/web/packages/prettydoc/vignettes/
#https://www.rstudio.com/wp-content/uploads/2015/03/rmarkdown-reference.pdf
# Packages this report needs; passed to ipak() to be installed and attached
load.packages <- c(
  "RCurl",
  "knitr",
  "stringr",
  "prettydoc"
)
ipak <- function(pkg) {
  # Install any packages in `pkg` that are not installed yet, then attach all
  # of them.
  # FUNCTION SOURCE: https://gist.github.com/stevenworthington/3178163
  #
  # Args:
  #   pkg: character vector of package names.
  #
  # Returns:
  #   A logical vector named by `pkg`; each element is the result of
  #   require() for that package (TRUE when it was attached).
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg) > 0) {
    install.packages(new.pkg, dependencies = TRUE)
  }
  # vapply (unlike sapply) always yields a logical vector, even for empty `pkg`
  vapply(pkg, require, logical(1), character.only = TRUE)
}
# Install anything missing and attach every package named in load.packages
ipak(load.packages)
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: knitr
## Loading required package: stringr
## Loading required package: prettydoc
Load the data & take a look at it
# Fetch the raw tournament results text from GitHub
my.data <- getURLContent(
  "https://raw.githubusercontent.com/kylegilde/D607-Data-Acquistion/master/data-sets/tournamentinfo.txt"
)
# Print the first 811 characters as a preview of the file layout
writeLines(str_sub(my.data, start = 1, end = 811))
## -----------------------------------------------------------------------------------------
## Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
## Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
## -----------------------------------------------------------------------------------------
## 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## -----------------------------------------------------------------------------------------
## 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
Or click here
Extract the first 6 attributes
# Pair numbers: 3-4 spaces, 1-2 digits, then a space
num <- str_trim(unlist(str_extract_all(my.data, " {3,4}\\d{1,2} ")))
# Name-like fields: 2-30 letters/spaces/hyphens followed by 6 spaces of padding
name <- str_trim(unlist(str_extract_all(my.data, " [[:alpha:] -]{2,30} {6}")))
# Drop the first match, which is not a player (presumably the column header;
# this assumes the pattern matches exactly one non-player row)
name <- name[-1]
# State / province: 3 spaces, two letters, then a space
state <- str_trim(unlist(str_extract_all(my.data, " {3}[[:alpha:]]{2} ")))
# Total points: the only digit.digit values in the file
total_points <- as.numeric(unlist(str_extract_all(my.data, "\\d\\.\\d")))
# Pre-rating: first locate each ": <rating>" field (followed by 3 spaces or a
# provisional "P"), then take the digits of each field. The fields are
# unlisted before the second extraction because stringr expects a character
# vector, not a list.
pre_rating <- as.numeric(str_extract(
  unlist(str_extract_all(my.data, ": {1,2}\\d{3,4}( {3}|P)")),
  "\\d+"
))
# To extract the opponent numbers, first split the raw text into rows
# (each match is one line together with its trailing newline)
my.rows <- unlist(str_extract_all(my.data, ".+\\n"))
# Keep only the rows that contain a name-like field, since these rows also
# hold the per-round results and opponent numbers
my.rows <- str_subset(my.rows, " [[:alpha:] -]{2,30} {6}")
# Per row, pull out every played game: a W/L/D result plus the opponent number
my.rows <- str_extract_all(my.rows, "((W|L|D) {2,3}\\d{1,2})")
# Reduce each game to just its opponent number. This is done row by row with
# lapply because stringr expects a character vector, not a list; a row
# without any played game then simply yields no opponents.
my.rows <- lapply(my.rows, str_extract, pattern = "\\d{1,2}")
# One comma-separated string of opponent numbers per row
opponents <- vapply(my.rows, paste, character(1), collapse = ",")
# Remove the superfluous first row (presumably the column-header line; this
# assumes it is the only non-player row that matched)
opponents <- opponents[-1]
Let’s take a look at the DF so far
# Combine the attributes extracted so far into one data frame, one row per player
chess.df <- data.frame(
  num = num,
  name = name,
  state = state,
  total_points = total_points,
  pre_rating = pre_rating,
  opponents = opponents,
  stringsAsFactors = FALSE
)
# Show the first ten rows as a table
kable(head(chess.df, n = 10))
| num | name | state | total_points | pre_rating | opponents |
|---|---|---|---|---|---|
| 1 | GARY HUA | ON | 6.0 | 1794 | 39,21,18,14,7,12,4 |
| 2 | DAKSHESH DARURI | MI | 6.0 | 1553 | 63,58,4,17,16,20,7 |
| 3 | ADITYA BAJAJ | MI | 6.0 | 1384 | 8,61,25,21,11,13,12 |
| 4 | PATRICK H SCHILLING | MI | 5.5 | 1716 | 23,28,2,26,5,19,1 |
| 5 | HANSHI ZUO | MI | 5.5 | 1655 | 45,37,12,13,4,14,17 |
| 6 | HANSEN SONG | OH | 5.0 | 1686 | 34,29,11,35,10,27,21 |
| 7 | GARY DEE SWATHELL | MI | 5.0 | 1649 | 57,46,13,11,1,9,2 |
| 8 | EZEKIEL HOUGHTON | MI | 5.0 | 1641 | 3,32,14,9,47,28,19 |
| 9 | STEFANO LEE | ON | 5.0 | 1411 | 25,18,59,8,26,7,20 |
| 10 | ANVIT RAO | MI | 5.0 | 1365 | 16,19,55,31,6,25,18 |
Calculate the opponents’ mean pre-rating
opponent_mean <- function(opp.list, opp.scores) {
  # Calculate the mean pre-rating of each player's opponents.
  #
  # Args:
  #   opp.list:   character vector; each element is one player's opponent
  #               ids as a comma-separated string, e.g. "39,21,18".
  #   opp.scores: two-column table (data frame or matrix); column 1 holds the
  #               player ids, column 2 the matching pre-ratings.
  #
  # Returns:
  #   Numeric vector with one mean per element of `opp.list` (length 0 for
  #   empty input). An element whose ids match no player gives NaN.
  ids <- opp.scores[, 1]
  ratings <- opp.scores[, 2]
  # Split every opponent string once, up front, instead of once per player
  opp.ids <- strsplit(as.character(opp.list), ",", fixed = TRUE)
  vapply(
    opp.ids,
    function(opps) mean(ratings[ids %in% opps]),
    numeric(1),
    USE.NAMES = FALSE
  )
}
# Add each player's mean opponent pre-rating, rounded to a whole number.
# The lookup table passed to opponent_mean() is the num / pre_rating pair.
chess.df$opponent_pre_rating_mean <- round(
  opponent_mean(
    chess.df[["opponents"]],
    chess.df[, c("num", "pre_rating")]
  )
)
Let’s take a look at the final data frame & create the CSV
# Keep only the columns required in the output, in the required order
final_chess_df <- chess.df[, c(
  "name",
  "state",
  "total_points",
  "pre_rating",
  "opponent_pre_rating_mean"
)]
# Write the result as a comma-separated file without row names
write.table(
  final_chess_df,
  file = "kgilde_chess_df.csv",
  sep = ",",
  row.names = FALSE
)
# Show the first ten rows as a table
kable(head(final_chess_df, n = 10))
| name | state | total_points | pre_rating | opponent_pre_rating_mean |
|---|---|---|---|---|
| GARY HUA | ON | 6.0 | 1794 | 1605 |
| DAKSHESH DARURI | MI | 6.0 | 1553 | 1469 |
| ADITYA BAJAJ | MI | 6.0 | 1384 | 1564 |
| PATRICK H SCHILLING | MI | 5.5 | 1716 | 1574 |
| HANSHI ZUO | MI | 5.5 | 1655 | 1501 |
| HANSEN SONG | OH | 5.0 | 1686 | 1519 |
| GARY DEE SWATHELL | MI | 5.0 | 1649 | 1372 |
| EZEKIEL HOUGHTON | MI | 5.0 | 1641 | 1468 |
| STEFANO LEE | ON | 5.0 | 1411 | 1523 |
| ANVIT RAO | MI | 5.0 | 1365 | 1554 |
The CSV file can be found here
Is there any correlation between the players’ pre-ratings and their opponents’ mean pre-rating?
- There may be a little bit, but it’s not too strong.
# Fit a linear model of the opponents' mean pre-rating on the player's own pre-rating
pre_rating_opp_pre_rating <- lm(final_chess_df$opponent_pre_rating_mean ~ final_chess_df$pre_rating)
# Scatter plot of the same two variables
plot(final_chess_df$opponent_pre_rating_mean ~ final_chess_df$pre_rating)
# Overlay the fitted regression line on the scatter plot
abline(pre_rating_opp_pre_rating)
Was there any correlation between the players’ pre-rating and their total points from the tournament?
- Yes, the variables appear correlated.
# Fit a linear model of total tournament points on the player's pre-rating
pre_rating_points <- lm(final_chess_df$total_points ~ final_chess_df$pre_rating)
# Scatter plot of the same two variables
plot(final_chess_df$total_points ~ final_chess_df$pre_rating)
# Overlay the fitted regression line on the scatter plot
abline(pre_rating_points)