Task:

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:

Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents For the first player, the information would be:

Gary Hua, ON, 6.0, 1794, 1605

1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.

If you have questions about the meaning of the data or the results, please post them on the discussion forum. Data science, like chess, is a game of back and forth…

The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments.

You may substitute another text file (or set of text files, or data scraped from web pages) of similar or greater complexity, and create your own assignment and solution. You may work in a small team. All of your code should be in an R markdown file (and published to rpubs.com); with your data accessible for the person running the script.

# Load necessary libraries
library(stringr)
library(DT)

Load the Data

# Load the data from the file hosted in my github project folder

chess_data <- read.table(url("https://raw.githubusercontent.com/JoshuaSturm/CUNY_MSDA/Public/Fall_2017/DATA_607/Project%201/tournamentinfo.txt"), sep = ",")

head(chess_data)

##                                                                                           V1
## 1  -----------------------------------------------------------------------------------------
## 2  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 3  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 4  -----------------------------------------------------------------------------------------
## 5      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 6     ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |

# Since the regex I made will include the top row, I can exclude it since it doesn't have relevant info, anyway

chess_data <- chess_data[c(5:nrow(chess_data)),]

# Preview the table format

head(chess_data)

## [1]     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## [2]    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## [3] -----------------------------------------------------------------------------------------
## [4]     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## [5]    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## [6] -----------------------------------------------------------------------------------------
## 131 Levels: ----------------------------------------------------------------------------------------- ...

Extracting the variables we want

Extract player IDs

id_pattern <- "\\d{1,2}(?=\\s\\|)"
chess_id <- unlist(str_extract_all(unlist(chess_data), id_pattern))

Extract player names

# Extract the player names
# Note that I made use of the website 'RegExr.com'

# The pattern I made is multiple capital letters, followed by a space, with at least two matches

name_pattern <- "([[:upper:]]+\\s){2,}"
chess_names <- unlist(str_extract_all(unlist(chess_data), name_pattern))
str_trim(chess_names, side = "right")

##  [1] "GARY HUA"                 "DAKSHESH DARURI"         
##  [3] "ADITYA BAJAJ"             "PATRICK H SCHILLING"     
##  [5] "HANSHI ZUO"               "HANSEN SONG"             
##  [7] "GARY DEE SWATHELL"        "EZEKIEL HOUGHTON"        
##  [9] "STEFANO LEE"              "ANVIT RAO"               
## [11] "CAMERON WILLIAM MC LEMAN" "KENNETH J TACK"          
## [13] "TORRANCE HENRY JR"        "BRADLEY SHAW"            
## [15] "ZACHARY JAMES HOUGHTON"   "MIKE NIKITIN"            
## [17] "RONALD GRZEGORCZYK"       "DAVID SUNDEEN"           
## [19] "DIPANKAR ROY"             "JASON ZHENG"             
## [21] "DINH DANG BUI"            "EUGENE L MCCLURE"        
## [23] "ALAN BUI"                 "MICHAEL R ALDRICH"       
## [25] "LOREN SCHWIEBERT"         "MAX ZHU"                 
## [27] "GAURAV GIDWANI"           "SOFIA ADINA"             
## [29] "CHIEDOZIE OKORIE"         "GEORGE AVERY JONES"      
## [31] "RISHI SHETTY"             "JOSHUA PHILIP MATHEWS"   
## [33] "JADE GE"                  "MICHAEL JEFFERY THOMAS"  
## [35] "JOSHUA DAVID LEE"         "SIDDHARTH JHA"           
## [37] "AMIYATOSH PWNANANDAM"     "BRIAN LIU"               
## [39] "JOEL R HENDON"            "FOREST ZHANG"            
## [41] "KYLE WILLIAM MURPHY"      "JARED GE"                
## [43] "ROBERT GLEN VASEY"        "JUSTIN D SCHILLING"      
## [45] "DEREK YAN"                "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT"              "DANIEL KHAIN"            
## [49] "MICHAEL J MARTIN"         "SHIVAM JHA"              
## [51] "TEJAS AYYAGARI"           "ETHAN GUO"               
## [53] "JOSE C YBARRA"            "LARRY HODGE"             
## [55] "ALEX KONG"                "MARISA RICCI"            
## [57] "MICHAEL LU"               "VIRAJ MOHILE"            
## [59] "SEAN M MC CORMICK"        "JULIA SHEN"              
## [61] "JEZZEL FARKAS"            "ASHWIN BALAJI"           
## [63] "THOMAS JOSEPH HOSMER"     "BEN LI"

Extract player states

# Extract each player's state

# The pattern I made is 2 capital letters, followed by a space, followed by a '|'

state_pattern <- "([[:upper:]]){2}\\s(?=\\|)"
chess_states <- unlist(str_extract_all(unlist(chess_data), state_pattern))
str_trim(chess_states, side = "right")

##  [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"

Extract total number of points

# Extract each player's total number of points

# The pattern I made is two digits with a period between them

points_pattern <- "\\d\\.\\d"
chess_points <- unlist(str_extract_all(unlist(chess_data), points_pattern))
chess_points

##  [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" "4.5"
## [12] "4.5" "4.5" "4.5" "4.5" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0"
## [23] "4.0" "4.0" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5"
## [34] "3.5" "3.5" "3.5" "3.5" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0"
## [45] "3.0" "3.0" "2.5" "2.5" "2.5" "2.5" "2.5" "2.5" "2.0" "2.0" "2.0"
## [56] "2.0" "2.0" "2.0" "2.0" "1.5" "1.5" "1.0" "1.0" "1.0"

Extract player’s pre-rating

# Extract each player's pre-rating

# This one was tricky. It would include some of the 3 number post-ratings. I experimented with about a dozen patterns before I created one that worked perfectly. It excludes leading patterns with a > and a space. Then looks for between 1 and two spaces, or one space and a colon, followed by 3 or 4 digits, followed by a space or the letter 'P'

pre_pattern <- "(?<!\\>\\s)(?<=\\s{1,2}|\\s\\:)(\\d{3,4}(?=\\s|P))"
chess_pre <- unlist(str_extract_all(unlist(chess_data), pre_pattern))
chess_pre <- str_trim(chess_pre)

Extract average pre-rating of opponents

# Extract the average of opponents' pre-rating

# Begin by vectorizing each player's opponents

opponents_pattern <- "(\\d{1,}|[[:blank:]]{1})(?=\\|)"
player_opponents <- unlist(str_extract_all(unlist(chess_data), opponents_pattern))
player_opponents[player_opponents==" "]  <- NA
opponent1 <- as.numeric(player_opponents[seq(4, length(player_opponents), 10)])
opponent1 <- as.numeric(opponent1[seq(1, length(opponent1), 2)])
opponent2 <- as.numeric(player_opponents[seq(5, length(player_opponents), 10)])
opponent2 <- as.numeric(opponent2[seq(1, length(opponent2), 2)])
opponent3 <- as.numeric(player_opponents[seq(6, length(player_opponents), 10)])
opponent3 <- as.numeric(opponent3[seq(1, length(opponent3), 2)])
opponent4 <- as.numeric(player_opponents[seq(7, length(player_opponents), 10)])
opponent4 <- as.numeric(opponent4[seq(1, length(opponent4), 2)])
opponent5 <- as.numeric(player_opponents[seq(8, length(player_opponents), 10)])
opponent5 <- as.numeric(opponent5[seq(1, length(opponent5), 2)])
opponent6 <- as.numeric(player_opponents[seq(9, length(player_opponents), 10)])
opponent6 <- as.numeric(opponent6[seq(1, length(opponent6), 2)])
opponent7 <- as.numeric(player_opponents[seq(10, length(player_opponents), 10)])
opponent7 <- as.numeric(opponent7[seq(1, length(opponent7), 2)])
player_opponents <- matrix(c(opponent1, opponent2, opponent3, opponent4, opponent5, opponent6, opponent7),nrow = 64, ncol = 7)

# Match the opponent to the player's id

chess_avg <- 0
chess_table <- 0
for (i in 1:(length(chess_id)))
{
  chess_avg[i] <- mean(as.numeric(chess_pre[player_opponents[i,]]), na.rm = T)
}

Create a data table of all info

chess_table <- data.frame(chess_names, chess_states, chess_points, chess_pre, chess_avg)
datatable(chess_table, extensions = 'Scroller', options = list(
  deferRender = TRUE,
  scrollY = 200,
  scroller = TRUE
))

Write info to .csv file

write.csv(chess_table, file = "chessTable.csv")

DATA 607 - Project 1

Joshua Sturm

September 18, 2017