Project 1

In this project, we're given a text file of chess tournament results in which the information follows a regular structure. The task is to generate a .CSV file (that could, for example, be imported into a SQL database) with the following information for all of the players:

PlayerName	State	TotalPoints	PreRating	AvgOppPreRating
Gary Hua	ON	6.0	1794	1605

#Load necessary libraries
library(stringr)
library(DT)
library(dplyr)
library(tidyverse)

LOAD DATA

# Read the tournament results straight from the GitHub repository.
# The file has no header row, and strings are kept as plain character data.
get_url <- 'https://raw.githubusercontent.com/ErindaB/Data607_Assignment/master/tournamentinfo.txt'
raw_data <- read.delim(
  get_url,
  header = FALSE,
  stringsAsFactors = FALSE
)
head(raw_data)

##                                                                                           V1
## 1  -----------------------------------------------------------------------------------------
## 2  Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 3  Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 4  -----------------------------------------------------------------------------------------
## 5      1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 6     ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |

The chess tournament text file is laid out for human readers, not in a tabular format that R can use directly. It needs to be restructured before we can compute the average pre-tournament rating of each player's opponents.

# Drop the first four lines of the file: the opening dashed separator, the two
# header rows and the dashed separator that follows them. Negative indexing is
# used instead of 5:nrow(raw_data), which would count backwards on a file with
# fewer than five lines.
raw_data <- raw_data[-seq_len(4), ]
head(raw_data)

## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [5] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "-----------------------------------------------------------------------------------------"

Extracting main variables from the file

# Pair numbers: one or two digits immediately followed by a whitespace
# character and a "|".
id_var <- "\\d{1,2}(?=\\s\\|)"
chess_id <- raw_data %>%
  unlist() %>%
  str_extract_all(id_var) %>%
  unlist()

Extract player’s names

# Extract the players' names.
# A name is a run of at least two upper-case words, each followed by one
# whitespace character.
name <- "([[:upper:]]+\\s){2,}"
player_name <- unlist(str_extract_all(unlist(raw_data), name))

# The pattern also captures the whitespace after the last word. Store the
# trimmed result (previously it was only printed and then discarded) so the
# names are clean wherever player_name is used later.
player_name <- str_trim(player_name, side = "right")
player_name

##  [1] "GARY HUA"                 "DAKSHESH DARURI"         
##  [3] "ADITYA BAJAJ"             "PATRICK H SCHILLING"     
##  [5] "HANSHI ZUO"               "HANSEN SONG"             
##  [7] "GARY DEE SWATHELL"        "EZEKIEL HOUGHTON"        
##  [9] "STEFANO LEE"              "ANVIT RAO"               
## [11] "CAMERON WILLIAM MC LEMAN" "KENNETH J TACK"          
## [13] "TORRANCE HENRY JR"        "BRADLEY SHAW"            
## [15] "ZACHARY JAMES HOUGHTON"   "MIKE NIKITIN"            
## [17] "RONALD GRZEGORCZYK"       "DAVID SUNDEEN"           
## [19] "DIPANKAR ROY"             "JASON ZHENG"             
## [21] "DINH DANG BUI"            "EUGENE L MCCLURE"        
## [23] "ALAN BUI"                 "MICHAEL R ALDRICH"       
## [25] "LOREN SCHWIEBERT"         "MAX ZHU"                 
## [27] "GAURAV GIDWANI"           "SOFIA ADINA"             
## [29] "CHIEDOZIE OKORIE"         "GEORGE AVERY JONES"      
## [31] "RISHI SHETTY"             "JOSHUA PHILIP MATHEWS"   
## [33] "JADE GE"                  "MICHAEL JEFFERY THOMAS"  
## [35] "JOSHUA DAVID LEE"         "SIDDHARTH JHA"           
## [37] "AMIYATOSH PWNANANDAM"     "BRIAN LIU"               
## [39] "JOEL R HENDON"            "FOREST ZHANG"            
## [41] "KYLE WILLIAM MURPHY"      "JARED GE"                
## [43] "ROBERT GLEN VASEY"        "JUSTIN D SCHILLING"      
## [45] "DEREK YAN"                "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT"              "DANIEL KHAIN"            
## [49] "MICHAEL J MARTIN"         "SHIVAM JHA"              
## [51] "TEJAS AYYAGARI"           "ETHAN GUO"               
## [53] "JOSE C YBARRA"            "LARRY HODGE"             
## [55] "ALEX KONG"                "MARISA RICCI"            
## [57] "MICHAEL LU"               "VIRAJ MOHILE"            
## [59] "SEAN M MC CORMICK"        "JULIA SHEN"              
## [61] "JEZZEL FARKAS"            "ASHWIN BALAJI"           
## [63] "THOMAS JOSEPH HOSMER"     "BEN LI"

Extract player’s total points

# Total points per player: a single digit, a decimal point and a single digit
# (e.g. "6.0").
points <- "\\d\\.\\d"
total_points <- raw_data %>%
  unlist() %>%
  str_extract_all(points) %>%
  unlist()
total_points

##  [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" "4.5"
## [12] "4.5" "4.5" "4.5" "4.5" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0"
## [23] "4.0" "4.0" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5"
## [34] "3.5" "3.5" "3.5" "3.5" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0"
## [45] "3.0" "3.0" "2.5" "2.5" "2.5" "2.5" "2.5" "2.5" "2.0" "2.0" "2.0"
## [56] "2.0" "2.0" "2.0" "2.0" "1.5" "1.5" "1.0" "1.0" "1.0"

Extract states

# Extract each player's state.
# A state is two upper-case letters followed by a whitespace character that
# sits directly before a "|".
state <- "([[:upper:]]){2}\\s(?=\\|)"
states <- unlist(str_extract_all(unlist(raw_data), state))

# The match includes the trailing whitespace character. Store the trimmed
# result (previously it was only printed and then discarded) so that states
# holds the bare two-letter codes.
states <- str_trim(states, side = "right")
states

##  [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"

Extract pre-rating

# Extract each player's pre-tournament rating.
# The pattern matches 3 or 4 digits that are
#   - not preceded by "> " (presumably to skip a post-rating written after
#     "->" with a space),
#   - preceded by one or two whitespace characters, or by whitespace and ":",
#   - followed by whitespace or the letter "P".
feature <- "(?<!\\>\\s)(?<=\\s{1,2}|\\s\\:)(\\d{3,4}(?=\\s|P))"
pre_rating <- str_trim(
  unlist(
    str_extract_all(unlist(raw_data), feature)
  )
)
pre_rating

##  [1] "1794" "1553" "1384" "1716" "1655" "1686" "1649" "1641" "1411" "1365"
## [11] "1712" "1663" "1666" "1610" "1220" "1604" "1629" "1600" "1564" "1595"
## [21] "1563" "1555" "1363" "1229" "1745" "1579" "1552" "1507" "1602" "1522"
## [31] "1494" "1441" "1449" "1399" "1438" "1355" "980"  "1423" "1436" "1348"
## [41] "1403" "1332" "1283" "1199" "1242" "377"  "1362" "1382" "1291" "1056"
## [51] "1011" "935"  "1393" "1270" "1186" "1153" "1092" "917"  "853"  "967" 
## [61] "955"  "1530" "1175" "1163"

Extract average of opponents' pre-rating

# Capture the token that sits immediately before every "|": either a run of
# digits (an opponent's pair number) or a single blank (no number in that cell).
opponents_pattern <- "(\\d{1,}|[[:blank:]]{1})(?=\\|)"
player_opponents <- unlist(str_extract_all(unlist(raw_data), opponents_pattern))
player_opponents[player_opponents == " "] <- NA

# Every data line has 10 "|"-terminated cells, so it contributes 10 tokens.
# Each player occupies two consecutive data lines, and the opponent numbers
# are in cells 4 to 10 (rounds 1 to 7) of the first of those lines.
cells_per_line <- 10
round_cells <- 4:10

# Guard against a layout that does not match the assumption above; otherwise
# the reshape below would silently misalign players and rounds.
stopifnot(length(player_opponents) %% (2 * cells_per_line) == 0)

# One row per data line, one column per cell.
cells <- matrix(
  as.numeric(player_opponents),
  ncol = cells_per_line,
  byrow = TRUE
)

# Keep the first line of every player (odd rows) and the seven round cells.
# The result is a players x 7 numeric matrix of opponent pair numbers, with NA
# where no opponent number was present. The number of rows now follows the
# data instead of being fixed at 64.
is_first_line <- seq_len(nrow(cells)) %% 2 == 1
player_opponents <- cells[is_first_line, round_cells, drop = FALSE]

# Match the opponents to their pre-ratings and average them per player.
# Row i of player_opponents holds the pair numbers of player i's opponents;
# those numbers are used directly as positions in pre_rating. NA entries
# (rounds without an opponent number) are dropped from the mean.
# vapply() over seq_along() replaces the growing-vector loop over
# 1:length(chess_id), which would misbehave on an empty chess_id.
AvgOppRating <- vapply(
  seq_along(chess_id),
  function(i) {
    mean(as.numeric(pre_rating[player_opponents[i, ]]), na.rm = TRUE)
  },
  numeric(1)
)

Create a table with all new variables

# Round the average opponent rating to the nearest whole number.
AvgOppRating <- round(AvgOppRating)

# Assemble the final table. total_points and pre_rating were extracted as text,
# so they are converted to numbers here. Otherwise they would be stored as
# strings (or as factors on R < 4.0) and treated as categories downstream.
# Column names are unchanged.
chess_tournament <- data.frame(
  player_name,
  states,
  total_points = as.numeric(total_points),
  pre_rating = as.numeric(pre_rating),
  AvgOppRating,
  stringsAsFactors = FALSE
)
chess_tournament

##                  player_name states total_points pre_rating AvgOppRating
## 1                  GARY HUA     ON           6.0       1794         1605
## 2           DAKSHESH DARURI     MI           6.0       1553         1469
## 3              ADITYA BAJAJ     MI           6.0       1384         1564
## 4       PATRICK H SCHILLING     MI           5.5       1716         1574
## 5                HANSHI ZUO     MI           5.5       1655         1501
## 6               HANSEN SONG     OH           5.0       1686         1519
## 7         GARY DEE SWATHELL     MI           5.0       1649         1372
## 8          EZEKIEL HOUGHTON     MI           5.0       1641         1468
## 9               STEFANO LEE     ON           5.0       1411         1523
## 10                ANVIT RAO     MI           5.0       1365         1554
## 11 CAMERON WILLIAM MC LEMAN     MI           4.5       1712         1468
## 12           KENNETH J TACK     MI           4.5       1663         1506
## 13        TORRANCE HENRY JR     MI           4.5       1666         1498
## 14             BRADLEY SHAW     MI           4.5       1610         1515
## 15   ZACHARY JAMES HOUGHTON     MI           4.5       1220         1484
## 16             MIKE NIKITIN     MI           4.0       1604         1386
## 17       RONALD GRZEGORCZYK     MI           4.0       1629         1499
## 18            DAVID SUNDEEN     MI           4.0       1600         1480
## 19             DIPANKAR ROY     MI           4.0       1564         1426
## 20              JASON ZHENG     MI           4.0       1595         1411
## 21            DINH DANG BUI     ON           4.0       1563         1470
## 22         EUGENE L MCCLURE     MI           4.0       1555         1300
## 23                 ALAN BUI     ON           4.0       1363         1214
## 24        MICHAEL R ALDRICH     MI           4.0       1229         1357
## 25         LOREN SCHWIEBERT     MI           3.5       1745         1363
## 26                  MAX ZHU     ON           3.5       1579         1507
## 27           GAURAV GIDWANI     MI           3.5       1552         1222
## 28              SOFIA ADINA     MI           3.5       1507         1522
## 29         CHIEDOZIE OKORIE     MI           3.5       1602         1314
## 30       GEORGE AVERY JONES     ON           3.5       1522         1144
## 31             RISHI SHETTY     MI           3.5       1494         1260
## 32    JOSHUA PHILIP MATHEWS     ON           3.5       1441         1379
## 33                  JADE GE     MI           3.5       1449         1277
## 34   MICHAEL JEFFERY THOMAS     MI           3.5       1399         1375
## 35         JOSHUA DAVID LEE     MI           3.5       1438         1150
## 36            SIDDHARTH JHA     MI           3.5       1355         1388
## 37     AMIYATOSH PWNANANDAM     MI           3.5        980         1385
## 38                BRIAN LIU     MI           3.0       1423         1539
## 39            JOEL R HENDON     MI           3.0       1436         1430
## 40             FOREST ZHANG     MI           3.0       1348         1391
## 41      KYLE WILLIAM MURPHY     MI           3.0       1403         1248
## 42                 JARED GE     MI           3.0       1332         1150
## 43        ROBERT GLEN VASEY     MI           3.0       1283         1107
## 44       JUSTIN D SCHILLING     MI           3.0       1199         1327
## 45                DEREK YAN     MI           3.0       1242         1152
## 46 JACOB ALEXANDER LAVALLEY     MI           3.0        377         1358
## 47              ERIC WRIGHT     MI           2.5       1362         1392
## 48             DANIEL KHAIN     MI           2.5       1382         1356
## 49         MICHAEL J MARTIN     MI           2.5       1291         1286
## 50               SHIVAM JHA     MI           2.5       1056         1296
## 51           TEJAS AYYAGARI     MI           2.5       1011         1356
## 52                ETHAN GUO     MI           2.5        935         1495
## 53            JOSE C YBARRA     MI           2.0       1393         1345
## 54              LARRY HODGE     MI           2.0       1270         1206
## 55                ALEX KONG     MI           2.0       1186         1406
## 56             MARISA RICCI     MI           2.0       1153         1414
## 57               MICHAEL LU     MI           2.0       1092         1363
## 58             VIRAJ MOHILE     MI           2.0        917         1391
## 59        SEAN M MC CORMICK     MI           2.0        853         1319
## 60               JULIA SHEN     MI           1.5        967         1330
## 61            JEZZEL FARKAS     ON           1.5        955         1327
## 62            ASHWIN BALAJI     MI           1.0       1530         1186
## 63     THOMAS JOSEPH HOSMER     MI           1.0       1175         1350
## 64                   BEN LI     MI           1.0       1163         1263

Write info to .csv file

# Write the table to disk. row.names = FALSE keeps the automatic row numbers
# out of the file, so it contains only the five requested columns and can be
# imported into a database without an extra unnamed column.
write.csv(chess_tournament, file = "chess_tournament.csv", row.names = FALSE)

Exploratory Plot

# Scatter plot of each player's pre-rating against the average pre-rating of
# their opponents, coloured by total points.
# pre_rating is forced to numeric (going through character so that a factor
# column is converted by label rather than by level code). Otherwise the x axis
# would be discrete, with ratings ordered as text. The axis label is set
# explicitly to keep the original "pre_rating" label.
p <- ggplot(
  chess_tournament,
  aes(as.numeric(as.character(pre_rating)), AvgOppRating)
) +
  geom_point(aes(color = total_points)) +
  labs(x = "pre_rating") +
  ggtitle("Comparing Player Pre-Rating to Avg Opponent Pre-Rating \n  by Total Points Gained")
p

Project 1

Erinda Budo

9/18/2019