In this project, we’re given a text file with chess tournament results where the information has some structure.The task is to generate a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
| PlayerName | State | TotalPoints | PreRating | AvgOppPreRating |
|---|---|---|---|---|
| Gary Hua | ON | 6.0 | 1794 | 1605 |
#Load necessary libraries
library(stringr)
library(DT)
library(dplyr)
library(tidyverse)
LOAD DATA
# Load the data from the file in my github repository
get_url <- 'https://raw.githubusercontent.com/ErindaB/Data607_Assignment/master/tournamentinfo.txt'
raw_data <- read.delim(get_url, header=FALSE, stringsAsFactors =FALSE )
head(raw_data)
## V1
## 1 -----------------------------------------------------------------------------------------
## 2 Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
## 3 Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
## 4 -----------------------------------------------------------------------------------------
## 5 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 6 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
The chess tournament text file is written in a format that is not legible in R as it is. This file needs to be restructured in order to find the average pre-tournament score.
#Remove the header which is comprised of two rows
raw_data <- raw_data[c(5:nrow(raw_data)),]
head(raw_data)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [5] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [6] "-----------------------------------------------------------------------------------------"
Extracting main variables from the file
id_var <- "\\d{1,2}(?=\\s\\|)"
chess_id <- unlist(str_extract_all(unlist(raw_data), id_var))
Extract player’s names
# Extract the player's names
# Get names in capital letters, followed by a space and with at least two matches
name <- "([[:upper:]]+\\s){2,}"
player_name <- unlist(str_extract_all(unlist(raw_data),name))
str_trim(player_name, side = "right")
## [1] "GARY HUA" "DAKSHESH DARURI"
## [3] "ADITYA BAJAJ" "PATRICK H SCHILLING"
## [5] "HANSHI ZUO" "HANSEN SONG"
## [7] "GARY DEE SWATHELL" "EZEKIEL HOUGHTON"
## [9] "STEFANO LEE" "ANVIT RAO"
## [11] "CAMERON WILLIAM MC LEMAN" "KENNETH J TACK"
## [13] "TORRANCE HENRY JR" "BRADLEY SHAW"
## [15] "ZACHARY JAMES HOUGHTON" "MIKE NIKITIN"
## [17] "RONALD GRZEGORCZYK" "DAVID SUNDEEN"
## [19] "DIPANKAR ROY" "JASON ZHENG"
## [21] "DINH DANG BUI" "EUGENE L MCCLURE"
## [23] "ALAN BUI" "MICHAEL R ALDRICH"
## [25] "LOREN SCHWIEBERT" "MAX ZHU"
## [27] "GAURAV GIDWANI" "SOFIA ADINA"
## [29] "CHIEDOZIE OKORIE" "GEORGE AVERY JONES"
## [31] "RISHI SHETTY" "JOSHUA PHILIP MATHEWS"
## [33] "JADE GE" "MICHAEL JEFFERY THOMAS"
## [35] "JOSHUA DAVID LEE" "SIDDHARTH JHA"
## [37] "AMIYATOSH PWNANANDAM" "BRIAN LIU"
## [39] "JOEL R HENDON" "FOREST ZHANG"
## [41] "KYLE WILLIAM MURPHY" "JARED GE"
## [43] "ROBERT GLEN VASEY" "JUSTIN D SCHILLING"
## [45] "DEREK YAN" "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT" "DANIEL KHAIN"
## [49] "MICHAEL J MARTIN" "SHIVAM JHA"
## [51] "TEJAS AYYAGARI" "ETHAN GUO"
## [53] "JOSE C YBARRA" "LARRY HODGE"
## [55] "ALEX KONG" "MARISA RICCI"
## [57] "MICHAEL LU" "VIRAJ MOHILE"
## [59] "SEAN M MC CORMICK" "JULIA SHEN"
## [61] "JEZZEL FARKAS" "ASHWIN BALAJI"
## [63] "THOMAS JOSEPH HOSMER" "BEN LI"
Extract player’s total points
# Extract each player's total number of points
points <- "\\d\\.\\d"
total_points <- unlist(str_extract_all(unlist(raw_data), points))
total_points
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" "4.5"
## [12] "4.5" "4.5" "4.5" "4.5" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0"
## [23] "4.0" "4.0" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5"
## [34] "3.5" "3.5" "3.5" "3.5" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0"
## [45] "3.0" "3.0" "2.5" "2.5" "2.5" "2.5" "2.5" "2.5" "2.0" "2.0" "2.0"
## [56] "2.0" "2.0" "2.0" "2.0" "1.5" "1.5" "1.0" "1.0" "1.0"
Extract states
# Extract each player's state
# Get state by 2 capital letters,followed by a space and '|'
state <- "([[:upper:]]){2}\\s(?=\\|)"
states <- unlist(str_extract_all(unlist(raw_data), state))
str_trim(states, side = "right")
## [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"
Extract pre-rating
# Extract each player's pre-rating
# Avoid patterns with a > and a space. Then search for between 1 and two spaces, or one space and a colon, followed by 3 or 4 digits, and a space or the letter 'P'
feature <- "(?<!\\>\\s)(?<=\\s{1,2}|\\s\\:)(\\d{3,4}(?=\\s|P))"
pre_rating <- unlist(str_extract_all(unlist(raw_data), feature))
pre_rating <- str_trim(pre_rating)
pre_rating
## [1] "1794" "1553" "1384" "1716" "1655" "1686" "1649" "1641" "1411" "1365"
## [11] "1712" "1663" "1666" "1610" "1220" "1604" "1629" "1600" "1564" "1595"
## [21] "1563" "1555" "1363" "1229" "1745" "1579" "1552" "1507" "1602" "1522"
## [31] "1494" "1441" "1449" "1399" "1438" "1355" "980" "1423" "1436" "1348"
## [41] "1403" "1332" "1283" "1199" "1242" "377" "1362" "1382" "1291" "1056"
## [51] "1011" "935" "1393" "1270" "1186" "1153" "1092" "917" "853" "967"
## [61] "955" "1530" "1175" "1163"
Extract average of opponents pre-rating
opponents_pattern <- "(\\d{1,}|[[:blank:]]{1})(?=\\|)"
player_opponents <- unlist(str_extract_all(unlist(raw_data), opponents_pattern))
player_opponents[player_opponents==" "] <- NA
opponent1 <- as.numeric(player_opponents[seq(4, length(player_opponents), 10)])
opponent1 <- as.numeric(opponent1[seq(1, length(opponent1), 2)])
opponent2 <- as.numeric(player_opponents[seq(5, length(player_opponents), 10)])
opponent2 <- as.numeric(opponent2[seq(1, length(opponent2), 2)])
opponent3 <- as.numeric(player_opponents[seq(6, length(player_opponents), 10)])
opponent3 <- as.numeric(opponent3[seq(1, length(opponent3), 2)])
opponent4 <- as.numeric(player_opponents[seq(7, length(player_opponents), 10)])
opponent4 <- as.numeric(opponent4[seq(1, length(opponent4), 2)])
opponent5 <- as.numeric(player_opponents[seq(8, length(player_opponents), 10)])
opponent5 <- as.numeric(opponent5[seq(1, length(opponent5), 2)])
opponent6 <- as.numeric(player_opponents[seq(9, length(player_opponents), 10)])
opponent6 <- as.numeric(opponent6[seq(1, length(opponent6), 2)])
opponent7 <- as.numeric(player_opponents[seq(10, length(player_opponents), 10)])
opponent7 <- as.numeric(opponent7[seq(1, length(opponent7), 2)])
player_opponents <- matrix(c(opponent1, opponent2, opponent3, opponent4, opponent5, opponent6, opponent7),nrow = 64, ncol = 7)
# Match the opponent to the player's id
AvgOppRating<- 0
chess_tournament <- 0
for (i in 1:(length(chess_id)))
{
AvgOppRating[i] <- mean(as.numeric(pre_rating[player_opponents[i,]]), na.rm = T)
}
Create a table with all new variables
AvgOppRating<-round(AvgOppRating)
chess_tournament <- data.frame(player_name,states, total_points, pre_rating, AvgOppRating)
chess_tournament
## player_name states total_points pre_rating AvgOppRating
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519
## 7 GARY DEE SWATHELL MI 5.0 1649 1372
## 8 EZEKIEL HOUGHTON MI 5.0 1641 1468
## 9 STEFANO LEE ON 5.0 1411 1523
## 10 ANVIT RAO MI 5.0 1365 1554
## 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712 1468
## 12 KENNETH J TACK MI 4.5 1663 1506
## 13 TORRANCE HENRY JR MI 4.5 1666 1498
## 14 BRADLEY SHAW MI 4.5 1610 1515
## 15 ZACHARY JAMES HOUGHTON MI 4.5 1220 1484
## 16 MIKE NIKITIN MI 4.0 1604 1386
## 17 RONALD GRZEGORCZYK MI 4.0 1629 1499
## 18 DAVID SUNDEEN MI 4.0 1600 1480
## 19 DIPANKAR ROY MI 4.0 1564 1426
## 20 JASON ZHENG MI 4.0 1595 1411
## 21 DINH DANG BUI ON 4.0 1563 1470
## 22 EUGENE L MCCLURE MI 4.0 1555 1300
## 23 ALAN BUI ON 4.0 1363 1214
## 24 MICHAEL R ALDRICH MI 4.0 1229 1357
## 25 LOREN SCHWIEBERT MI 3.5 1745 1363
## 26 MAX ZHU ON 3.5 1579 1507
## 27 GAURAV GIDWANI MI 3.5 1552 1222
## 28 SOFIA ADINA MI 3.5 1507 1522
## 29 CHIEDOZIE OKORIE MI 3.5 1602 1314
## 30 GEORGE AVERY JONES ON 3.5 1522 1144
## 31 RISHI SHETTY MI 3.5 1494 1260
## 32 JOSHUA PHILIP MATHEWS ON 3.5 1441 1379
## 33 JADE GE MI 3.5 1449 1277
## 34 MICHAEL JEFFERY THOMAS MI 3.5 1399 1375
## 35 JOSHUA DAVID LEE MI 3.5 1438 1150
## 36 SIDDHARTH JHA MI 3.5 1355 1388
## 37 AMIYATOSH PWNANANDAM MI 3.5 980 1385
## 38 BRIAN LIU MI 3.0 1423 1539
## 39 JOEL R HENDON MI 3.0 1436 1430
## 40 FOREST ZHANG MI 3.0 1348 1391
## 41 KYLE WILLIAM MURPHY MI 3.0 1403 1248
## 42 JARED GE MI 3.0 1332 1150
## 43 ROBERT GLEN VASEY MI 3.0 1283 1107
## 44 JUSTIN D SCHILLING MI 3.0 1199 1327
## 45 DEREK YAN MI 3.0 1242 1152
## 46 JACOB ALEXANDER LAVALLEY MI 3.0 377 1358
## 47 ERIC WRIGHT MI 2.5 1362 1392
## 48 DANIEL KHAIN MI 2.5 1382 1356
## 49 MICHAEL J MARTIN MI 2.5 1291 1286
## 50 SHIVAM JHA MI 2.5 1056 1296
## 51 TEJAS AYYAGARI MI 2.5 1011 1356
## 52 ETHAN GUO MI 2.5 935 1495
## 53 JOSE C YBARRA MI 2.0 1393 1345
## 54 LARRY HODGE MI 2.0 1270 1206
## 55 ALEX KONG MI 2.0 1186 1406
## 56 MARISA RICCI MI 2.0 1153 1414
## 57 MICHAEL LU MI 2.0 1092 1363
## 58 VIRAJ MOHILE MI 2.0 917 1391
## 59 SEAN M MC CORMICK MI 2.0 853 1319
## 60 JULIA SHEN MI 1.5 967 1330
## 61 JEZZEL FARKAS ON 1.5 955 1327
## 62 ASHWIN BALAJI MI 1.0 1530 1186
## 63 THOMAS JOSEPH HOSMER MI 1.0 1175 1350
## 64 BEN LI MI 1.0 1163 1263
Write info to .csv file
write.csv(chess_tournament, file = "chess_tournament.csv")
Exploratory Plot
p <- ggplot(chess_tournament, aes(pre_rating, AvgOppRating)) + geom_point(aes(color=total_points)) + ggtitle("Comparing Player Pre-Rating to Avg Opponent Pre-Rating \n by Total Points Gained")
p