Import Data and Organize

I uploaded the .txt file to GitHub and imported it from there. After studying the data file I used the read_delim() method so I could set my own delimiter. I used ‘|’ as the delimiter, trimmed the leading and trailing white spaces from each field and skipped the first row of all hyphens so I could use the next row for column names.

Note: ‘-BELLU’ was cut off from the name ‘SOFIA ADINA STANESCU-BELLU’ during the initial import. I could never figure out why it was cut off or how to fix it.

chessdata <- read_delim("https://raw.githubusercontent.com/smithchad17/Class607/master/tournamentinfo.txt", delim = "|", trim_ws = T, skip = 1, col_names = T)

Removed the first row which was not needed and the last column which was empty

chessdata <- chessdata[-c(1),-dim(chessdata)[2]]

Filter rows where the player’s name is empty. This will eliminate the rows with hyphens

chessdata <- filter(chessdata, chessdata$`Player Name` != " ")
#Rename columns
names(chessdata) <- c("Pair", "Player's Name", "Total Points", "Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7")
chessdata
## # A tibble: 128 x 10
##     Pair             `Player's Name` `Total Points` Round1 Round2 Round3
##    <chr>                       <chr>          <chr>  <chr>  <chr>  <chr>
##  1     1                    GARY HUA            6.0  W  39  W  21  W  18
##  2    ON 15445895 / R: 1794   ->1817            N:2      W      B      W
##  3     2             DAKSHESH DARURI            6.0  W  63  W  58  L   4
##  4    MI 14598900 / R: 1553   ->1663            N:2      B      W      B
##  5     3                ADITYA BAJAJ            6.0  L   8  W  61  W  25
##  6    MI 14959604 / R: 1384   ->1640            N:2      W      B      W
##  7     4         PATRICK H SCHILLING            5.5  W  23  D  28  W   2
##  8    MI 12616049 / R: 1716   ->1744            N:2      W      B      W
##  9     5                  HANSHI ZUO            5.5  W  45  W  37  D  12
## 10    MI 14601533 / R: 1655   ->1690            N:2      B      W      B
## # ... with 118 more rows, and 4 more variables: Round4 <chr>,
## #   Round5 <chr>, Round6 <chr>, Round7 <chr>

Pull Specific Data

Each column of data was pulled as a vector, then set as a column in a temporary data frame so it could be filtered using dplyr. The result was a data frame of one column.

Get Player’s Number

#Extract the rows that only have digits
playernumber <- str_extract(chessdata$Pair, "[[:digit:]]+")
##Add to a temporary data frame so it can be filtered
tempdata <- data.frame(playernumber, stringsAsFactors = F)
#Filter rows that are empty
playernumber <- filter(tempdata, playernumber != " ")
playernumber
##    playernumber
## 1             1
## 2             2
## 3             3
## 4             4
## 5             5
## 6             6
## 7             7
## 8             8
## 9             9
## 10           10
## 11           11
## 12           12
## 13           13
## 14           14
## 15           15
## 16           16
## 17           17
## 18           18
## 19           19
## 20           20
## 21           21
## 22           22
## 23           23
## 24           24
## 25           25
## 26           26
## 27           27
## 28           28
## 29           29
## 30           30
## 31           31
## 32           32
## 33           33
## 34           34
## 35           35
## 36           36
## 37           37
## 38           38
## 39           39
## 40           40
## 41           41
## 42           42
## 43           43
## 44           44
## 45           45
## 46           46
## 47           47
## 48           48
## 49           49
## 50           50
## 51           51
## 52           52
## 53           53
## 54           54
## 55           55
## 56           56
## 57           57
## 58           58
## 59           59
## 60           60
## 61           61
## 62           62
## 63           63
## 64           64

Get Player’s Name

#Extract the rows that only have letters and spaces
playername <- str_extract(chessdata$`Player's Name`, "[[:alpha:] ]+")
#Add to a temporary data frame so it can be filtered
tempdata <- data.frame(playername, stringsAsFactors = F)
#Filter rows that are empty
playername <- filter(tempdata, playername != " ")
playername
##                  playername
## 1                  GARY HUA
## 2           DAKSHESH DARURI
## 3              ADITYA BAJAJ
## 4       PATRICK H SCHILLING
## 5                HANSHI ZUO
## 6               HANSEN SONG
## 7         GARY DEE SWATHELL
## 8          EZEKIEL HOUGHTON
## 9               STEFANO LEE
## 10                ANVIT RAO
## 11 CAMERON WILLIAM MC LEMAN
## 12           KENNETH J TACK
## 13        TORRANCE HENRY JR
## 14             BRADLEY SHAW
## 15   ZACHARY JAMES HOUGHTON
## 16             MIKE NIKITIN
## 17       RONALD GRZEGORCZYK
## 18            DAVID SUNDEEN
## 19             DIPANKAR ROY
## 20              JASON ZHENG
## 21            DINH DANG BUI
## 22         EUGENE L MCCLURE
## 23                 ALAN BUI
## 24        MICHAEL R ALDRICH
## 25         LOREN SCHWIEBERT
## 26                  MAX ZHU
## 27           GAURAV GIDWANI
## 28     SOFIA ADINA STANESCU
## 29         CHIEDOZIE OKORIE
## 30       GEORGE AVERY JONES
## 31             RISHI SHETTY
## 32    JOSHUA PHILIP MATHEWS
## 33                  JADE GE
## 34   MICHAEL JEFFERY THOMAS
## 35         JOSHUA DAVID LEE
## 36            SIDDHARTH JHA
## 37     AMIYATOSH PWNANANDAM
## 38                BRIAN LIU
## 39            JOEL R HENDON
## 40             FOREST ZHANG
## 41      KYLE WILLIAM MURPHY
## 42                 JARED GE
## 43        ROBERT GLEN VASEY
## 44       JUSTIN D SCHILLING
## 45                DEREK YAN
## 46 JACOB ALEXANDER LAVALLEY
## 47              ERIC WRIGHT
## 48             DANIEL KHAIN
## 49         MICHAEL J MARTIN
## 50               SHIVAM JHA
## 51           TEJAS AYYAGARI
## 52                ETHAN GUO
## 53            JOSE C YBARRA
## 54              LARRY HODGE
## 55                ALEX KONG
## 56             MARISA RICCI
## 57               MICHAEL LU
## 58             VIRAJ MOHILE
## 59        SEAN M MC CORMICK
## 60               JULIA SHEN
## 61            JEZZEL FARKAS
## 62            ASHWIN BALAJI
## 63     THOMAS JOSEPH HOSMER
## 64                   BEN LI

Get Player’s State

#Extract the rows that only have letters
playerstate <- str_extract(chessdata$Pair, "[[:alpha:]]+")
#Add to a temporary data frame so it can be filtered
tempdata <- data.frame(playerstate, stringsAsFactors = F)
#Filter rows that are empty
playerstate <- filter(tempdata, playerstate != " ")
playerstate
##    playerstate
## 1           ON
## 2           MI
## 3           MI
## 4           MI
## 5           MI
## 6           OH
## 7           MI
## 8           MI
## 9           ON
## 10          MI
## 11          MI
## 12          MI
## 13          MI
## 14          MI
## 15          MI
## 16          MI
## 17          MI
## 18          MI
## 19          MI
## 20          MI
## 21          ON
## 22          MI
## 23          ON
## 24          MI
## 25          MI
## 26          ON
## 27          MI
## 28          MI
## 29          MI
## 30          ON
## 31          MI
## 32          ON
## 33          MI
## 34          MI
## 35          MI
## 36          MI
## 37          MI
## 38          MI
## 39          MI
## 40          MI
## 41          MI
## 42          MI
## 43          MI
## 44          MI
## 45          MI
## 46          MI
## 47          MI
## 48          MI
## 49          MI
## 50          MI
## 51          MI
## 52          MI
## 53          MI
## 54          MI
## 55          MI
## 56          MI
## 57          MI
## 58          MI
## 59          MI
## 60          MI
## 61          ON
## 62          MI
## 63          MI
## 64          MI

Get Player’s Points

#Extract the rows that only have numbers before and after a period
playerpoints <- str_extract(chessdata$"Total Points", "\\d+.\\d+")
#Add to a temporary data frame so it can be filtered
tempdata <- data.frame(playerpoints, stringsAsFactors = F)
#Filter rows that are empty
playerpoints <- filter(tempdata, playerpoints != " ")
playerpoints
##    playerpoints
## 1           6.0
## 2           6.0
## 3           6.0
## 4           5.5
## 5           5.5
## 6           5.0
## 7           5.0
## 8           5.0
## 9           5.0
## 10          5.0
## 11          4.5
## 12          4.5
## 13          4.5
## 14          4.5
## 15          4.5
## 16          4.0
## 17          4.0
## 18          4.0
## 19          4.0
## 20          4.0
## 21          4.0
## 22          4.0
## 23          4.0
## 24          4.0
## 25          3.5
## 26          3.5
## 27          3.5
## 28          3.5
## 29          3.5
## 30          3.5
## 31          3.5
## 32          3.5
## 33          3.5
## 34          3.5
## 35          3.5
## 36          3.5
## 37          3.5
## 38          3.0
## 39          3.0
## 40          3.0
## 41          3.0
## 42          3.0
## 43          3.0
## 44          3.0
## 45          3.0
## 46          3.0
## 47          2.5
## 48          2.5
## 49          2.5
## 50          2.5
## 51          2.5
## 52          2.5
## 53          2.0
## 54          2.0
## 55          2.0
## 56          2.0
## 57          2.0
## 58          2.0
## 59          2.0
## 60          1.5
## 61          1.5
## 62          1.0
## 63          1.0
## 64          1.0

Get Player’s Pre-Rating

#Extract the rows that only have numbers after a colon and before a hyphen
playerrating <- str_extract(chessdata$`Player's Name`, ":.+?(\\d+).+?-")
#Extract only the numbers 
playerrating <- str_extract(playerrating, "\\d+")
#Add to a temporary data frame so it can be filtered
tempdata <- data.frame(playerrating, stringsAsFactors = F)
#Filter rows that are empty
playerrating <- filter(tempdata, playerrating != " ")
playerrating
##    playerrating
## 1          1794
## 2          1553
## 3          1384
## 4          1716
## 5          1655
## 6          1686
## 7          1649
## 8          1641
## 9          1411
## 10         1365
## 11         1712
## 12         1663
## 13         1666
## 14         1610
## 15         1220
## 16         1604
## 17         1629
## 18         1600
## 19         1564
## 20         1595
## 21         1563
## 22         1555
## 23         1363
## 24         1229
## 25         1745
## 26         1579
## 27         1552
## 28         1507
## 29         1602
## 30         1522
## 31         1494
## 32         1441
## 33         1449
## 34         1399
## 35         1438
## 36         1355
## 37          980
## 38         1423
## 39         1436
## 40         1348
## 41         1403
## 42         1332
## 43         1283
## 44         1199
## 45         1242
## 46          377
## 47         1362
## 48         1382
## 49         1291
## 50         1056
## 51         1011
## 52          935
## 53         1393
## 54         1270
## 55         1186
## 56         1153
## 57         1092
## 58          917
## 59          853
## 60          967
## 61          955
## 62         1530
## 63         1175
## 64         1163

Games Played and AVG Ratings

To find the amount of games played, I organized the data a little further. I only want to extract data from the most simplified data frame as possible so I removed the rows I didn’t need. To find the amount of games played I didn’t need the rows with the ratings so I removed those to create a subset of the original data frame.

chess.subset <- chessdata[grep('^[0-9]', chessdata$Pair), ]
chess.subset
## # A tibble: 64 x 10
##     Pair     `Player's Name` `Total Points` Round1 Round2 Round3 Round4
##    <chr>               <chr>          <chr>  <chr>  <chr>  <chr>  <chr>
##  1     1            GARY HUA            6.0  W  39  W  21  W  18  W  14
##  2     2     DAKSHESH DARURI            6.0  W  63  W  58  L   4  W  17
##  3     3        ADITYA BAJAJ            6.0  L   8  W  61  W  25  W  21
##  4     4 PATRICK H SCHILLING            5.5  W  23  D  28  W   2  W  26
##  5     5          HANSHI ZUO            5.5  W  45  W  37  D  12  D  13
##  6     6         HANSEN SONG            5.0  W  34  D  29  L  11  W  35
##  7     7   GARY DEE SWATHELL            5.0  W  57  W  46  W  13  W  11
##  8     8    EZEKIEL HOUGHTON            5.0  W   3  W  32  L  14  L   9
##  9     9         STEFANO LEE            5.0  W  25  L  18  W  59  W   8
## 10    10           ANVIT RAO            5.0  D  16  L  19  W  55  W  31
## # ... with 54 more rows, and 3 more variables: Round5 <chr>, Round6 <chr>,
## #   Round7 <chr>

Looping through the columns Round1 - Round7 by row, while counting the amount of games played.

#Count how many games each player played
played <- c()

#Start with the first row and cycle through columns of Round1 through Round7
#If a opponent number is present, add it to the counter to find the amount of games played.
for(i in chess.subset$Pair){
  j <- 4
  count <- 0
  while(j <= dim(chess.subset)[2]){
    if(str_detect(chess.subset[as.integer(i),j], "\\d") == T) {count <- count + 1}
    j <- j + 1
  }
  played <- c(played, count)
}

played #amount of games played per player
##  [1] 7 7 7 7 7 7 7 7 7 7 7 6 7 7 7 5 7 7 7 7 7 6 7 7 7 7 6 7 6 7 7 7 7 7 7
## [36] 6 5 6 7 7 4 7 7 6 7 7 7 5 5 6 7 7 3 6 6 5 6 6 6 5 7 1 5 7

Finding the AVG Pre-Ratings

Start with the first row and cycle through columns of Round1 through Round7. Extracts the opponent number and compares it to the corresponding value in the ‘playerratings’ data frame. Adds the value which will be summed up and averaged

#Empty vector that will hold the average opponents rating
averages <- c()

#Start with the first row and cycle through columns of Round1 through Round7.
#Extracts the opponent number and compares it to the corresponding value in the 
#'playerratings' data frame.
for(i in chess.subset$Pair){
  j <- 4
  ratings <- c()
  while(j <= dim(chess.subset)[2]){
    opp_num <- str_extract(chess.subset[as.integer(i),j], "\\d+")
    #Appends the next opponent rating to the ratings vector
    ratings <- c(ratings, playerrating[opp_num,])
    j <- j + 1
  }
  #Computes the mean of the 'ratings' vector and appends the result to the 'average' vector
  #Ignore the 'NA' results that were extracted when an opponent wasn't played that round.
  averages <- c(averages, mean(as.integer(ratings), na.rm = T))
}
round(averages)
##  [1] 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1506 1498 1515
## [15] 1484 1386 1499 1480 1426 1411 1470 1300 1214 1357 1363 1507 1222 1522
## [29] 1314 1144 1260 1379 1277 1375 1150 1388 1385 1539 1430 1391 1248 1150
## [43] 1107 1327 1152 1358 1392 1356 1286 1296 1356 1495 1345 1206 1406 1414
## [57] 1363 1391 1319 1330 1327 1186 1350 1263

Merge Pulled Data

Create a final data frame from the resulting data frames we previously found

columns <- c(playername, playerstate, playerpoints, playerrating, as.data.frame(played), as.data.frame(round(averages)))
finaldata <- data.frame(columns, stringsAsFactors = F)
names(finaldata) <- c("Player's Name", "Player's State", "Total Points", "Pre-Rating", "Games Played", "AVG Opponent Pre-Rating")

finaldata
##               Player's Name Player's State Total Points Pre-Rating
## 1                  GARY HUA             ON          6.0       1794
## 2           DAKSHESH DARURI             MI          6.0       1553
## 3              ADITYA BAJAJ             MI          6.0       1384
## 4       PATRICK H SCHILLING             MI          5.5       1716
## 5                HANSHI ZUO             MI          5.5       1655
## 6               HANSEN SONG             OH          5.0       1686
## 7         GARY DEE SWATHELL             MI          5.0       1649
## 8          EZEKIEL HOUGHTON             MI          5.0       1641
## 9               STEFANO LEE             ON          5.0       1411
## 10                ANVIT RAO             MI          5.0       1365
## 11 CAMERON WILLIAM MC LEMAN             MI          4.5       1712
## 12           KENNETH J TACK             MI          4.5       1663
## 13        TORRANCE HENRY JR             MI          4.5       1666
## 14             BRADLEY SHAW             MI          4.5       1610
## 15   ZACHARY JAMES HOUGHTON             MI          4.5       1220
## 16             MIKE NIKITIN             MI          4.0       1604
## 17       RONALD GRZEGORCZYK             MI          4.0       1629
## 18            DAVID SUNDEEN             MI          4.0       1600
## 19             DIPANKAR ROY             MI          4.0       1564
## 20              JASON ZHENG             MI          4.0       1595
## 21            DINH DANG BUI             ON          4.0       1563
## 22         EUGENE L MCCLURE             MI          4.0       1555
## 23                 ALAN BUI             ON          4.0       1363
## 24        MICHAEL R ALDRICH             MI          4.0       1229
## 25         LOREN SCHWIEBERT             MI          3.5       1745
## 26                  MAX ZHU             ON          3.5       1579
## 27           GAURAV GIDWANI             MI          3.5       1552
## 28     SOFIA ADINA STANESCU             MI          3.5       1507
## 29         CHIEDOZIE OKORIE             MI          3.5       1602
## 30       GEORGE AVERY JONES             ON          3.5       1522
## 31             RISHI SHETTY             MI          3.5       1494
## 32    JOSHUA PHILIP MATHEWS             ON          3.5       1441
## 33                  JADE GE             MI          3.5       1449
## 34   MICHAEL JEFFERY THOMAS             MI          3.5       1399
## 35         JOSHUA DAVID LEE             MI          3.5       1438
## 36            SIDDHARTH JHA             MI          3.5       1355
## 37     AMIYATOSH PWNANANDAM             MI          3.5        980
## 38                BRIAN LIU             MI          3.0       1423
## 39            JOEL R HENDON             MI          3.0       1436
## 40             FOREST ZHANG             MI          3.0       1348
## 41      KYLE WILLIAM MURPHY             MI          3.0       1403
## 42                 JARED GE             MI          3.0       1332
## 43        ROBERT GLEN VASEY             MI          3.0       1283
## 44       JUSTIN D SCHILLING             MI          3.0       1199
## 45                DEREK YAN             MI          3.0       1242
## 46 JACOB ALEXANDER LAVALLEY             MI          3.0        377
## 47              ERIC WRIGHT             MI          2.5       1362
## 48             DANIEL KHAIN             MI          2.5       1382
## 49         MICHAEL J MARTIN             MI          2.5       1291
## 50               SHIVAM JHA             MI          2.5       1056
## 51           TEJAS AYYAGARI             MI          2.5       1011
## 52                ETHAN GUO             MI          2.5        935
## 53            JOSE C YBARRA             MI          2.0       1393
## 54              LARRY HODGE             MI          2.0       1270
## 55                ALEX KONG             MI          2.0       1186
## 56             MARISA RICCI             MI          2.0       1153
## 57               MICHAEL LU             MI          2.0       1092
## 58             VIRAJ MOHILE             MI          2.0        917
## 59        SEAN M MC CORMICK             MI          2.0        853
## 60               JULIA SHEN             MI          1.5        967
## 61            JEZZEL FARKAS             ON          1.5        955
## 62            ASHWIN BALAJI             MI          1.0       1530
## 63     THOMAS JOSEPH HOSMER             MI          1.0       1175
## 64                   BEN LI             MI          1.0       1163
##    Games Played AVG Opponent Pre-Rating
## 1             7                    1605
## 2             7                    1469
## 3             7                    1564
## 4             7                    1574
## 5             7                    1501
## 6             7                    1519
## 7             7                    1372
## 8             7                    1468
## 9             7                    1523
## 10            7                    1554
## 11            7                    1468
## 12            6                    1506
## 13            7                    1498
## 14            7                    1515
## 15            7                    1484
## 16            5                    1386
## 17            7                    1499
## 18            7                    1480
## 19            7                    1426
## 20            7                    1411
## 21            7                    1470
## 22            6                    1300
## 23            7                    1214
## 24            7                    1357
## 25            7                    1363
## 26            7                    1507
## 27            6                    1222
## 28            7                    1522
## 29            6                    1314
## 30            7                    1144
## 31            7                    1260
## 32            7                    1379
## 33            7                    1277
## 34            7                    1375
## 35            7                    1150
## 36            6                    1388
## 37            5                    1385
## 38            6                    1539
## 39            7                    1430
## 40            7                    1391
## 41            4                    1248
## 42            7                    1150
## 43            7                    1107
## 44            6                    1327
## 45            7                    1152
## 46            7                    1358
## 47            7                    1392
## 48            5                    1356
## 49            5                    1286
## 50            6                    1296
## 51            7                    1356
## 52            7                    1495
## 53            3                    1345
## 54            6                    1206
## 55            6                    1406
## 56            5                    1414
## 57            6                    1363
## 58            6                    1391
## 59            6                    1319
## 60            5                    1330
## 61            7                    1327
## 62            1                    1186
## 63            5                    1350
## 64            7                    1263

Write to CSV file

write.csv(finaldata, file = "ChessData.csv")