This Assignment shows an example of given a raw file, using regular expressions

extract the necessary data, and once clean and tidy, export to a CSV file for

possible usage in a database. In this case it is working with chess Tournament

Data and the goal is to scrape and clean the tournament data to extract the

Player’s name, their state, their pre-rating, total points accumated and the

average pre-chess rating of player’s opponents.

Load the data and make the columns more meaningful.

Note: I use a Ubuntu Linux environment to store my work so do change the

directory of your choice on your OS.

# library(utils)
chess_data <- read.fwf("/home/jonboy1987/Desktop/CUNYSPS/IS607/Assignments/Project1/tournamentinfo.txt"
                       ,skip = 4, widths = c(5,3,32,6,6,6,6,6,6,6,6))

# give the chess_data dataset more better attribute names
colnames(chess_data) <- c("playerID_state","V2","PlayerName_prating","total_pts"
                          ,"round1","round2","round3","round4","round5","round6"
                          ,"round7")

head(chess_data)
##   playerID_state  V2               PlayerName_prating total_pts round1
## 1              1  |  GARY HUA                            |6.0   |W  39
## 2             ON  |  15445895 / R: 1794   ->1817         |N:2   |W    
## 3          ----- --- --------------------------------    ------ ------
## 4              2  |  DAKSHESH DARURI                     |6.0   |W  63
## 5             MI  |  14598900 / R: 1553   ->1663         |N:2   |B    
## 6          ----- --- --------------------------------    ------ ------
##   round2 round3 round4 round5 round6 round7
## 1 |W  21 |W  18 |W  14 |W   7 |D  12 |D   4
## 2 |B     |W     |B     |W     |B     |W    
## 3 ------ ------ ------ ------ ------ ------
## 4 |W  58 |L   4 |W  17 |W  16 |W  20 |W   7
## 5 |W     |B     |W     |B     |W     |B    
## 6 ------ ------ ------ ------ ------ ------

We can extract the State using a regular expression by using column 1 of the

Chess data set by using “[A-Z]{2}” to get the states

# use grep and value = TRUE to show the strings that match the regexp
states <- grep("[A-Z]{2}", chess_data$playerID_state, value = TRUE)

# strip out the white spaces and store the Data
states <- trimws(states)

We will also extract the ID of each player; this will come in handy in

computing the average pre-rating of opponents.

ID <- grep("[0-9]+", chess_data$playerID_state, value = TRUE)
ID <- trimws(ID)
ID
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
## [15] "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28"
## [29] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42"
## [43] "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56"
## [57] "57" "58" "59" "60" "61" "62" "63" "64"

The 2nd column in the chess_data set contains the name and Pre-Rating

extract.

# "^[A-Z]+" matches all strings that start with uppercase letters
names <- grep("^[A-Z]+", chess_data$PlayerName_prating, value = TRUE)
names <- trimws(names)
# Extract the portion that starts with "R:"
pre_rating <- regmatches(chess_data$PlayerName_prating,
                         regexpr("R:\\s{1,2}[0-9]{1,4}",
                                 chess_data$PlayerName_prating))

pre_rating <- gsub("R:", "", pre_rating) # Remove the R:
pre_rating <- trimws(pre_rating) # remove whitespaces
# remove the "|" character from the total points field
chess_data$total_pts <- gsub('\\|', '', chess_data$total_pts)

total_points <- grep("^[0-9]+\\.[0-9]+",chess_data$total_pts, value = TRUE)
total_points <- trimws(total_points)

For the hardest part; to compute the average pre chess rating of opponents

First we will want to grab all the columns that contains all 7 rounds of

the tournament and extract the player ID’s from them.

rounds <- chess_data[,c("round1", "round2", "round3", "round4", "round5",
                        "round6", "round7")]

# Begin stripping down the rounds dataset
rounds <- rounds[-c(seq(from = 3, to = 192, by = 3)), ] # remove the -'s

# Remove the rows below the player ID's
rounds <- rounds[-c(seq(from = 2, to = 191, by = 2)), ]

# Add NA's to the entries that are empty 
# (player did not vs opponent for whatever reason)
rounds <- apply(rounds, 2, function(x) gsub("\\s{4}", NA, x))

# Now strip rest of characters so only the ID's and NA remain
rounds <- apply(rounds,2, function(x) gsub("^\\|[A-Z]\\s+", "",x))
rounds <- apply(rounds,2,as.numeric) # make the ID's numberic for indexing
head(rounds,15)
##       round1 round2 round3 round4 round5 round6 round7
##  [1,]     39     21     18     14      7     12      4
##  [2,]     63     58      4     17     16     20      7
##  [3,]      8     61     25     21     11     13     12
##  [4,]     23     28      2     26      5     19      1
##  [5,]     45     37     12     13      4     14     17
##  [6,]     34     29     11     35     10     27     21
##  [7,]     57     46     13     11      1      9      2
##  [8,]      3     32     14      9     47     28     19
##  [9,]     25     18     59      8     26      7     20
## [10,]     16     19     55     31      6     25     18
## [11,]     38     56      6      7      3     34     26
## [12,]     42     33      5     38     NA      1      3
## [13,]     36     27      7      5     33      3     32
## [14,]     54     44      8      1     27      5     31
## [15,]     19     16     30     22     54     33     38

With the rounds dataset now cleaned to just ID’s and NA’s we can map

the ID to the pre-ratings

# make a key-value pair so we can map 
ID_rating <- list(ID, pre_rating)
rounds <- apply(rounds, 1:2, function(x) ID_rating[[2]][x])
rounds <- apply(rounds,2,as.numeric) # make numeric for computing the average
rounds
##       round1 round2 round3 round4 round5 round6 round7
##  [1,]   1436   1563   1600   1610   1649   1663   1716
##  [2,]   1175    917   1716   1629   1604   1595   1649
##  [3,]   1641    955   1745   1563   1712   1666   1663
##  [4,]   1363   1507   1553   1579   1655   1564   1794
##  [5,]   1242    980   1663   1666   1716   1610   1629
##  [6,]   1399   1602   1712   1438   1365   1552   1563
##  [7,]   1092    377   1666   1712   1794   1411   1553
##  [8,]   1384   1441   1610   1411   1362   1507   1564
##  [9,]   1745   1600    853   1641   1579   1649   1595
## [10,]   1604   1564   1186   1494   1686   1745   1600
## [11,]   1423   1153   1686   1649   1384   1399   1579
## [12,]   1332   1449   1655   1423     NA   1794   1384
## [13,]   1355   1552   1649   1655   1449   1384   1441
## [14,]   1270   1199   1641   1794   1552   1655   1494
## [15,]   1564   1604   1522   1555   1270   1449   1423
## [16,]   1365   1220     NA   1436   1553   1355     NA
## [17,]   1382   1403   1579   1553   1363   1555   1655
## [18,]   1362   1411   1794   1441   1564   1423   1365
## [19,]   1220   1365    935   1507   1600   1716   1641
## [20,]   1348   1291   1363   1403   1507   1553   1411
## [21,]   1283   1794   1362   1384   1348   1436   1686
## [22,]   1163    935   1507   1220     NA   1629   1348
## [23,]   1716   1283   1595    917   1629    980    377
## [24,]   1507   1362   1283   1745    967   1199   1436
## [25,]   1411   1393   1384   1229   1399   1365   1362
## [26,]   1291   1348   1629   1716   1411   1441   1712
## [27,]   1011   1666    377    980   1610   1686     NA
## [28,]   1229   1716   1555   1564   1595   1641   1355
## [29,]   1056   1686   1423   1399    935   1382     NA
## [30,]    935   1163   1220   1186   1494    955   1056
## [31,]    917   1186   1163   1365   1522   1056   1610
## [32,]    955   1641   1199   1600   1011   1579   1666
## [33,]    967   1663   1056   1355   1666   1220   1011
## [34,]   1686    967    980   1602   1745   1712    935
## [35,]    377   1423   1153   1686   1092    935   1382
## [36,]   1666   1092   1011   1449     NA   1604   1507
## [37,]     NA   1655   1399   1552     NA   1363    955
## [38,]   1712   1438   1602   1663     NA   1600   1220
## [39,]   1794   1270   1348   1604   1199   1563   1229
## [40,]   1595   1579   1436    853   1563   1153   1555
## [41,]    853   1629    917   1595     NA     NA     NA
## [42,]   1663   1056   1092    967    955   1163   1153
## [43,]   1563   1363   1229   1175    853    377   1186
## [44,]     NA   1610   1441   1393   1436   1229    853
## [45,]   1655   1011    967   1153   1175   1186    917
## [46,]   1438   1649   1552   1056   1163   1283   1363
## [47,]   1600   1229   1563    955   1641   1011   1745
## [48,]   1629   1175     NA    935     NA   1602   1438
## [49,]   1579   1595   1175   1163    917     NA     NA
## [50,]   1602   1332   1449    377     NA   1494   1522
## [51,]   1552   1242   1355   1092   1441   1362   1449
## [52,]   1522   1555   1564   1382   1602   1438   1399
## [53,]     NA   1745     NA   1199     NA   1092     NA
## [54,]   1610   1436    955     NA   1220    853   1163
## [55,]   1530   1494   1365   1522     NA   1242   1283
## [56,]     NA   1712   1438   1242     NA   1348   1332
## [57,]   1649   1355   1332   1011   1438   1393     NA
## [58,]   1494   1553   1403   1363   1291     NA   1242
## [59,]   1403     NA   1411   1348   1283   1270   1199
## [60,]   1449   1399   1242   1332   1229     NA     NA
## [61,]   1441   1384   1270   1362   1332   1522    980
## [62,]   1186     NA     NA     NA     NA     NA     NA
## [63,]   1553   1382   1291   1283   1242     NA     NA
## [64,]   1555   1522   1494   1291    377   1332   1270

Compute the Average Pre-Rating of opponents of each player

avg_prerating <- round(apply(rounds, 1, mean, na.rm = TRUE), 0)
avg_prerating
##  [1] 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1506 1498 1515
## [15] 1484 1386 1499 1480 1426 1411 1470 1300 1214 1357 1363 1507 1222 1522
## [29] 1314 1144 1260 1379 1277 1375 1150 1388 1385 1539 1430 1391 1248 1150
## [43] 1107 1327 1152 1358 1392 1356 1286 1296 1356 1495 1345 1206 1406 1414
## [57] 1363 1391 1319 1330 1327 1186 1350 1263
chess_data_frame <- data.frame(name = names, state = states,
                               TotalPoints = total_points, 
                               Pre_Rating = pre_rating,
                               Average_Pre_Rating = avg_prerating)
chess_data_frame
##                          name state TotalPoints Pre_Rating
## 1                    GARY HUA    ON         6.0       1794
## 2             DAKSHESH DARURI    MI         6.0       1553
## 3                ADITYA BAJAJ    MI         6.0       1384
## 4         PATRICK H SCHILLING    MI         5.5       1716
## 5                  HANSHI ZUO    MI         5.5       1655
## 6                 HANSEN SONG    OH         5.0       1686
## 7           GARY DEE SWATHELL    MI         5.0       1649
## 8            EZEKIEL HOUGHTON    MI         5.0       1641
## 9                 STEFANO LEE    ON         5.0       1411
## 10                  ANVIT RAO    MI         5.0       1365
## 11   CAMERON WILLIAM MC LEMAN    MI         4.5       1712
## 12             KENNETH J TACK    MI         4.5       1663
## 13          TORRANCE HENRY JR    MI         4.5       1666
## 14               BRADLEY SHAW    MI         4.5       1610
## 15     ZACHARY JAMES HOUGHTON    MI         4.5       1220
## 16               MIKE NIKITIN    MI         4.0       1604
## 17         RONALD GRZEGORCZYK    MI         4.0       1629
## 18              DAVID SUNDEEN    MI         4.0       1600
## 19               DIPANKAR ROY    MI         4.0       1564
## 20                JASON ZHENG    MI         4.0       1595
## 21              DINH DANG BUI    ON         4.0       1563
## 22           EUGENE L MCCLURE    MI         4.0       1555
## 23                   ALAN BUI    ON         4.0       1363
## 24          MICHAEL R ALDRICH    MI         4.0       1229
## 25           LOREN SCHWIEBERT    MI         3.5       1745
## 26                    MAX ZHU    ON         3.5       1579
## 27             GAURAV GIDWANI    MI         3.5       1552
## 28 SOFIA ADINA STANESCU-BELLU    MI         3.5       1507
## 29           CHIEDOZIE OKORIE    MI         3.5       1602
## 30         GEORGE AVERY JONES    ON         3.5       1522
## 31               RISHI SHETTY    MI         3.5       1494
## 32      JOSHUA PHILIP MATHEWS    ON         3.5       1441
## 33                    JADE GE    MI         3.5       1449
## 34     MICHAEL JEFFERY THOMAS    MI         3.5       1399
## 35           JOSHUA DAVID LEE    MI         3.5       1438
## 36              SIDDHARTH JHA    MI         3.5       1355
## 37       AMIYATOSH PWNANANDAM    MI         3.5        980
## 38                  BRIAN LIU    MI         3.0       1423
## 39              JOEL R HENDON    MI         3.0       1436
## 40               FOREST ZHANG    MI         3.0       1348
## 41        KYLE WILLIAM MURPHY    MI         3.0       1403
## 42                   JARED GE    MI         3.0       1332
## 43          ROBERT GLEN VASEY    MI         3.0       1283
## 44         JUSTIN D SCHILLING    MI         3.0       1199
## 45                  DEREK YAN    MI         3.0       1242
## 46   JACOB ALEXANDER LAVALLEY    MI         3.0        377
## 47                ERIC WRIGHT    MI         2.5       1362
## 48               DANIEL KHAIN    MI         2.5       1382
## 49           MICHAEL J MARTIN    MI         2.5       1291
## 50                 SHIVAM JHA    MI         2.5       1056
## 51             TEJAS AYYAGARI    MI         2.5       1011
## 52                  ETHAN GUO    MI         2.5        935
## 53              JOSE C YBARRA    MI         2.0       1393
## 54                LARRY HODGE    MI         2.0       1270
## 55                  ALEX KONG    MI         2.0       1186
## 56               MARISA RICCI    MI         2.0       1153
## 57                 MICHAEL LU    MI         2.0       1092
## 58               VIRAJ MOHILE    MI         2.0        917
## 59          SEAN M MC CORMICK    MI         2.0        853
## 60                 JULIA SHEN    MI         1.5        967
## 61              JEZZEL FARKAS    ON         1.5        955
## 62              ASHWIN BALAJI    MI         1.0       1530
## 63       THOMAS JOSEPH HOSMER    MI         1.0       1175
## 64                     BEN LI    MI         1.0       1163
##    Average_Pre_Rating
## 1                1605
## 2                1469
## 3                1564
## 4                1574
## 5                1501
## 6                1519
## 7                1372
## 8                1468
## 9                1523
## 10               1554
## 11               1468
## 12               1506
## 13               1498
## 14               1515
## 15               1484
## 16               1386
## 17               1499
## 18               1480
## 19               1426
## 20               1411
## 21               1470
## 22               1300
## 23               1214
## 24               1357
## 25               1363
## 26               1507
## 27               1222
## 28               1522
## 29               1314
## 30               1144
## 31               1260
## 32               1379
## 33               1277
## 34               1375
## 35               1150
## 36               1388
## 37               1385
## 38               1539
## 39               1430
## 40               1391
## 41               1248
## 42               1150
## 43               1107
## 44               1327
## 45               1152
## 46               1358
## 47               1392
## 48               1356
## 49               1286
## 50               1296
## 51               1356
## 52               1495
## 53               1345
## 54               1206
## 55               1406
## 56               1414
## 57               1363
## 58               1391
## 59               1319
## 60               1330
## 61               1327
## 62               1186
## 63               1350
## 64               1263
# Finally export the new Clean dataset into a .csv file
write.csv(chess_data_frame,
          "/home/jonboy1987/Desktop/CUNYSPS/IS607/Assignments/Project1/chess_tournament_data.csv",
          row.names = FALSE)