R Markdown

Intro: Given a txt block of data the contains chess tournament results, the objective is to extract different pieces of information using pattern matching and create a new table (and CSV) that contains only specific and relevant pieces of information.

Import Packages
# Packages
library(dplyr)
library(stringr)
Create DF with the txt file (uploaded to Github)
# Read txt file into data frame in R
txt_url <- "https://raw.githubusercontent.com/wberritt913/CUNY_DATA607/main/tournamentinfo.txt"
df1 <- read.table(url(txt_url), header = TRUE, sep = '\n')

# Check the dimensions and preview the data
dim(df1)
## [1] 195   1
head(df1, 30)
##    X.........................................................................................
## 1   Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 2   Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 3   -----------------------------------------------------------------------------------------
## 4       1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 5      ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 6   -----------------------------------------------------------------------------------------
## 7       2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## 8      MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## 9   -----------------------------------------------------------------------------------------
## 10      3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|
## 11     MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 12  -----------------------------------------------------------------------------------------
## 13      4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|
## 14     MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |
## 15  -----------------------------------------------------------------------------------------
## 16      5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|
## 17     MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## 18  -----------------------------------------------------------------------------------------
## 19      6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|
## 20     OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |
## 21  -----------------------------------------------------------------------------------------
## 22      7 | GARY DEE SWATHELL               |5.0  |W  57|W  46|W  13|W  11|L   1|W   9|L   2|
## 23     MI | 11146376 / R: 1649   ->1673     |N:3  |W    |B    |W    |B    |B    |W    |W    |
## 24  -----------------------------------------------------------------------------------------
## 25      8 | EZEKIEL HOUGHTON                |5.0  |W   3|W  32|L  14|L   9|W  47|W  28|W  19|
## 26     MI | 15142253 / R: 1641P17->1657P24  |N:3  |B    |W    |B    |W    |B    |W    |W    |
## 27  -----------------------------------------------------------------------------------------
## 28      9 | STEFANO LEE                     |5.0  |W  25|L  18|W  59|W   8|W  26|L   7|W  20|
## 29     ON | 14954524 / R: 1411   ->1564     |N:2  |W    |B    |W    |B    |W    |B    |B    |
## 30  -----------------------------------------------------------------------------------------
Extract player names
# Pattern match to extract the player names and create a data frame containing only their names
player_info <- data.frame(value=unlist(str_extract_all(toupper(df1), '[A-Z]+\\s[A-Z]+\\s[A-Z]*\\s?.?[A-Z]*')))
player_info <- player_info %>% slice(-1:-2)
player_info$value <- trimws(player_info$value)
head(player_info, 15)
##                       value
## 1                  GARY HUA
## 2           DAKSHESH DARURI
## 3              ADITYA BAJAJ
## 4       PATRICK H SCHILLING
## 5                HANSHI ZUO
## 6               HANSEN SONG
## 7         GARY DEE SWATHELL
## 8          EZEKIEL HOUGHTON
## 9               STEFANO LEE
## 10                ANVIT RAO
## 11 CAMERON WILLIAM MC LEMAN
## 12           KENNETH J TACK
## 13        TORRANCE HENRY JR
## 14             BRADLEY SHAW
## 15   ZACHARY JAMES HOUGHTON
Extract player states
# Pattern match to extract the player states and create a data frame containing only their states
states_df <- data.frame(value=unlist(str_extract_all(toupper(df1), '[^A-Z]\\s[A-Z][A-Z]\\s')))
states_df$value <- trimws(states_df$value)
# Preview
head(states_df, 15)
##    value
## 1     ON
## 2     MI
## 3     MI
## 4     MI
## 5     MI
## 6     OH
## 7     MI
## 8     MI
## 9     ON
## 10    MI
## 11    MI
## 12    MI
## 13    MI
## 14    MI
## 15    MI
Extract player total points
# Pattern match to extract the player total points and create a data frame containing only their total points
total_points <- data.frame(value = unlist(points <- str_extract_all(toupper(df1), '[0-9]\\.[0-9]')))
total_points$value <- trimws(total_points$value)
head(total_points, 15)
##    value
## 1    6.0
## 2    6.0
## 3    6.0
## 4    5.5
## 5    5.5
## 6    5.0
## 7    5.0
## 8    5.0
## 9    5.0
## 10   5.0
## 11   4.5
## 12   4.5
## 13   4.5
## 14   4.5
## 15   4.5
Extract player ratings before + fix class
# Pattern match to extract the player player ratings before and create a data frame containing only their player ratings before and check class
p_rating_before <- data.frame(value = unlist(str_extract_all(data.frame(value = unlist(str_extract_all(toupper(df1), '(:\\s\\s?[0-9])[0-9][0-9]?.?[0-9]?[0-9]?'))), '[0-9][0-9][0-9][0-9]?')))
class(p_rating_before$value)
## [1] "character"
# Change class
p_rating_before$value <- as.numeric(gsub(",", "", p_rating_before$value))
head(p_rating_before, 15)
##    value
## 1   1794
## 2   1553
## 3   1384
## 4   1716
## 5   1655
## 6   1686
## 7   1649
## 8   1641
## 9   1411
## 10  1365
## 11  1712
## 12  1663
## 13  1666
## 14  1610
## 15  1220
Extract opponents’ ratings
# Pattern match to extract the player opponents' ratings and create a data frame containing only their opponents' ratings
opponents_df <- str_extract_all(str_extract_all(str_extract_all(str_extract_all(df1, '\\|...............................................\\|'), '\\|[0-9]..............................................\\|'), '[A-Z]....\\|'), '..\\|')
opponents_df <- data.frame(value = unlist(opponents_df))
opponents_df$value <- gsub("\\|", "", opponents_df$value)
colnames(opponents_df)[colnames(opponents_df) == "value"] <- "index"
opponents_df$index <- as.numeric(opponents_df$index)
head(opponents_df, 15)
##    index
## 1     39
## 2     21
## 3     18
## 4     14
## 5      7
## 6     12
## 7      4
## 8     63
## 9     58
## 10     4
## 11    17
## 12    16
## 13    20
## 14     7
## 15     8
Create main data frame
# Combine all data frames into one data frame (keeping player_info as base data frame)
player_info$states <- states_df
player_info$total_points <- total_points
player_info$p_rating_before <- p_rating_before
player_info$index <- 1:nrow(player_info)
head(player_info, 15)
##                       value value value value index
## 1                  GARY HUA    ON   6.0  1794     1
## 2           DAKSHESH DARURI    MI   6.0  1553     2
## 3              ADITYA BAJAJ    MI   6.0  1384     3
## 4       PATRICK H SCHILLING    MI   5.5  1716     4
## 5                HANSHI ZUO    MI   5.5  1655     5
## 6               HANSEN SONG    OH   5.0  1686     6
## 7         GARY DEE SWATHELL    MI   5.0  1649     7
## 8          EZEKIEL HOUGHTON    MI   5.0  1641     8
## 9               STEFANO LEE    ON   5.0  1411     9
## 10                ANVIT RAO    MI   5.0  1365    10
## 11 CAMERON WILLIAM MC LEMAN    MI   4.5  1712    11
## 12           KENNETH J TACK    MI   4.5  1663    12
## 13        TORRANCE HENRY JR    MI   4.5  1666    13
## 14             BRADLEY SHAW    MI   4.5  1610    14
## 15   ZACHARY JAMES HOUGHTON    MI   4.5  1220    15
Join opponents’ rating data to main data frame
# Join the opponents data frame on the index
df3 <- left_join(opponents_df, player_info, by = 'index')
head(df3, 15)
##    index                value value value value
## 1     39        JOEL R HENDON    MI   3.0  1436
## 2     21        DINH DANG BUI    ON   4.0  1563
## 3     18        DAVID SUNDEEN    MI   4.0  1600
## 4     14         BRADLEY SHAW    MI   4.5  1610
## 5      7    GARY DEE SWATHELL    MI   5.0  1649
## 6     12       KENNETH J TACK    MI   4.5  1663
## 7      4  PATRICK H SCHILLING    MI   5.5  1716
## 8     63 THOMAS JOSEPH HOSMER    MI   1.0  1175
## 9     58         VIRAJ MOHILE    MI   2.0   917
## 10     4  PATRICK H SCHILLING    MI   5.5  1716
## 11    17   RONALD GRZEGORCZYK    MI   4.0  1629
## 12    16         MIKE NIKITIN    MI   4.0  1604
## 13    20          JASON ZHENG    MI   4.0  1595
## 14     7    GARY DEE SWATHELL    MI   5.0  1649
## 15     8     EZEKIEL HOUGHTON    MI   5.0  1641
Find average rating for each player’s opponents
# Get average of every 7 rows to calculate opponent average player rating before
groupings <- df3$p_rating_before %>%
  group_by(group = (row_number() - 1) %/% 7) 
averages <- groupings %>%
  summarise(avg_p_rating_before = round(mean(value, na.rm = TRUE)))
head(averages, 15)
## # A tibble: 15 × 2
##    group avg_p_rating_before
##    <dbl>               <dbl>
##  1     0                1605
##  2     1                1469
##  3     2                1564
##  4     3                1574
##  5     4                1501
##  6     5                1519
##  7     6                1372
##  8     7                1468
##  9     8                1523
## 10     9                1554
## 11    10                1468
## 12    11                1506
## 13    12                1498
## 14    13                1515
## 15    14                1484
Create new final data frame
# Add the average to the main data frame and remove the index common key
player_info$average_opponent_rating <- averages$avg_p_rating_before
player_info <- player_info %>% select(-index)
player_data <- player_info
player_data
##                         value value value value average_opponent_rating
## 1                    GARY HUA    ON   6.0  1794                    1605
## 2             DAKSHESH DARURI    MI   6.0  1553                    1469
## 3                ADITYA BAJAJ    MI   6.0  1384                    1564
## 4         PATRICK H SCHILLING    MI   5.5  1716                    1574
## 5                  HANSHI ZUO    MI   5.5  1655                    1501
## 6                 HANSEN SONG    OH   5.0  1686                    1519
## 7           GARY DEE SWATHELL    MI   5.0  1649                    1372
## 8            EZEKIEL HOUGHTON    MI   5.0  1641                    1468
## 9                 STEFANO LEE    ON   5.0  1411                    1523
## 10                  ANVIT RAO    MI   5.0  1365                    1554
## 11   CAMERON WILLIAM MC LEMAN    MI   4.5  1712                    1468
## 12             KENNETH J TACK    MI   4.5  1663                    1506
## 13          TORRANCE HENRY JR    MI   4.5  1666                    1498
## 14               BRADLEY SHAW    MI   4.5  1610                    1515
## 15     ZACHARY JAMES HOUGHTON    MI   4.5  1220                    1484
## 16               MIKE NIKITIN    MI   4.0  1604                    1386
## 17         RONALD GRZEGORCZYK    MI   4.0  1629                    1499
## 18              DAVID SUNDEEN    MI   4.0  1600                    1480
## 19               DIPANKAR ROY    MI   4.0  1564                    1426
## 20                JASON ZHENG    MI   4.0  1595                    1411
## 21              DINH DANG BUI    ON   4.0  1563                    1470
## 22           EUGENE L MCCLURE    MI   4.0  1555                    1300
## 23                   ALAN BUI    ON   4.0  1363                    1214
## 24          MICHAEL R ALDRICH    MI   4.0  1229                    1357
## 25           LOREN SCHWIEBERT    MI   3.5  1745                    1363
## 26                    MAX ZHU    ON   3.5  1579                    1507
## 27             GAURAV GIDWANI    MI   3.5  1552                    1222
## 28 SOFIA ADINA STANESCU-BELLU    MI   3.5  1507                    1522
## 29           CHIEDOZIE OKORIE    MI   3.5  1602                    1314
## 30         GEORGE AVERY JONES    ON   3.5  1522                    1144
## 31               RISHI SHETTY    MI   3.5  1494                    1260
## 32      JOSHUA PHILIP MATHEWS    ON   3.5  1441                    1379
## 33                    JADE GE    MI   3.5  1449                    1277
## 34     MICHAEL JEFFERY THOMAS    MI   3.5  1399                    1375
## 35           JOSHUA DAVID LEE    MI   3.5  1438                    1150
## 36              SIDDHARTH JHA    MI   3.5  1355                    1388
## 37       AMIYATOSH PWNANANDAM    MI   3.5   980                    1385
## 38                  BRIAN LIU    MI   3.0  1423                    1539
## 39              JOEL R HENDON    MI   3.0  1436                    1430
## 40               FOREST ZHANG    MI   3.0  1348                    1391
## 41        KYLE WILLIAM MURPHY    MI   3.0  1403                    1248
## 42                   JARED GE    MI   3.0  1332                    1150
## 43          ROBERT GLEN VASEY    MI   3.0  1283                    1107
## 44         JUSTIN D SCHILLING    MI   3.0  1199                    1327
## 45                  DEREK YAN    MI   3.0  1242                    1152
## 46   JACOB ALEXANDER LAVALLEY    MI   3.0   377                    1358
## 47                ERIC WRIGHT    MI   2.5  1362                    1392
## 48               DANIEL KHAIN    MI   2.5  1382                    1356
## 49           MICHAEL J MARTIN    MI   2.5  1291                    1286
## 50                 SHIVAM JHA    MI   2.5  1056                    1296
## 51             TEJAS AYYAGARI    MI   2.5  1011                    1356
## 52                  ETHAN GUO    MI   2.5   935                    1495
## 53              JOSE C YBARRA    MI   2.0  1393                    1345
## 54                LARRY HODGE    MI   2.0  1270                    1206
## 55                  ALEX KONG    MI   2.0  1186                    1406
## 56               MARISA RICCI    MI   2.0  1153                    1414
## 57                 MICHAEL LU    MI   2.0  1092                    1363
## 58               VIRAJ MOHILE    MI   2.0   917                    1391
## 59          SEAN M MC CORMICK    MI   2.0   853                    1319
## 60                 JULIA SHEN    MI   1.5   967                    1330
## 61              JEZZEL FARKAS    ON   1.5   955                    1327
## 62              ASHWIN BALAJI    MI   1.0  1530                    1186
## 63       THOMAS JOSEPH HOSMER    MI   1.0  1175                    1350
## 64                     BEN LI    MI   1.0  1163                    1263
Check classes for export - fix problematic classes
# Change the column classes
lapply(player_data, class)
## $value
## [1] "character"
## 
## $states
## [1] "data.frame"
## 
## $total_points
## [1] "data.frame"
## 
## $p_rating_before
## [1] "data.frame"
## 
## $average_opponent_rating
## [1] "numeric"
# Change column classes + names to create final data frame
player_data$states <- as.character(player_data$states$value)
player_data$total_points <- as.numeric(player_data$total_points$value)
player_data$p_rating_before <- as.numeric(player_data$p_rating_before$value)
player_data <- player_data[, c("value", "states", "total_points", "p_rating_before", "average_opponent_rating")]
# Check class again
lapply(player_data, class)
## $value
## [1] "character"
## 
## $states
## [1] "character"
## 
## $total_points
## [1] "numeric"
## 
## $p_rating_before
## [1] "numeric"
## 
## $average_opponent_rating
## [1] "numeric"
# Rename value column
colnames(player_data)[colnames(player_data) == "value"] <- "name"
Create CSV
# Create CSV
write.csv(player_data, "player_data_new1.csv", row.names = FALSE, quote = FALSE)

Conclusion: Pattern matching was originally difficult as I was trying to use one pattern to extract all the relevant info. Later realizing it may make more sense to layer patten matching over each other to iteratively extract the data. The appending of the data frames was also tricky. It’s important to verify the details of the data frames before combining them to ensure the final data frame is functional and contains the relevant information. This project is a strong testament the value of pattern matching and how it can be applied.