Load RawData

raw_data <- read_csv("https://raw.githubusercontent.com/alinsimon/data607/refs/heads/main/project1data.txt",show_col_types = FALSE)

#rename the column
colnames(raw_data)[1] <- "text_value"

raw_data
## # A tibble: 195 × 1
##    text_value                                                                   
##    <chr>                                                                        
##  1 Pair | Player Name                     |Total|Round|Round|Round|Round|Round|…
##  2 Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |…
##  3 ----------------------------------------------------------------------------…
##  4 1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  …
##  5 ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B …
##  6 ----------------------------------------------------------------------------…
##  7 2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  …
##  8 MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W …
##  9 ----------------------------------------------------------------------------…
## 10 3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  …
## # ℹ 185 more rows

Analyze Source File

After analyzing the source file, I realized that each player’s information is enclosed between dashes. For any row that starts with a number, I can extract the PairID, name, and Total. For rows that begin with two capital letters, I can obtain the remaining information. Consequently, two DataFrames were created with unique IDs so that they can be joined to generate the report file.

raw_data_All <- raw_data |>
  filter( row_number()!=1 , row_number()!=2)

Newrow <- 1
raw_data_ID_Info_PartI <- data.frame(PairID = character(),
                              PlayerName = character(),
                              total = character(), 
                              Round1 = character(), 
                              Round2 = character(), 
                              Round3 = character(), 
                              Round4 = character(), 
                              Round5 = character(), 
                              Round6 = character(), 
                              Round7 = character())


raw_data_ID_Info_PartII <- data.frame(PairID = character(),
                              State = character(),
                              uscf_id = character(), 
                              Rtg_Pre = character(), 
                              Rtg_Post = character(), 
                              Number = character())


for(i in 1:nrow(raw_data_All)){
  #if the row doesnt have -- 
  if (!grepl("--", raw_data_All[i, 1])) {
    #if row starts with id
      if (grepl("^[0-9]", raw_data_All[i, 1])) {
           temp  <-  raw_data_All[i, 1]|>
                                 separate(text_value, 
                                 into = c('PairID','PlayerName','Total','Round1',
                                 'Round2','Round3','Round4','Round5','Round6','Round7')
                                 , sep = "\\|", fill = "right")|>
                                 mutate(PairID = str_replace_all(PairID, " ", ""))
           
           raw_data_ID_Info_PartI<-bind_rows(raw_data_ID_Info_PartI,temp )
           PairID <- temp[1,'PairID'] # I will keep the ID so I can use it for the next row
           
      }else if (grepl("^[A-Z].{2}", raw_data_All[i, 1])){
          
           temp1  <-  raw_data_All[i, 1]|>
                                 separate(text_value, 
                                 into = c('State','uscf_id','Rtg_Pre','Rtg_Post',
                                 'Number')
                                 , sep = "[|]|[/]|[>]", fill = "right")|>
                                 mutate(across(everything(), str_trim)) |>
                                 mutate(Rtg_Pre = str_replace_all(Rtg_Pre, "[R: ]|[-]", ""))|>
                                 mutate(Number = str_replace_all(Number, "N:", ""))
           temp1[['PairID']] <- NA
           temp1[1,"PairID"] <- PairID
           
           raw_data_ID_Info_PartII<-bind_rows(raw_data_ID_Info_PartII,temp1 )
        }
  }
  
}

Merge both DataFrames

In this part, I will combine the two previous DataFrames based on the PairID key. Additionally, I will convert the Round columns to numeric format to calculate the averages for the other players.

#converting to Numeric
DF_Player <- inner_join(raw_data_ID_Info_PartI, raw_data_ID_Info_PartII, by = "PairID")|>
                                 mutate(across(everything(), str_trim))|>
                                  mutate(across(starts_with("Round"), 
                                  ~ str_replace_all(., " ", "")))|>
              mutate(across(starts_with("Round"), 
              ~ str_replace_all(., "[A-Z]", "")))|>
              mutate(across(starts_with("Round"), as.numeric))|>
              mutate(PairID = as.numeric(as.character(raw_data_ID_Info_PartI$PairID)))


DF_Player[['AveragePreChessRating']] <- NA

for(i in 1:nrow(DF_Player)){
  
  Previous_players <- DF_Player|>
  filter(  PairID == DF_Player[i, "Round1"] | PairID == DF_Player[i,"Round2"] |
           PairID == DF_Player[i,"Round3"] | PairID == DF_Player[i,"Round4"] |
           PairID == DF_Player[i,"Round5"] | PairID == DF_Player[i,"Round6"] |
           PairID == DF_Player[i,"Round7"] )|>
  select(Rtg_Pre)|>
  mutate(Rtg_Pre = as.numeric(str_remove(Rtg_Pre, "P.*")))
  
  if(nrow(Previous_players)>0){
    DF_Player[i,"AveragePreChessRating"] <-  mean(Previous_players[,"Rtg_Pre"],
                                                  na.rm = TRUE)|>
                                                  round()
  }
}

Export result into a csv file

#Export into csv file

DF_Player_Export <- DF_Player|>
  select(PlayerName,State,Total,Rtg_Pre,AveragePreChessRating)

DF_Player_Export
##                    PlayerName State Total Rtg_Pre AveragePreChessRating
## 1                    GARY HUA    ON   6.0    1794                  1605
## 2             DAKSHESH DARURI    MI   6.0    1553                  1469
## 3                ADITYA BAJAJ    MI   6.0    1384                  1564
## 4         PATRICK H SCHILLING    MI   5.5    1716                  1574
## 5                  HANSHI ZUO    MI   5.5    1655                  1501
## 6                 HANSEN SONG    OH   5.0    1686                  1519
## 7           GARY DEE SWATHELL    MI   5.0    1649                  1372
## 8            EZEKIEL HOUGHTON    MI   5.0 1641P17                  1468
## 9                 STEFANO LEE    ON   5.0    1411                  1523
## 10                  ANVIT RAO    MI   5.0    1365                  1554
## 11   CAMERON WILLIAM MC LEMAN    MI   4.5    1712                  1468
## 12             KENNETH J TACK    MI   4.5    1663                  1506
## 13          TORRANCE HENRY JR    MI   4.5    1666                  1498
## 14               BRADLEY SHAW    MI   4.5    1610                  1515
## 15     ZACHARY JAMES HOUGHTON    MI   4.5 1220P13                  1484
## 16               MIKE NIKITIN    MI   4.0    1604                  1386
## 17         RONALD GRZEGORCZYK    MI   4.0    1629                  1499
## 18              DAVID SUNDEEN    MI   4.0    1600                  1480
## 19               DIPANKAR ROY    MI   4.0    1564                  1426
## 20                JASON ZHENG    MI   4.0    1595                  1411
## 21              DINH DANG BUI    ON   4.0 1563P22                  1470
## 22           EUGENE L MCCLURE    MI   4.0    1555                  1300
## 23                   ALAN BUI    ON   4.0    1363                  1214
## 24          MICHAEL R ALDRICH    MI   4.0    1229                  1357
## 25           LOREN SCHWIEBERT    MI   3.5    1745                  1363
## 26                    MAX ZHU    ON   3.5    1579                  1507
## 27             GAURAV GIDWANI    MI   3.5    1552                  1222
## 28 SOFIA ADINA STANESCU-BELLU    MI   3.5    1507                  1522
## 29           CHIEDOZIE OKORIE    MI   3.5  1602P6                  1314
## 30         GEORGE AVERY JONES    ON   3.5    1522                  1144
## 31               RISHI SHETTY    MI   3.5    1494                  1260
## 32      JOSHUA PHILIP MATHEWS    ON   3.5    1441                  1379
## 33                    JADE GE    MI   3.5    1449                  1277
## 34     MICHAEL JEFFERY THOMAS    MI   3.5    1399                  1375
## 35           JOSHUA DAVID LEE    MI   3.5    1438                  1150
## 36              SIDDHARTH JHA    MI   3.5    1355                  1388
## 37       AMIYATOSH PWNANANDAM    MI   3.5  980P12                  1385
## 38                  BRIAN LIU    MI   3.0    1423                  1539
## 39              JOEL R HENDON    MI   3.0 1436P23                  1430
## 40               FOREST ZHANG    MI   3.0    1348                  1391
## 41        KYLE WILLIAM MURPHY    MI   3.0  1403P5                  1248
## 42                   JARED GE    MI   3.0    1332                  1150
## 43          ROBERT GLEN VASEY    MI   3.0    1283                  1107
## 44         JUSTIN D SCHILLING    MI   3.0    1199                  1327
## 45                  DEREK YAN    MI   3.0    1242                  1152
## 46   JACOB ALEXANDER LAVALLEY    MI   3.0   377P3                  1358
## 47                ERIC WRIGHT    MI   2.5    1362                  1392
## 48               DANIEL KHAIN    MI   2.5    1382                  1356
## 49           MICHAEL J MARTIN    MI   2.5 1291P12                  1286
## 50                 SHIVAM JHA    MI   2.5    1056                  1296
## 51             TEJAS AYYAGARI    MI   2.5    1011                  1356
## 52                  ETHAN GUO    MI   2.5     935                  1495
## 53              JOSE C YBARRA    MI   2.0    1393                  1345
## 54                LARRY HODGE    MI   2.0    1270                  1206
## 55                  ALEX KONG    MI   2.0    1186                  1406
## 56               MARISA RICCI    MI   2.0    1153                  1414
## 57                 MICHAEL LU    MI   2.0    1092                  1363
## 58               VIRAJ MOHILE    MI   2.0     917                  1391
## 59          SEAN M MC CORMICK    MI   2.0     853                  1319
## 60                 JULIA SHEN    MI   1.5     967                  1330
## 61              JEZZEL FARKAS    ON   1.5  955P11                  1327
## 62              ASHWIN BALAJI    MI   1.0    1530                  1186
## 63       THOMAS JOSEPH HOSMER    MI   1.0    1175                  1350
## 64                     BEN LI    MI   1.0    1163                  1263
write_csv(DF_Player_Export, "data/DF_Player.csv")