raw_data <- read_csv("https://raw.githubusercontent.com/alinsimon/data607/refs/heads/main/project1data.txt",show_col_types = FALSE)
#rename the column
colnames(raw_data)[1] <- "text_value"
raw_data
## # A tibble: 195 × 1
## text_value
## <chr>
## 1 Pair | Player Name |Total|Round|Round|Round|Round|Round|…
## 2 Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 |…
## 3 ----------------------------------------------------------------------------…
## 4 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D …
## 5 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B …
## 6 ----------------------------------------------------------------------------…
## 7 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W …
## 8 MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W …
## 9 ----------------------------------------------------------------------------…
## 10 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W …
## # ℹ 185 more rows
After analyzing the source file, I realized that each player’s information is enclosed between dashes. For any row that starts with a number, I can extract the PairID, name, and Total. For rows that begin with two capital letters, I can obtain the remaining information. Consequently, two DataFrames were created with unique IDs so that they can be joined to generate the report file.
raw_data_All <- raw_data |>
filter( row_number()!=1 , row_number()!=2)
Newrow <- 1
raw_data_ID_Info_PartI <- data.frame(PairID = character(),
PlayerName = character(),
total = character(),
Round1 = character(),
Round2 = character(),
Round3 = character(),
Round4 = character(),
Round5 = character(),
Round6 = character(),
Round7 = character())
raw_data_ID_Info_PartII <- data.frame(PairID = character(),
State = character(),
uscf_id = character(),
Rtg_Pre = character(),
Rtg_Post = character(),
Number = character())
for(i in 1:nrow(raw_data_All)){
#if the row doesnt have --
if (!grepl("--", raw_data_All[i, 1])) {
#if row starts with id
if (grepl("^[0-9]", raw_data_All[i, 1])) {
temp <- raw_data_All[i, 1]|>
separate(text_value,
into = c('PairID','PlayerName','Total','Round1',
'Round2','Round3','Round4','Round5','Round6','Round7')
, sep = "\\|", fill = "right")|>
mutate(PairID = str_replace_all(PairID, " ", ""))
raw_data_ID_Info_PartI<-bind_rows(raw_data_ID_Info_PartI,temp )
PairID <- temp[1,'PairID'] # I will keep the ID so I can use it for the next row
}else if (grepl("^[A-Z].{2}", raw_data_All[i, 1])){
temp1 <- raw_data_All[i, 1]|>
separate(text_value,
into = c('State','uscf_id','Rtg_Pre','Rtg_Post',
'Number')
, sep = "[|]|[/]|[>]", fill = "right")|>
mutate(across(everything(), str_trim)) |>
mutate(Rtg_Pre = str_replace_all(Rtg_Pre, "[R: ]|[-]", ""))|>
mutate(Number = str_replace_all(Number, "N:", ""))
temp1[['PairID']] <- NA
temp1[1,"PairID"] <- PairID
raw_data_ID_Info_PartII<-bind_rows(raw_data_ID_Info_PartII,temp1 )
}
}
}
In this part, I will combine the two previous DataFrames based on the PairID key. Additionally, I will convert the Round columns to numeric format to calculate the averages for the other players.
#converting to Numeric
DF_Player <- inner_join(raw_data_ID_Info_PartI, raw_data_ID_Info_PartII, by = "PairID")|>
mutate(across(everything(), str_trim))|>
mutate(across(starts_with("Round"),
~ str_replace_all(., " ", "")))|>
mutate(across(starts_with("Round"),
~ str_replace_all(., "[A-Z]", "")))|>
mutate(across(starts_with("Round"), as.numeric))|>
mutate(PairID = as.numeric(as.character(raw_data_ID_Info_PartI$PairID)))
DF_Player[['AveragePreChessRating']] <- NA
for(i in 1:nrow(DF_Player)){
Previous_players <- DF_Player|>
filter( PairID == DF_Player[i, "Round1"] | PairID == DF_Player[i,"Round2"] |
PairID == DF_Player[i,"Round3"] | PairID == DF_Player[i,"Round4"] |
PairID == DF_Player[i,"Round5"] | PairID == DF_Player[i,"Round6"] |
PairID == DF_Player[i,"Round7"] )|>
select(Rtg_Pre)|>
mutate(Rtg_Pre = as.numeric(str_remove(Rtg_Pre, "P.*")))
if(nrow(Previous_players)>0){
DF_Player[i,"AveragePreChessRating"] <- mean(Previous_players[,"Rtg_Pre"],
na.rm = TRUE)|>
round()
}
}
#Export into csv file
DF_Player_Export <- DF_Player|>
select(PlayerName,State,Total,Rtg_Pre,AveragePreChessRating)
DF_Player_Export
## PlayerName State Total Rtg_Pre AveragePreChessRating
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519
## 7 GARY DEE SWATHELL MI 5.0 1649 1372
## 8 EZEKIEL HOUGHTON MI 5.0 1641P17 1468
## 9 STEFANO LEE ON 5.0 1411 1523
## 10 ANVIT RAO MI 5.0 1365 1554
## 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712 1468
## 12 KENNETH J TACK MI 4.5 1663 1506
## 13 TORRANCE HENRY JR MI 4.5 1666 1498
## 14 BRADLEY SHAW MI 4.5 1610 1515
## 15 ZACHARY JAMES HOUGHTON MI 4.5 1220P13 1484
## 16 MIKE NIKITIN MI 4.0 1604 1386
## 17 RONALD GRZEGORCZYK MI 4.0 1629 1499
## 18 DAVID SUNDEEN MI 4.0 1600 1480
## 19 DIPANKAR ROY MI 4.0 1564 1426
## 20 JASON ZHENG MI 4.0 1595 1411
## 21 DINH DANG BUI ON 4.0 1563P22 1470
## 22 EUGENE L MCCLURE MI 4.0 1555 1300
## 23 ALAN BUI ON 4.0 1363 1214
## 24 MICHAEL R ALDRICH MI 4.0 1229 1357
## 25 LOREN SCHWIEBERT MI 3.5 1745 1363
## 26 MAX ZHU ON 3.5 1579 1507
## 27 GAURAV GIDWANI MI 3.5 1552 1222
## 28 SOFIA ADINA STANESCU-BELLU MI 3.5 1507 1522
## 29 CHIEDOZIE OKORIE MI 3.5 1602P6 1314
## 30 GEORGE AVERY JONES ON 3.5 1522 1144
## 31 RISHI SHETTY MI 3.5 1494 1260
## 32 JOSHUA PHILIP MATHEWS ON 3.5 1441 1379
## 33 JADE GE MI 3.5 1449 1277
## 34 MICHAEL JEFFERY THOMAS MI 3.5 1399 1375
## 35 JOSHUA DAVID LEE MI 3.5 1438 1150
## 36 SIDDHARTH JHA MI 3.5 1355 1388
## 37 AMIYATOSH PWNANANDAM MI 3.5 980P12 1385
## 38 BRIAN LIU MI 3.0 1423 1539
## 39 JOEL R HENDON MI 3.0 1436P23 1430
## 40 FOREST ZHANG MI 3.0 1348 1391
## 41 KYLE WILLIAM MURPHY MI 3.0 1403P5 1248
## 42 JARED GE MI 3.0 1332 1150
## 43 ROBERT GLEN VASEY MI 3.0 1283 1107
## 44 JUSTIN D SCHILLING MI 3.0 1199 1327
## 45 DEREK YAN MI 3.0 1242 1152
## 46 JACOB ALEXANDER LAVALLEY MI 3.0 377P3 1358
## 47 ERIC WRIGHT MI 2.5 1362 1392
## 48 DANIEL KHAIN MI 2.5 1382 1356
## 49 MICHAEL J MARTIN MI 2.5 1291P12 1286
## 50 SHIVAM JHA MI 2.5 1056 1296
## 51 TEJAS AYYAGARI MI 2.5 1011 1356
## 52 ETHAN GUO MI 2.5 935 1495
## 53 JOSE C YBARRA MI 2.0 1393 1345
## 54 LARRY HODGE MI 2.0 1270 1206
## 55 ALEX KONG MI 2.0 1186 1406
## 56 MARISA RICCI MI 2.0 1153 1414
## 57 MICHAEL LU MI 2.0 1092 1363
## 58 VIRAJ MOHILE MI 2.0 917 1391
## 59 SEAN M MC CORMICK MI 2.0 853 1319
## 60 JULIA SHEN MI 1.5 967 1330
## 61 JEZZEL FARKAS ON 1.5 955P11 1327
## 62 ASHWIN BALAJI MI 1.0 1530 1186
## 63 THOMAS JOSEPH HOSMER MI 1.0 1175 1350
## 64 BEN LI MI 1.0 1163 1263
write_csv(DF_Player_Export, "data/DF_Player.csv")