In this project we clean up a particularly formatted text document
for a chess tournament and extract the average opponent pre-rating for
each player.
Import and cleanup
Steps
- We import the textfile from github where its been stored, remove the
dotted lines, and then combine the two rows per player into 1 row
raw_data <- readLines("https://raw.githubusercontent.com/jerryjerald27/Data-607/refs/heads/main/Week4Assignment/tournamentinfo.txt")[-(1:3)]
#remove the dotted lines
raw_data <- raw_data[str_detect(raw_data, '^\\-+$') == FALSE]
# #combine two lines to one row
combined_data <- raw_data %>%
str_trim() %>%
# .[str_detect(., "^\\-S")] %>%
enframe(name = NULL) %>%
mutate(row_num = rep(1:(n()/2), each = 2)) %>%
group_by(row_num) %>%
summarise(combined = paste(value, collapse = " ")) %>%
pull(combined)
knitr::kable((head(combined_data)),"simple")
| 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4| ON
| 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W | |
| 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W
20|W 7| MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B
| |
| 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W
12| MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W | |
| 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W
19|D 1| MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B
| |
| 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|
MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B | |
| 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W
21| OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B | |
- We then map this combined long observation into each of their
specific columns
# Split combined data by "|" and create a dataframe
raw_data_frame <- combined_data %>%
str_split(pattern = "\\|") %>%
map_dfr(~tibble(Player = .[1], Name = .[2], State =.[11], Total = .[3], Pre_Rating = .[12],
Round1 = .[4], Round2 = .[5], Round3 = .[6],
Round4 = .[7], Round5 = .[8], Round6 = .[9], Round7 = .[10]))
knitr::kable(head(raw_data_frame),"simple")
| 1 |
GARY HUA |
ON |
6.0 |
15445895 / R: 1794 ->1817 |
W 39 |
W 21 |
W 18 |
W 14 |
W 7 |
D 12 |
D 4 |
| 2 |
DAKSHESH DARURI |
MI |
6.0 |
14598900 / R: 1553 ->1663 |
W 63 |
W 58 |
L 4 |
W 17 |
W 16 |
W 20 |
W 7 |
| 3 |
ADITYA BAJAJ |
MI |
6.0 |
14959604 / R: 1384 ->1640 |
L 8 |
W 61 |
W 25 |
W 21 |
W 11 |
W 13 |
W 12 |
| 4 |
PATRICK H SCHILLING |
MI |
5.5 |
12616049 / R: 1716 ->1744 |
W 23 |
D 28 |
W 2 |
W 26 |
D 5 |
W 19 |
D 1 |
| 5 |
HANSHI ZUO |
MI |
5.5 |
14601533 / R: 1655 ->1690 |
W 45 |
W 37 |
D 12 |
D 13 |
D 4 |
W 14 |
W 17 |
| 6 |
HANSEN SONG |
OH |
5.0 |
15055204 / R: 1686 ->1687 |
W 34 |
D 29 |
L 11 |
W 35 |
D 10 |
W 27 |
W 21 |
- We can now mutate the columns to further clean up the Pre rating
column to only include the rating, and the Rounds column to only include
the Opponent ID and not the game result
raw_data_frame <- raw_data_frame %>%
mutate(
Pre_Rating = as.numeric(str_extract(Pre_Rating, "(?<=R:\\s{1,2})\\d{3,4}")), # Gets just the prerating from the prerating column
across(starts_with("Round"), ~ gsub("[^0-9]", "", .)), #mutates all columns starting with rounds to only include digits
Player = as.numeric(Player)
)
knitr::kable(head(raw_data_frame),"simple")
| 1 |
GARY HUA |
ON |
6.0 |
1794 |
39 |
21 |
18 |
14 |
7 |
12 |
4 |
| 2 |
DAKSHESH DARURI |
MI |
6.0 |
1553 |
63 |
58 |
4 |
17 |
16 |
20 |
7 |
| 3 |
ADITYA BAJAJ |
MI |
6.0 |
1384 |
8 |
61 |
25 |
21 |
11 |
13 |
12 |
| 4 |
PATRICK H SCHILLING |
MI |
5.5 |
1716 |
23 |
28 |
2 |
26 |
5 |
19 |
1 |
| 5 |
HANSHI ZUO |
MI |
5.5 |
1655 |
45 |
37 |
12 |
13 |
4 |
14 |
17 |
| 6 |
HANSEN SONG |
OH |
5.0 |
1686 |
34 |
29 |
11 |
35 |
10 |
27 |
21 |
Calculating average Opponent ratings
This is done in 2 steps
- For each player we replace the opponent Ids for each of the rounds
with the Pre_ratings of the opponents
round_columns <- paste0("Round", 1:7)
for (col in round_columns) {
raw_data_frame[[col]] <- sapply(raw_data_frame[[col]], function(x) {
rating <- raw_data_frame$Pre_Rating[raw_data_frame$Player == x]
if (length(rating) > 0) rating else NA
})
}
knitr::kable(head(raw_data_frame),"simple")
| 1 |
GARY HUA |
ON |
6.0 |
1794 |
1436 |
1563 |
1600 |
1610 |
1649 |
1663 |
1716 |
| 2 |
DAKSHESH DARURI |
MI |
6.0 |
1553 |
1175 |
917 |
1716 |
1629 |
1604 |
1595 |
1649 |
| 3 |
ADITYA BAJAJ |
MI |
6.0 |
1384 |
1641 |
955 |
1745 |
1563 |
1712 |
1666 |
1663 |
| 4 |
PATRICK H SCHILLING |
MI |
5.5 |
1716 |
1363 |
1507 |
1553 |
1579 |
1655 |
1564 |
1794 |
| 5 |
HANSHI ZUO |
MI |
5.5 |
1655 |
1242 |
980 |
1663 |
1666 |
1716 |
1610 |
1629 |
| 6 |
HANSEN SONG |
OH |
5.0 |
1686 |
1399 |
1602 |
1712 |
1438 |
1365 |
1552 |
1563 |
- We then Average out the ratings across the rounds
# Calculate average opponent pre-rating
raw_data_frame$Avg_Opp_Pre_rating <- round(rowMeans(raw_data_frame[round_columns], na.rm = TRUE))
Average_chess_rating <- raw_data_frame %>% select(Name, State, Total, Pre_Rating, Avg_Opp_Pre_rating)
write.csv(Average_chess_rating, "cleaned_chess.csv", row.names = TRUE)
Conclusion
We can now view the completed table and export it into a .csv
file
knitr::kable((Average_chess_rating), "simple")
| GARY HUA |
ON |
6.0 |
1794 |
1605 |
| DAKSHESH DARURI |
MI |
6.0 |
1553 |
1469 |
| ADITYA BAJAJ |
MI |
6.0 |
1384 |
1564 |
| PATRICK H SCHILLING |
MI |
5.5 |
1716 |
1574 |
| HANSHI ZUO |
MI |
5.5 |
1655 |
1501 |
| HANSEN SONG |
OH |
5.0 |
1686 |
1519 |
| GARY DEE SWATHELL |
MI |
5.0 |
1649 |
1372 |
| EZEKIEL HOUGHTON |
MI |
5.0 |
1641 |
1468 |
| STEFANO LEE |
ON |
5.0 |
1411 |
1523 |
| ANVIT RAO |
MI |
5.0 |
1365 |
1554 |
| CAMERON WILLIAM MC LEMAN |
MI |
4.5 |
1712 |
1468 |
| KENNETH J TACK |
MI |
4.5 |
1663 |
1506 |
| TORRANCE HENRY JR |
MI |
4.5 |
1666 |
1498 |
| BRADLEY SHAW |
MI |
4.5 |
1610 |
1515 |
| ZACHARY JAMES HOUGHTON |
MI |
4.5 |
1220 |
1484 |
| MIKE NIKITIN |
MI |
4.0 |
1604 |
1386 |
| RONALD GRZEGORCZYK |
MI |
4.0 |
1629 |
1499 |
| DAVID SUNDEEN |
MI |
4.0 |
1600 |
1480 |
| DIPANKAR ROY |
MI |
4.0 |
1564 |
1426 |
| JASON ZHENG |
MI |
4.0 |
1595 |
1411 |
| DINH DANG BUI |
ON |
4.0 |
1563 |
1470 |
| EUGENE L MCCLURE |
MI |
4.0 |
1555 |
1300 |
| ALAN BUI |
ON |
4.0 |
1363 |
1214 |
| MICHAEL R ALDRICH |
MI |
4.0 |
1229 |
1357 |
| LOREN SCHWIEBERT |
MI |
3.5 |
1745 |
1363 |
| MAX ZHU |
ON |
3.5 |
1579 |
1507 |
| GAURAV GIDWANI |
MI |
3.5 |
1552 |
1222 |
| SOFIA ADINA STANESCU-BELLU |
MI |
3.5 |
1507 |
1522 |
| CHIEDOZIE OKORIE |
MI |
3.5 |
1602 |
1314 |
| GEORGE AVERY JONES |
ON |
3.5 |
1522 |
1144 |
| RISHI SHETTY |
MI |
3.5 |
1494 |
1260 |
| JOSHUA PHILIP MATHEWS |
ON |
3.5 |
1441 |
1379 |
| JADE GE |
MI |
3.5 |
1449 |
1277 |
| MICHAEL JEFFERY THOMAS |
MI |
3.5 |
1399 |
1375 |
| JOSHUA DAVID LEE |
MI |
3.5 |
1438 |
1150 |
| SIDDHARTH JHA |
MI |
3.5 |
1355 |
1388 |
| AMIYATOSH PWNANANDAM |
MI |
3.5 |
980 |
1385 |
| BRIAN LIU |
MI |
3.0 |
1423 |
1539 |
| JOEL R HENDON |
MI |
3.0 |
1436 |
1430 |
| FOREST ZHANG |
MI |
3.0 |
1348 |
1391 |
| KYLE WILLIAM MURPHY |
MI |
3.0 |
1403 |
1248 |
| JARED GE |
MI |
3.0 |
1332 |
1150 |
| ROBERT GLEN VASEY |
MI |
3.0 |
1283 |
1107 |
| JUSTIN D SCHILLING |
MI |
3.0 |
1199 |
1327 |
| DEREK YAN |
MI |
3.0 |
1242 |
1152 |
| JACOB ALEXANDER LAVALLEY |
MI |
3.0 |
377 |
1358 |
| ERIC WRIGHT |
MI |
2.5 |
1362 |
1392 |
| DANIEL KHAIN |
MI |
2.5 |
1382 |
1356 |
| MICHAEL J MARTIN |
MI |
2.5 |
1291 |
1286 |
| SHIVAM JHA |
MI |
2.5 |
1056 |
1296 |
| TEJAS AYYAGARI |
MI |
2.5 |
1011 |
1356 |
| ETHAN GUO |
MI |
2.5 |
935 |
1495 |
| JOSE C YBARRA |
MI |
2.0 |
1393 |
1345 |
| LARRY HODGE |
MI |
2.0 |
1270 |
1206 |
| ALEX KONG |
MI |
2.0 |
1186 |
1406 |
| MARISA RICCI |
MI |
2.0 |
1153 |
1414 |
| MICHAEL LU |
MI |
2.0 |
1092 |
1363 |
| VIRAJ MOHILE |
MI |
2.0 |
917 |
1391 |
| SEAN M MC CORMICK |
MI |
2.0 |
853 |
1319 |
| JULIA SHEN |
MI |
1.5 |
967 |
1330 |
| JEZZEL FARKAS |
ON |
1.5 |
955 |
1327 |
| ASHWIN BALAJI |
MI |
1.0 |
1530 |
1186 |
| THOMAS JOSEPH HOSMER |
MI |
1.0 |
1175 |
1350 |
| BEN LI |
MI |
1.0 |
1163 |
1263 |