In this project we are given a text file of chess tournament results in which the information is laid out with some structure, but not as a table. The goal is to restructure and tidy that data into a CSV file.
We want the data tidy in a CSV file because the original text file is laid out to be easily read by a person, not parsed by a machine. For example, each player spans multiple lines, the delimiters are ambiguous, and some fields mix data types such as text + numeric (e.g. W 39).
# Load tournament data
# readLines() returns every raw line unchanged. read.table(sep = "\n") would
# still apply quote and comment processing, so an apostrophe, a double quote
# or a "#" inside a line could merge or truncate lines.
tournament_url <- "https://raw.githubusercontent.com/Emin-NYC/DATA607-week5project/refs/heads/main/tournamentinfo.txt"
tournament_data <- readLines(tournament_url, warn = FALSE)
# Drop blank lines so the four header lines stay in positions 1-4
tournament_data <- tournament_data[nzchar(trimws(tournament_data))]
# Print first few rows
print(head(tournament_data, 20))
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
## [11] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [12] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [13] "-----------------------------------------------------------------------------------------"
## [14] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [15] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [16] "-----------------------------------------------------------------------------------------"
## [17] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [18] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [19] "-----------------------------------------------------------------------------------------"
## [20] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
# Drop the four header lines and every dashed separator row. Negative indexing
# (unlike 5:length(x)) cannot count backwards when the input is very short.
clean_data <- tournament_data[-seq_len(4)]
clean_data <- clean_data[!grepl("-{10,}", clean_data)]

# Parse one player's two-line record.
#   info_line   - first line: pair number, name, total points, round results
#   rating_line - second line: state and "USCF ID / R: pre -> post"
#   line_no     - position of info_line in clean_data (used in messages only)
# Returns a list(pair, name, state, points, pre_rating, opponents), or NULL
# (after emitting a message) when a required field is missing or malformed.
parse_player_record <- function(info_line, rating_line, line_no) {
  # Split the lines into parts using "\\|" as delimiter
  info_parts <- strsplit(info_line, "\\|")[[1]]
  rating_parts <- strsplit(rating_line, "\\|")[[1]]

  # Player name; is.na() guards against lines with fewer than two fields
  name <- trimws(info_parts[2])
  if (is.na(name) || name == "") {
    message(paste("Skipping due to missing name at line", line_no))
    return(NULL)
  }

  # Pair number: the number opponents use to refer to this player
  pair <- suppressWarnings(as.integer(trimws(info_parts[1])))
  if (is.na(pair) || pair < 1L) {
    message(paste("Skipping due to missing pair number for player:", name))
    return(NULL)
  }

  # Player's total points in tournament
  points <- suppressWarnings(as.numeric(gsub("[^0-9.]", "", info_parts[3])))
  if (is.na(points)) {
    message(paste("Skipping due to missing points for player:", name))
    return(NULL)
  }

  # Player state from the first field of the second line
  state <- trimws(rating_parts[1])
  if (is.na(state) || state == "") {
    message(paste("Skipping due to missing state for player:", name))
    return(NULL)
  }

  # Rating before the tournament: the digits right after "R:". Checking for a
  # match first avoids coercing the unmatched text (sub() returns its input
  # unchanged when the pattern does not match).
  pre_rating <- NA_real_
  if (grepl("R: *[0-9]+", rating_parts[2])) {
    pre_rating <- as.numeric(sub(".*R: *([0-9]+).*", "\\1", rating_parts[2]))
  }
  if (is.na(pre_rating)) {
    message(paste(
      "Skipping due to missing or incorrect pre-rating for player:", name
    ))
    return(NULL)
  }

  # Opponent pair numbers: every run of digits in the round columns (fields
  # after the third). Rounds without an opponent contribute nothing.
  rounds <- info_parts[-seq_len(3)]
  opponent_numbers <- as.integer(
    unlist(regmatches(rounds, gregexpr("[0-9]+", rounds)))
  )

  list(
    pair = pair, name = name, state = state, points = points,
    pre_rating = pre_rating, opponents = opponent_numbers
  )
}

# Records come in pairs of lines; a trailing unpaired line is ignored
n_records <- length(clean_data) %/% 2
first_lines <- 2 * seq_len(n_records) - 1
parsed <- lapply(first_lines, function(i) {
  parse_player_record(clean_data[i], clean_data[i + 1], i)
})
parsed <- Filter(Negate(is.null), parsed)

pair_nums <- vapply(parsed, function(p) p[["pair"]], integer(1))
if (anyDuplicated(pair_nums) > 0) {
  warning(
    "Duplicate pair numbers found; later records overwrite earlier ones",
    call. = FALSE
  )
}
n_players <- if (length(pair_nums) > 0) max(pair_nums) else 0L

# Store every player at the position of their pair number so that opponent
# numbers index the right player even when a record was skipped. Skipped or
# absent pair numbers stay NA (and have no opponents).
player_names <- rep(NA_character_, n_players)
player_states <- rep(NA_character_, n_players)
total_points <- rep(NA_real_, n_players)
pre_ratings <- rep(NA_real_, n_players)
opponents <- rep(list(integer(0)), n_players)

player_names[pair_nums] <- vapply(
  parsed, function(p) p[["name"]], character(1)
)
player_states[pair_nums] <- vapply(
  parsed, function(p) p[["state"]], character(1)
)
total_points[pair_nums] <- vapply(
  parsed, function(p) p[["points"]], numeric(1)
)
pre_ratings[pair_nums] <- vapply(
  parsed, function(p) p[["pre_rating"]], numeric(1)
)
opponents[pair_nums] <- lapply(parsed, function(p) p[["opponents"]])
# Calculate average opponent ratings
# For each player, average the pre-tournament ratings of the opponents they
# played against, rounded to the nearest whole number. vapply() builds the
# numeric vector in one pass instead of growing it with c() inside a loop,
# and always yields a numeric result (NA_real_ when nothing can be averaged).
avg_opponent_ratings <- vapply(opponents, function(opp_indices) {
  # Keep only opponent numbers that point at an entry of pre_ratings
  valid <- !is.na(opp_indices) & opp_indices > 0 &
    opp_indices <= length(pre_ratings)
  opp_ratings <- pre_ratings[opp_indices[valid]]
  # Ignore opponents whose rating is unknown
  opp_ratings <- opp_ratings[!is.na(opp_ratings)]
  if (length(opp_ratings) == 0) {
    # No usable opponents: report NA rather than the NaN mean() would give
    return(NA_real_)
  }
  round(mean(opp_ratings))
}, numeric(1))
# Assemble one row per player from the parsed vectors
final_df <- data.frame(
  Player_Name = player_names,
  Player_State = player_states,
  Total_Points = total_points,
  Player_Pre_Rating = pre_ratings,
  Avg_Pre_Chess_Rating = avg_opponent_ratings
)
# Keep only the rows that have a total-points value
has_points <- !is.na(final_df[["Total_Points"]])
final_df <- final_df[has_points, ]
# Save the tidy table as CSV (without row names), then display it
write.csv(final_df, file = "chess_tournament_results.csv", row.names = FALSE)
print(final_df)
## Player_Name Player_State Total_Points Player_Pre_Rating
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## 7 GARY DEE SWATHELL MI 5.0 1649
## 8 EZEKIEL HOUGHTON MI 5.0 1641
## 9 STEFANO LEE ON 5.0 1411
## 10 ANVIT RAO MI 5.0 1365
## 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712
## 12 KENNETH J TACK MI 4.5 1663
## 13 TORRANCE HENRY JR MI 4.5 1666
## 14 BRADLEY SHAW MI 4.5 1610
## 15 ZACHARY JAMES HOUGHTON MI 4.5 1220
## 16 MIKE NIKITIN MI 4.0 1604
## 17 RONALD GRZEGORCZYK MI 4.0 1629
## 18 DAVID SUNDEEN MI 4.0 1600
## 19 DIPANKAR ROY MI 4.0 1564
## 20 JASON ZHENG MI 4.0 1595
## 21 DINH DANG BUI ON 4.0 1563
## 22 EUGENE L MCCLURE MI 4.0 1555
## 23 ALAN BUI ON 4.0 1363
## 24 MICHAEL R ALDRICH MI 4.0 1229
## 25 LOREN SCHWIEBERT MI 3.5 1745
## 26 MAX ZHU ON 3.5 1579
## 27 GAURAV GIDWANI MI 3.5 1552
## 28 SOFIA ADINA STANESCU-BELLU MI 3.5 1507
## 29 CHIEDOZIE OKORIE MI 3.5 1602
## 30 GEORGE AVERY JONES ON 3.5 1522
## 31 RISHI SHETTY MI 3.5 1494
## 32 JOSHUA PHILIP MATHEWS ON 3.5 1441
## 33 JADE GE MI 3.5 1449
## 34 MICHAEL JEFFERY THOMAS MI 3.5 1399
## 35 JOSHUA DAVID LEE MI 3.5 1438
## 36 SIDDHARTH JHA MI 3.5 1355
## 37 AMIYATOSH PWNANANDAM MI 3.5 980
## 38 BRIAN LIU MI 3.0 1423
## 39 JOEL R HENDON MI 3.0 1436
## 40 FOREST ZHANG MI 3.0 1348
## 41 KYLE WILLIAM MURPHY MI 3.0 1403
## 42 JARED GE MI 3.0 1332
## 43 ROBERT GLEN VASEY MI 3.0 1283
## 44 JUSTIN D SCHILLING MI 3.0 1199
## 45 DEREK YAN MI 3.0 1242
## 46 JACOB ALEXANDER LAVALLEY MI 3.0 377
## 47 ERIC WRIGHT MI 2.5 1362
## 48 DANIEL KHAIN MI 2.5 1382
## 49 MICHAEL J MARTIN MI 2.5 1291
## 50 SHIVAM JHA MI 2.5 1056
## 51 TEJAS AYYAGARI MI 2.5 1011
## 52 ETHAN GUO MI 2.5 935
## 53 JOSE C YBARRA MI 2.0 1393
## 54 LARRY HODGE MI 2.0 1270
## 55 ALEX KONG MI 2.0 1186
## 56 MARISA RICCI MI 2.0 1153
## 57 MICHAEL LU MI 2.0 1092
## 58 VIRAJ MOHILE MI 2.0 917
## 59 SEAN M MC CORMICK MI 2.0 853
## 60 JULIA SHEN MI 1.5 967
## 61 JEZZEL FARKAS ON 1.5 955
## 62 ASHWIN BALAJI MI 1.0 1530
## 63 THOMAS JOSEPH HOSMER MI 1.0 1175
## 64 BEN LI MI 1.0 1163
## Avg_Pre_Chess_Rating
## 1 1605
## 2 1469
## 3 1564
## 4 1574
## 5 1501
## 6 1519
## 7 1372
## 8 1468
## 9 1523
## 10 1554
## 11 1468
## 12 1506
## 13 1498
## 14 1515
## 15 1484
## 16 1386
## 17 1499
## 18 1480
## 19 1426
## 20 1411
## 21 1470
## 22 1300
## 23 1214
## 24 1357
## 25 1363
## 26 1507
## 27 1222
## 28 1522
## 29 1314
## 30 1144
## 31 1260
## 32 1379
## 33 1277
## 34 1375
## 35 1150
## 36 1388
## 37 1385
## 38 1539
## 39 1430
## 40 1391
## 41 1248
## 42 1150
## 43 1107
## 44 1327
## 45 1152
## 46 1358
## 47 1392
## 48 1356
## 49 1286
## 50 1296
## 51 1356
## 52 1495
## 53 1345
## 54 1206
## 55 1406
## 56 1414
## 57 1363
## 58 1391
## 59 1319
## 60 1330
## 61 1327
## 62 1186
## 63 1350
## 64 1263