Intro

In this project we are given a text file with chess tournament results where the information has some structure. The goal of this project is to structure and tidy the data into a CSV file.

We want the data tidy in a CSV file because the original text file is laid out to be read by a person, not parsed by a machine. For example, each player's record spans multiple lines, the delimiters are ambiguous, and some fields mix text and numbers (e.g. W 39).

First, we load the text file containing the tournament data.

# Load tournament data as a character vector with one element per line.
# readLines() is used rather than read.table(): even with sep = "\n",
# read.table() still treats ' and " as quote characters and # as a comment
# marker, so a line containing any of them (e.g. a name such as O'BRIEN)
# would be merged with its neighbours or truncated.
tournament_url <- "https://raw.githubusercontent.com/Emin-NYC/DATA607-week5project/refs/heads/main/tournamentinfo.txt"
tournament_data <- readLines(tournament_url, warn = FALSE)

# Drop blank lines so every player still occupies exactly two lines
tournament_data <- tournament_data[nzchar(trimws(tournament_data))]

# Print first few rows
print(head(tournament_data, 20))

##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------" 
## [11] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|" 
## [12] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
## [13] "-----------------------------------------------------------------------------------------" 
## [14] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|" 
## [15] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |" 
## [16] "-----------------------------------------------------------------------------------------" 
## [17] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|" 
## [18] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [19] "-----------------------------------------------------------------------------------------" 
## [20] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"

After loading the text file, we clean the data by removing the four header lines and the dashed separator rows, which leaves two lines per player.

# Drop the header: a dashed rule, two column-title rows, and another rule.
# Negative indexing is safe on short input, whereas 5:length(x) counts
# backwards (5, 4, ...) when fewer than five lines are present and would
# return NA and header lines instead of an empty result.
header_line_count <- 4
clean_data <- tournament_data[-seq_len(header_line_count)]

# Drop the dashed separator rows that sit between players
clean_data <- clean_data[!grepl("-{10,}", clean_data)]

After cleaning the data, we create empty lists (vectors) to store extracted info like player names, states, points scored, ratings, and opponents.

# Empty accumulators, one per extracted field. NULL is what c() returns, so
# appending with c(x, value) starts from a clean slate.
player_names <- NULL
player_states <- NULL
total_points <- NULL
pre_ratings <- NULL

# Opponents are kept in a list because each player has a variable number
opponents <- vector("list", 0)

After initializing our vectors, we extract the relevant information for each player by looping through the cleaned data two lines at a time because each player’s data takes up two lines.

# Each player occupies two consecutive lines of clean_data; a trailing
# unpaired line is ignored. seq_len() also copes with empty input, where
# seq(1, 0, by = 2) would raise an error.
n_records <- length(clean_data) %/% 2

# Preallocate one slot per record instead of growing the vectors inside the
# loop; slots that belong to skipped records are dropped after the loop.
pair_numbers <- rep(NA_real_, n_records)
player_names <- rep(NA_character_, n_records)
player_states <- rep(NA_character_, n_records)
total_points <- rep(NA_real_, n_records)
pre_ratings <- rep(NA_real_, n_records)
opponents <- vector("list", n_records)
keep_record <- logical(n_records)

for (k in seq_len(n_records)) {
  i <- 2 * k - 1  # index of this record's first line in clean_data

  # First line has player details (e.g. name and points scored)
  info_line <- clean_data[i]

  # Second line has the player's state and rating
  rating_line <- clean_data[i + 1]

  # Split the lines into fields using "|" as delimiter
  info_parts <- unlist(strsplit(info_line, "\\|"))
  rating_parts <- unlist(strsplit(rating_line, "\\|"))

  # Extract player name; is.na() guards lines that contain no "|" at all,
  # where a bare `name == ""` would make if() fail on NA
  name <- trimws(info_parts[2])
  if (is.na(name) || name == "") {
    message(paste("Skipping due to missing name at line", i))
    next
  }

  # Extract player's total points in tournament
  points <- as.numeric(gsub("[^0-9.]", "", info_parts[3]))
  if (is.na(points)) {
    message(paste("Skipping due to missing points for player:", name))
    next
  }

  # Extract player state from the first field of the second line
  state <- trimws(rating_parts[1])
  if (is.na(state) || state == "") {
    message(paste("Skipping due to missing state for player:", name))
    next
  }

  # Extract the player's rating before the tournament: the digits right
  # after "R:" (a provisional suffix such as "P17" is left out)
  pre_rating_text <- sub(".*R: *([0-9]+).*", "\\1", rating_parts[2])
  pre_rating <- as.numeric(pre_rating_text)
  if (is.na(pre_rating)) {
    message(paste(
      "Skipping due to missing or incorrect pre-rating for player:", name
    ))
    next
  }

  # Opponent pair numbers are the digit runs in the round fields (field 4
  # onwards). Rounds without an opponent contain only a letter and
  # contribute nothing. Negative indexing avoids 4:length(x), which counts
  # backwards when a line has fewer than four fields.
  round_fields <- info_parts[-seq_len(3)]
  opponent_numbers <- as.numeric(unlist(
    regmatches(round_fields, gregexpr("[0-9]+", round_fields))
  ))

  # Store the extracted information in this record's slot. An unparseable
  # pair number stays NA: the player is kept but cannot be looked up as an
  # opponent.
  pair_numbers[k] <- suppressWarnings(as.numeric(trimws(info_parts[1])))
  player_names[k] <- name
  player_states[k] <- state
  total_points[k] <- points
  pre_ratings[k] <- pre_rating
  opponents[[k]] <- opponent_numbers
  keep_record[k] <- TRUE
}

# Drop the slots of records that were skipped
pair_numbers <- pair_numbers[keep_record]
player_names <- player_names[keep_record]
player_states <- player_states[keep_record]
total_points <- total_points[keep_record]
pre_ratings <- pre_ratings[keep_record]
opponents <- opponents[keep_record]

# Opponents were read as pair numbers, but the averaging step indexes
# pre_ratings by position. Translate pair numbers to positions so lookups
# stay correct even when a record was skipped; an opponent with no parsed
# record becomes NA.
opponents <- lapply(opponents, function(opp) match(opp, pair_numbers))


# Calculate average opponent ratings
# For each player, average the pre-tournament ratings of the opponents they
# played, rounded to a whole number. vapply() sizes the result up front and
# always yields a numeric vector (numeric(0) when there are no players),
# instead of growing a vector whose type depends on the data.
avg_opponent_ratings <- vapply(
  opponents,
  function(opp_indices) {
    # Keep only indices that point at a known player
    is_valid <- !is.na(opp_indices) &
      opp_indices > 0 &
      opp_indices <= length(pre_ratings)
    opp_indices <- opp_indices[is_valid]

    # NA when the player had no valid opponents
    if (length(opp_indices) == 0) {
      return(NA_real_)
    }

    round(mean(pre_ratings[opp_indices], na.rm = TRUE))
  },
  numeric(1),
  USE.NAMES = FALSE
)

Finally, we assemble the final data frame, drop any rows with missing total points, and write the result to a CSV file.

# Destination of the tidy output
output_file <- "chess_tournament_results.csv"

# Combine the parallel per-player vectors into one row per player
final_df <- data.frame(
  Player_Name = player_names,
  Player_State = player_states,
  Total_Points = total_points,
  Player_Pre_Rating = pre_ratings,
  Avg_Pre_Chess_Rating = avg_opponent_ratings
)

# Keep only the rows that have a total-points value
has_points <- !is.na(final_df[["Total_Points"]])
final_df <- final_df[has_points, ]

# Save the result as CSV (without row names), then show it
write.csv(final_df, file = output_file, row.names = FALSE)
print(final_df)

##                   Player_Name Player_State Total_Points Player_Pre_Rating
## 1                    GARY HUA           ON          6.0              1794
## 2             DAKSHESH DARURI           MI          6.0              1553
## 3                ADITYA BAJAJ           MI          6.0              1384
## 4         PATRICK H SCHILLING           MI          5.5              1716
## 5                  HANSHI ZUO           MI          5.5              1655
## 6                 HANSEN SONG           OH          5.0              1686
## 7           GARY DEE SWATHELL           MI          5.0              1649
## 8            EZEKIEL HOUGHTON           MI          5.0              1641
## 9                 STEFANO LEE           ON          5.0              1411
## 10                  ANVIT RAO           MI          5.0              1365
## 11   CAMERON WILLIAM MC LEMAN           MI          4.5              1712
## 12             KENNETH J TACK           MI          4.5              1663
## 13          TORRANCE HENRY JR           MI          4.5              1666
## 14               BRADLEY SHAW           MI          4.5              1610
## 15     ZACHARY JAMES HOUGHTON           MI          4.5              1220
## 16               MIKE NIKITIN           MI          4.0              1604
## 17         RONALD GRZEGORCZYK           MI          4.0              1629
## 18              DAVID SUNDEEN           MI          4.0              1600
## 19               DIPANKAR ROY           MI          4.0              1564
## 20                JASON ZHENG           MI          4.0              1595
## 21              DINH DANG BUI           ON          4.0              1563
## 22           EUGENE L MCCLURE           MI          4.0              1555
## 23                   ALAN BUI           ON          4.0              1363
## 24          MICHAEL R ALDRICH           MI          4.0              1229
## 25           LOREN SCHWIEBERT           MI          3.5              1745
## 26                    MAX ZHU           ON          3.5              1579
## 27             GAURAV GIDWANI           MI          3.5              1552
## 28 SOFIA ADINA STANESCU-BELLU           MI          3.5              1507
## 29           CHIEDOZIE OKORIE           MI          3.5              1602
## 30         GEORGE AVERY JONES           ON          3.5              1522
## 31               RISHI SHETTY           MI          3.5              1494
## 32      JOSHUA PHILIP MATHEWS           ON          3.5              1441
## 33                    JADE GE           MI          3.5              1449
## 34     MICHAEL JEFFERY THOMAS           MI          3.5              1399
## 35           JOSHUA DAVID LEE           MI          3.5              1438
## 36              SIDDHARTH JHA           MI          3.5              1355
## 37       AMIYATOSH PWNANANDAM           MI          3.5               980
## 38                  BRIAN LIU           MI          3.0              1423
## 39              JOEL R HENDON           MI          3.0              1436
## 40               FOREST ZHANG           MI          3.0              1348
## 41        KYLE WILLIAM MURPHY           MI          3.0              1403
## 42                   JARED GE           MI          3.0              1332
## 43          ROBERT GLEN VASEY           MI          3.0              1283
## 44         JUSTIN D SCHILLING           MI          3.0              1199
## 45                  DEREK YAN           MI          3.0              1242
## 46   JACOB ALEXANDER LAVALLEY           MI          3.0               377
## 47                ERIC WRIGHT           MI          2.5              1362
## 48               DANIEL KHAIN           MI          2.5              1382
## 49           MICHAEL J MARTIN           MI          2.5              1291
## 50                 SHIVAM JHA           MI          2.5              1056
## 51             TEJAS AYYAGARI           MI          2.5              1011
## 52                  ETHAN GUO           MI          2.5               935
## 53              JOSE C YBARRA           MI          2.0              1393
## 54                LARRY HODGE           MI          2.0              1270
## 55                  ALEX KONG           MI          2.0              1186
## 56               MARISA RICCI           MI          2.0              1153
## 57                 MICHAEL LU           MI          2.0              1092
## 58               VIRAJ MOHILE           MI          2.0               917
## 59          SEAN M MC CORMICK           MI          2.0               853
## 60                 JULIA SHEN           MI          1.5               967
## 61              JEZZEL FARKAS           ON          1.5               955
## 62              ASHWIN BALAJI           MI          1.0              1530
## 63       THOMAS JOSEPH HOSMER           MI          1.0              1175
## 64                     BEN LI           MI          1.0              1163
##    Avg_Pre_Chess_Rating
## 1                  1605
## 2                  1469
## 3                  1564
## 4                  1574
## 5                  1501
## 6                  1519
## 7                  1372
## 8                  1468
## 9                  1523
## 10                 1554
## 11                 1468
## 12                 1506
## 13                 1498
## 14                 1515
## 15                 1484
## 16                 1386
## 17                 1499
## 18                 1480
## 19                 1426
## 20                 1411
## 21                 1470
## 22                 1300
## 23                 1214
## 24                 1357
## 25                 1363
## 26                 1507
## 27                 1222
## 28                 1522
## 29                 1314
## 30                 1144
## 31                 1260
## 32                 1379
## 33                 1277
## 34                 1375
## 35                 1150
## 36                 1388
## 37                 1385
## 38                 1539
## 39                 1430
## 40                 1391
## 41                 1248
## 42                 1150
## 43                 1107
## 44                 1327
## 45                 1152
## 46                 1358
## 47                 1392
## 48                 1356
## 49                 1286
## 50                 1296
## 51                 1356
## 52                 1495
## 53                 1345
## 54                 1206
## 55                 1406
## 56                 1414
## 57                 1363
## 58                 1391
## 59                 1319
## 60                 1330
## 61                 1327
## 62                 1186
## 63                 1350
## 64                 1263