Load the data

#install.packages("stringr", repos = "https://cloud.r-project.org/")
library(stringr)   
## Warning: package 'stringr' was built under R version 4.4.1
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
## Warning: package 'readr' was built under R version 4.4.1
tournament_data <- read_lines("D:\\CUNY Data Science\\607 - Statistics and Probability for Data Analytics\\data\\tournamentinfo.txt")

head(tournament_data)
## [1] "-----------------------------------------------------------------------------------------" 
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [4] "-----------------------------------------------------------------------------------------" 
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

Parsing the data for the player’s info

cleaned_tournament_data <- tournament_data[!grepl("^[-]+$", tournament_data)]

# Player info: odd lines
player_info <- cleaned_tournament_data[seq(1, length(cleaned_tournament_data), by = 2)]   

# Ratings info: even lines
ratings_info <- cleaned_tournament_data[seq(2, length(cleaned_tournament_data), by = 2)] 

print(head(player_info))  
## [1] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [2] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [3] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
## [4] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|" 
## [5] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|" 
## [6] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"
print(head(ratings_info))  
## [1] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
## [3] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [4] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
## [5] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |" 
## [6] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"

Creating dataframes out the split data

split_by_pipe <- function(line, num_cols) {
  split_line <- str_trim(unlist(strsplit(line, "\\|")))
  length_diff <- num_cols - length(split_line)
  
  if (length_diff > 0) {
    split_line <- c(split_line, rep(NA, length_diff))
  }
  return(split_line)
}

# Find the maximum number of columns
max_cols_player <- max(sapply(player_info, function(x) length(unlist(strsplit(x, "\\|")))))
max_cols_ratings <- max(sapply(ratings_info, function(x) length(unlist(strsplit(x, "\\|")))))

# Apply the function to split each line
player_info_df <- do.call(rbind, lapply(player_info, split_by_pipe, num_cols = max_cols_player))
ratings_info_df <- do.call(rbind, lapply(ratings_info, split_by_pipe, num_cols = max_cols_ratings))

# Convert to data frames
player_info_df <- as.data.frame(player_info_df, stringsAsFactors = FALSE)
ratings_info_df <- as.data.frame(ratings_info_df, stringsAsFactors = FALSE)

player_header <- c("Pair Num", "Player Name", "Total Pts", "Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7")
colnames(player_info_df) <- player_header

ratings_header <- c("Pair Num", "USCF ID / Rtg (Pre->Post)", "Total Pts", "Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7")
colnames(ratings_info_df) <- ratings_header

player_info_df <- player_info_df[-1, ]
ratings_info_df <- ratings_info_df[-1, ]


print(head(player_info_df))
##   Pair Num         Player Name Total Pts Round1 Round2 Round3 Round4 Round5
## 2        1            GARY HUA       6.0  W  39  W  21  W  18  W  14  W   7
## 3        2     DAKSHESH DARURI       6.0  W  63  W  58  L   4  W  17  W  16
## 4        3        ADITYA BAJAJ       6.0  L   8  W  61  W  25  W  21  W  11
## 5        4 PATRICK H SCHILLING       5.5  W  23  D  28  W   2  W  26  D   5
## 6        5          HANSHI ZUO       5.5  W  45  W  37  D  12  D  13  D   4
## 7        6         HANSEN SONG       5.0  W  34  D  29  L  11  W  35  D  10
##   Round6 Round7   NA
## 2  D  12  D   4 <NA>
## 3  W  20  W   7 <NA>
## 4  W  13  W  12 <NA>
## 5  W  19  D   1 <NA>
## 6  W  14  W  17 <NA>
## 7  W  27  W  21 <NA>
print(head(ratings_info_df))
##   Pair Num   USCF ID / Rtg (Pre->Post) Total Pts Round1 Round2 Round3 Round4
## 2       ON 15445895 / R: 1794   ->1817       N:2      W      B      W      B
## 3       MI 14598900 / R: 1553   ->1663       N:2      B      W      B      W
## 4       MI 14959604 / R: 1384   ->1640       N:2      W      B      W      B
## 5       MI 12616049 / R: 1716   ->1744       N:2      W      B      W      B
## 6       MI 14601533 / R: 1655   ->1690       N:2      B      W      B      W
## 7       OH 15055204 / R: 1686   ->1687       N:3      W      B      W      B
##   Round5 Round6 Round7   NA
## 2      W      B      W <NA>
## 3      B      W      B <NA>
## 4      W      B      W <NA>
## 5      W      B      B <NA>
## 6      B      W      B <NA>
## 7      B      W      B <NA>

Extracting from the two dataframes before we create a final dataframe

player_names <- player_info_df$`Player Name`
head(player_names)
## [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
## [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"
player_state <- ratings_info_df$`Pair Num`
print(player_state)
##  [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI"
## [16] "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI" "MI" "ON"
## [31] "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [46] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [61] "ON" "MI" "MI" "MI"
total_points <- as.numeric(player_info_df$'Total Pts') 
print(total_points)
##  [1] 6.0 6.0 6.0 5.5 5.5 5.0 5.0 5.0 5.0 5.0 4.5 4.5 4.5 4.5 4.5 4.0 4.0 4.0 4.0
## [20] 4.0 4.0 4.0 4.0 4.0 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.0
## [39] 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.5 2.5 2.5 2.5 2.5 2.5 2.0 2.0 2.0 2.0 2.0
## [58] 2.0 2.0 1.5 1.5 1.0 1.0 1.0
rounds_df <- player_info_df[, c("Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7")]
#print(rounds_df)

round_results <- rounds_df  
round_opponents <- rounds_df  
for (col in colnames(rounds_df)) {

  round_results[[col]] <- gsub("[0-9]+", "", rounds_df[[col]])  
  
  round_opponents[[col]] <- gsub("[^0-9]", "", rounds_df[[col]])  
}

colnames(round_results) <- paste0("Round", 1:7, "_Result")

colnames(round_opponents) <- paste0("Round", 1:7, "_Opponent")


print(head(round_results))
##   Round1_Result Round2_Result Round3_Result Round4_Result Round5_Result
## 2           W             W             W             W            W   
## 3           W             W            L              W             W  
## 4          L              W             W             W             W  
## 5           W             D            W              W            D   
## 6           W             W             D             D            D   
## 7           W             D             L             W             D  
##   Round6_Result Round7_Result
## 2           D            D   
## 3           W            W   
## 4           W             W  
## 5           W            D   
## 6           W             W  
## 7           W             W
print(head(round_opponents))
##   Round1_Opponent Round2_Opponent Round3_Opponent Round4_Opponent
## 2              39              21              18              14
## 3              63              58               4              17
## 4               8              61              25              21
## 5              23              28               2              26
## 6              45              37              12              13
## 7              34              29              11              35
##   Round5_Opponent Round6_Opponent Round7_Opponent
## 2               7              12               4
## 3              16              20               7
## 4              11              13              12
## 5               5              19               1
## 6               4              14              17
## 7              10              27              21
ratings <- ratings_info_df$'USCF ID / Rtg (Pre->Post)'
head(ratings)
## [1] "15445895 / R: 1794   ->1817" "14598900 / R: 1553   ->1663"
## [3] "14959604 / R: 1384   ->1640" "12616049 / R: 1716   ->1744"
## [5] "14601533 / R: 1655   ->1690" "15055204 / R: 1686   ->1687"
pre_rating <- as.numeric(gsub(".*R:\\s*(\\d+).*", "\\1", ratings_info_df$`USCF ID / Rtg (Pre->Post)`))
print(pre_rating)
##  [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610 1220
## [16] 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507 1602 1522
## [31] 1494 1441 1449 1399 1438 1355  980 1423 1436 1348 1403 1332 1283 1199 1242
## [46]  377 1362 1382 1291 1056 1011  935 1393 1270 1186 1153 1092  917  853  967
## [61]  955 1530 1175 1163

Merging all my extracted results into a single dataframe

final_df <- data.frame(
  Player_Name = player_names,
  Player_State = player_state,
  Total_Points = total_points,
  Pre_Rating = pre_rating,
  Round1_Opponent = round_opponents$Round1_Opponent,
  Round1_Result = round_results$Round1_Result,
  Round2_Opponent = round_opponents$Round2_Opponent,
  Round2_Result = round_results$Round2_Result,
  Round3_Opponent = round_opponents$Round3_Opponent,
  Round3_Result = round_results$Round3_Result,
  Round4_Opponent = round_opponents$Round4_Opponent,
  Round4_Result = round_results$Round4_Result,
  Round5_Opponent = round_opponents$Round5_Opponent,
  Round5_Result = round_results$Round5_Result,
  Round6_Opponent = round_opponents$Round6_Opponent,
  Round6_Result = round_results$Round6_Result,
  Round7_Opponent = round_opponents$Round7_Opponent,
  Round7_Result = round_results$Round7_Result,
  stringsAsFactors = FALSE
)

print(head(final_df))
##           Player_Name Player_State Total_Points Pre_Rating Round1_Opponent
## 1            GARY HUA           ON          6.0       1794              39
## 2     DAKSHESH DARURI           MI          6.0       1553              63
## 3        ADITYA BAJAJ           MI          6.0       1384               8
## 4 PATRICK H SCHILLING           MI          5.5       1716              23
## 5          HANSHI ZUO           MI          5.5       1655              45
## 6         HANSEN SONG           OH          5.0       1686              34
##   Round1_Result Round2_Opponent Round2_Result Round3_Opponent Round3_Result
## 1           W                21           W                18           W  
## 2           W                58           W                 4          L   
## 3          L                 61           W                25           W  
## 4           W                28           D                 2          W   
## 5           W                37           W                12           D  
## 6           W                29           D                11           L  
##   Round4_Opponent Round4_Result Round5_Opponent Round5_Result Round6_Opponent
## 1              14           W                 7          W                 12
## 2              17           W                16           W                20
## 3              21           W                11           W                13
## 4              26           W                 5          D                 19
## 5              13           D                 4          D                 14
## 6              35           W                10           D                27
##   Round6_Result Round7_Opponent Round7_Result
## 1           D                 4          D   
## 2           W                 7          W   
## 3           W                12           W  
## 4           W                 1          D   
## 5           W                17           W  
## 6           W                21           W
avg_opponent_rating <- numeric(nrow(final_df))

for (i in 1:nrow(final_df)) {
  opponent_ratings <- c()
  
  opponent_numbers <- as.numeric(c(final_df$Round1_Opponent[i],
                                   final_df$Round2_Opponent[i],
                                   final_df$Round3_Opponent[i],
                                   final_df$Round4_Opponent[i],
                                   final_df$Round5_Opponent[i],
                                   final_df$Round6_Opponent[i],
                                   final_df$Round7_Opponent[i]))
  
  valid_opponents <- opponent_numbers[!is.na(opponent_numbers) & opponent_numbers > 0]
  if (length(valid_opponents) > 0) {
    for (opponent in valid_opponents) {
      opponent_rating <- final_df$Pre_Rating[opponent]  # Get the opponent's pre-rating
      opponent_ratings <- c(opponent_ratings, opponent_rating)  # Add to the list
    }
    avg_rating <- mean(as.numeric(opponent_ratings), na.rm = TRUE)
    print(avg_rating)
    avg_opponent_rating[i] <- avg_rating
  } else {
    avg_opponent_rating[i] <- NA
  }
}
## [1] 1605.286
## [1] 1469.286
## [1] 1563.571
## [1] 1573.571
## [1] 1500.857
## [1] 1518.714
## [1] 1372.143
## [1] 1468.429
## [1] 1523.143
## [1] 1554.143
## [1] 1467.571
## [1] 1506.167
## [1] 1497.857
## [1] 1515
## [1] 1483.857
## [1] 1385.8
## [1] 1498.571
## [1] 1480
## [1] 1426.286
## [1] 1410.857
## [1] 1470.429
## [1] 1300.333
## [1] 1213.857
## [1] 1357
## [1] 1363.286
## [1] 1506.857
## [1] 1221.667
## [1] 1522.143
## [1] 1313.5
## [1] 1144.143
## [1] 1259.857
## [1] 1378.714
## [1] 1276.857
## [1] 1375.286
## [1] 1149.714
## [1] 1388.167
## [1] 1384.8
## [1] 1539.167
## [1] 1429.571
## [1] 1390.571
## [1] 1248.5
## [1] 1149.857
## [1] 1106.571
## [1] 1327
## [1] 1152
## [1] 1357.714
## [1] 1392
## [1] 1355.8
## [1] 1285.8
## [1] 1296
## [1] 1356.143
## [1] 1494.571
## [1] 1345.333
## [1] 1206.167
## [1] 1406
## [1] 1414.4
## [1] 1363
## [1] 1391
## [1] 1319
## [1] 1330.2
## [1] 1327.286
## [1] 1186
## [1] 1350.2
## [1] 1263
final_df$Avg_Opponent_Rating <- avg_opponent_rating

print(head(final_df))
##           Player_Name Player_State Total_Points Pre_Rating Round1_Opponent
## 1            GARY HUA           ON          6.0       1794              39
## 2     DAKSHESH DARURI           MI          6.0       1553              63
## 3        ADITYA BAJAJ           MI          6.0       1384               8
## 4 PATRICK H SCHILLING           MI          5.5       1716              23
## 5          HANSHI ZUO           MI          5.5       1655              45
## 6         HANSEN SONG           OH          5.0       1686              34
##   Round1_Result Round2_Opponent Round2_Result Round3_Opponent Round3_Result
## 1           W                21           W                18           W  
## 2           W                58           W                 4          L   
## 3          L                 61           W                25           W  
## 4           W                28           D                 2          W   
## 5           W                37           W                12           D  
## 6           W                29           D                11           L  
##   Round4_Opponent Round4_Result Round5_Opponent Round5_Result Round6_Opponent
## 1              14           W                 7          W                 12
## 2              17           W                16           W                20
## 3              21           W                11           W                13
## 4              26           W                 5          D                 19
## 5              13           D                 4          D                 14
## 6              35           W                10           D                27
##   Round6_Result Round7_Opponent Round7_Result Avg_Opponent_Rating
## 1           D                 4          D               1605.286
## 2           W                 7          W               1469.286
## 3           W                12           W              1563.571
## 4           W                 1          D               1573.571
## 5           W                17           W              1500.857
## 6           W                21           W              1518.714

Writing my dataframe to csv file

write_csv(final_df, "D:\\CUNY Data Science\\607 - Statistics and Probability for Data Analytics\\output_data\\chess_tournament_averages.csv")