Load the data
#install.packages("stringr", repos = "https://cloud.r-project.org/")
library(stringr)
## Warning: package 'stringr' was built under R version 4.4.1
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
## Warning: package 'readr' was built under R version 4.4.1
tournament_data <- read_lines("D:\\CUNY Data Science\\607 - Statistics and Probability for Data Analytics\\data\\tournamentinfo.txt")
head(tournament_data)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
Parsing the data for the player’s info
cleaned_tournament_data <- tournament_data[!grepl("^[-]+$", tournament_data)]
# Player info: odd lines
player_info <- cleaned_tournament_data[seq(1, length(cleaned_tournament_data), by = 2)]
# Ratings info: even lines
ratings_info <- cleaned_tournament_data[seq(2, length(cleaned_tournament_data), by = 2)]
print(head(player_info))
## [1] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [2] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [3] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [4] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [5] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [6] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
print(head(ratings_info))
## [1] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [4] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [5] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [6] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
Creating dataframes out the split data
split_by_pipe <- function(line, num_cols) {
split_line <- str_trim(unlist(strsplit(line, "\\|")))
length_diff <- num_cols - length(split_line)
if (length_diff > 0) {
split_line <- c(split_line, rep(NA, length_diff))
}
return(split_line)
}
# Find the maximum number of columns
max_cols_player <- max(sapply(player_info, function(x) length(unlist(strsplit(x, "\\|")))))
max_cols_ratings <- max(sapply(ratings_info, function(x) length(unlist(strsplit(x, "\\|")))))
# Apply the function to split each line
player_info_df <- do.call(rbind, lapply(player_info, split_by_pipe, num_cols = max_cols_player))
ratings_info_df <- do.call(rbind, lapply(ratings_info, split_by_pipe, num_cols = max_cols_ratings))
# Convert to data frames
player_info_df <- as.data.frame(player_info_df, stringsAsFactors = FALSE)
ratings_info_df <- as.data.frame(ratings_info_df, stringsAsFactors = FALSE)
player_header <- c("Pair Num", "Player Name", "Total Pts", "Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7")
colnames(player_info_df) <- player_header
ratings_header <- c("Pair Num", "USCF ID / Rtg (Pre->Post)", "Total Pts", "Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7")
colnames(ratings_info_df) <- ratings_header
player_info_df <- player_info_df[-1, ]
ratings_info_df <- ratings_info_df[-1, ]
print(head(player_info_df))
## Pair Num Player Name Total Pts Round1 Round2 Round3 Round4 Round5
## 2 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7
## 3 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16
## 4 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21 W 11
## 5 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26 D 5
## 6 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13 D 4
## 7 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35 D 10
## Round6 Round7 NA
## 2 D 12 D 4 <NA>
## 3 W 20 W 7 <NA>
## 4 W 13 W 12 <NA>
## 5 W 19 D 1 <NA>
## 6 W 14 W 17 <NA>
## 7 W 27 W 21 <NA>
print(head(ratings_info_df))
## Pair Num USCF ID / Rtg (Pre->Post) Total Pts Round1 Round2 Round3 Round4
## 2 ON 15445895 / R: 1794 ->1817 N:2 W B W B
## 3 MI 14598900 / R: 1553 ->1663 N:2 B W B W
## 4 MI 14959604 / R: 1384 ->1640 N:2 W B W B
## 5 MI 12616049 / R: 1716 ->1744 N:2 W B W B
## 6 MI 14601533 / R: 1655 ->1690 N:2 B W B W
## 7 OH 15055204 / R: 1686 ->1687 N:3 W B W B
## Round5 Round6 Round7 NA
## 2 W B W <NA>
## 3 B W B <NA>
## 4 W B W <NA>
## 5 W B B <NA>
## 6 B W B <NA>
## 7 B W B <NA>
Extracting from the two dataframes before we create a final dataframe
player_names <- player_info_df$`Player Name`
head(player_names)
## [1] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ"
## [4] "PATRICK H SCHILLING" "HANSHI ZUO" "HANSEN SONG"
player_state <- ratings_info_df$`Pair Num`
print(player_state)
## [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI"
## [16] "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI" "MI" "ON"
## [31] "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [46] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [61] "ON" "MI" "MI" "MI"
total_points <- as.numeric(player_info_df$'Total Pts')
print(total_points)
## [1] 6.0 6.0 6.0 5.5 5.5 5.0 5.0 5.0 5.0 5.0 4.5 4.5 4.5 4.5 4.5 4.0 4.0 4.0 4.0
## [20] 4.0 4.0 4.0 4.0 4.0 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.0
## [39] 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.5 2.5 2.5 2.5 2.5 2.5 2.0 2.0 2.0 2.0 2.0
## [58] 2.0 2.0 1.5 1.5 1.0 1.0 1.0
rounds_df <- player_info_df[, c("Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7")]
#print(rounds_df)
round_results <- rounds_df
round_opponents <- rounds_df
for (col in colnames(rounds_df)) {
round_results[[col]] <- gsub("[0-9]+", "", rounds_df[[col]])
round_opponents[[col]] <- gsub("[^0-9]", "", rounds_df[[col]])
}
colnames(round_results) <- paste0("Round", 1:7, "_Result")
colnames(round_opponents) <- paste0("Round", 1:7, "_Opponent")
print(head(round_results))
## Round1_Result Round2_Result Round3_Result Round4_Result Round5_Result
## 2 W W W W W
## 3 W W L W W
## 4 L W W W W
## 5 W D W W D
## 6 W W D D D
## 7 W D L W D
## Round6_Result Round7_Result
## 2 D D
## 3 W W
## 4 W W
## 5 W D
## 6 W W
## 7 W W
print(head(round_opponents))
## Round1_Opponent Round2_Opponent Round3_Opponent Round4_Opponent
## 2 39 21 18 14
## 3 63 58 4 17
## 4 8 61 25 21
## 5 23 28 2 26
## 6 45 37 12 13
## 7 34 29 11 35
## Round5_Opponent Round6_Opponent Round7_Opponent
## 2 7 12 4
## 3 16 20 7
## 4 11 13 12
## 5 5 19 1
## 6 4 14 17
## 7 10 27 21
ratings <- ratings_info_df$'USCF ID / Rtg (Pre->Post)'
head(ratings)
## [1] "15445895 / R: 1794 ->1817" "14598900 / R: 1553 ->1663"
## [3] "14959604 / R: 1384 ->1640" "12616049 / R: 1716 ->1744"
## [5] "14601533 / R: 1655 ->1690" "15055204 / R: 1686 ->1687"
pre_rating <- as.numeric(gsub(".*R:\\s*(\\d+).*", "\\1", ratings_info_df$`USCF ID / Rtg (Pre->Post)`))
print(pre_rating)
## [1] 1794 1553 1384 1716 1655 1686 1649 1641 1411 1365 1712 1663 1666 1610 1220
## [16] 1604 1629 1600 1564 1595 1563 1555 1363 1229 1745 1579 1552 1507 1602 1522
## [31] 1494 1441 1449 1399 1438 1355 980 1423 1436 1348 1403 1332 1283 1199 1242
## [46] 377 1362 1382 1291 1056 1011 935 1393 1270 1186 1153 1092 917 853 967
## [61] 955 1530 1175 1163
Merging all my extracted results into a single dataframe
final_df <- data.frame(
Player_Name = player_names,
Player_State = player_state,
Total_Points = total_points,
Pre_Rating = pre_rating,
Round1_Opponent = round_opponents$Round1_Opponent,
Round1_Result = round_results$Round1_Result,
Round2_Opponent = round_opponents$Round2_Opponent,
Round2_Result = round_results$Round2_Result,
Round3_Opponent = round_opponents$Round3_Opponent,
Round3_Result = round_results$Round3_Result,
Round4_Opponent = round_opponents$Round4_Opponent,
Round4_Result = round_results$Round4_Result,
Round5_Opponent = round_opponents$Round5_Opponent,
Round5_Result = round_results$Round5_Result,
Round6_Opponent = round_opponents$Round6_Opponent,
Round6_Result = round_results$Round6_Result,
Round7_Opponent = round_opponents$Round7_Opponent,
Round7_Result = round_results$Round7_Result,
stringsAsFactors = FALSE
)
print(head(final_df))
## Player_Name Player_State Total_Points Pre_Rating Round1_Opponent
## 1 GARY HUA ON 6.0 1794 39
## 2 DAKSHESH DARURI MI 6.0 1553 63
## 3 ADITYA BAJAJ MI 6.0 1384 8
## 4 PATRICK H SCHILLING MI 5.5 1716 23
## 5 HANSHI ZUO MI 5.5 1655 45
## 6 HANSEN SONG OH 5.0 1686 34
## Round1_Result Round2_Opponent Round2_Result Round3_Opponent Round3_Result
## 1 W 21 W 18 W
## 2 W 58 W 4 L
## 3 L 61 W 25 W
## 4 W 28 D 2 W
## 5 W 37 W 12 D
## 6 W 29 D 11 L
## Round4_Opponent Round4_Result Round5_Opponent Round5_Result Round6_Opponent
## 1 14 W 7 W 12
## 2 17 W 16 W 20
## 3 21 W 11 W 13
## 4 26 W 5 D 19
## 5 13 D 4 D 14
## 6 35 W 10 D 27
## Round6_Result Round7_Opponent Round7_Result
## 1 D 4 D
## 2 W 7 W
## 3 W 12 W
## 4 W 1 D
## 5 W 17 W
## 6 W 21 W
avg_opponent_rating <- numeric(nrow(final_df))
for (i in 1:nrow(final_df)) {
opponent_ratings <- c()
opponent_numbers <- as.numeric(c(final_df$Round1_Opponent[i],
final_df$Round2_Opponent[i],
final_df$Round3_Opponent[i],
final_df$Round4_Opponent[i],
final_df$Round5_Opponent[i],
final_df$Round6_Opponent[i],
final_df$Round7_Opponent[i]))
valid_opponents <- opponent_numbers[!is.na(opponent_numbers) & opponent_numbers > 0]
if (length(valid_opponents) > 0) {
for (opponent in valid_opponents) {
opponent_rating <- final_df$Pre_Rating[opponent] # Get the opponent's pre-rating
opponent_ratings <- c(opponent_ratings, opponent_rating) # Add to the list
}
avg_rating <- mean(as.numeric(opponent_ratings), na.rm = TRUE)
print(avg_rating)
avg_opponent_rating[i] <- avg_rating
} else {
avg_opponent_rating[i] <- NA
}
}
## [1] 1605.286
## [1] 1469.286
## [1] 1563.571
## [1] 1573.571
## [1] 1500.857
## [1] 1518.714
## [1] 1372.143
## [1] 1468.429
## [1] 1523.143
## [1] 1554.143
## [1] 1467.571
## [1] 1506.167
## [1] 1497.857
## [1] 1515
## [1] 1483.857
## [1] 1385.8
## [1] 1498.571
## [1] 1480
## [1] 1426.286
## [1] 1410.857
## [1] 1470.429
## [1] 1300.333
## [1] 1213.857
## [1] 1357
## [1] 1363.286
## [1] 1506.857
## [1] 1221.667
## [1] 1522.143
## [1] 1313.5
## [1] 1144.143
## [1] 1259.857
## [1] 1378.714
## [1] 1276.857
## [1] 1375.286
## [1] 1149.714
## [1] 1388.167
## [1] 1384.8
## [1] 1539.167
## [1] 1429.571
## [1] 1390.571
## [1] 1248.5
## [1] 1149.857
## [1] 1106.571
## [1] 1327
## [1] 1152
## [1] 1357.714
## [1] 1392
## [1] 1355.8
## [1] 1285.8
## [1] 1296
## [1] 1356.143
## [1] 1494.571
## [1] 1345.333
## [1] 1206.167
## [1] 1406
## [1] 1414.4
## [1] 1363
## [1] 1391
## [1] 1319
## [1] 1330.2
## [1] 1327.286
## [1] 1186
## [1] 1350.2
## [1] 1263
final_df$Avg_Opponent_Rating <- avg_opponent_rating
print(head(final_df))
## Player_Name Player_State Total_Points Pre_Rating Round1_Opponent
## 1 GARY HUA ON 6.0 1794 39
## 2 DAKSHESH DARURI MI 6.0 1553 63
## 3 ADITYA BAJAJ MI 6.0 1384 8
## 4 PATRICK H SCHILLING MI 5.5 1716 23
## 5 HANSHI ZUO MI 5.5 1655 45
## 6 HANSEN SONG OH 5.0 1686 34
## Round1_Result Round2_Opponent Round2_Result Round3_Opponent Round3_Result
## 1 W 21 W 18 W
## 2 W 58 W 4 L
## 3 L 61 W 25 W
## 4 W 28 D 2 W
## 5 W 37 W 12 D
## 6 W 29 D 11 L
## Round4_Opponent Round4_Result Round5_Opponent Round5_Result Round6_Opponent
## 1 14 W 7 W 12
## 2 17 W 16 W 20
## 3 21 W 11 W 13
## 4 26 W 5 D 19
## 5 13 D 4 D 14
## 6 35 W 10 D 27
## Round6_Result Round7_Opponent Round7_Result Avg_Opponent_Rating
## 1 D 4 D 1605.286
## 2 W 7 W 1469.286
## 3 W 12 W 1563.571
## 4 W 1 D 1573.571
## 5 W 17 W 1500.857
## 6 W 21 W 1518.714
Writing my dataframe to csv file
write_csv(final_df, "D:\\CUNY Data Science\\607 - Statistics and Probability for Data Analytics\\output_data\\chess_tournament_averages.csv")