# Read the raw tournament text file from GitHub, one element per line
file_path <- "https://raw.githubusercontent.com/Badigun/Data-607-Assignments/refs/heads/main/chess%20tournament%20file.txt"
tournament_data <- read_lines(file = file_path)
# Peek at the first six lines of the raw text
head(tournament_data, n = 6L)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
# Peek at the last six lines of the raw text
tail(tournament_data, n = 6L)
## [1] " 63 | THOMAS JOSEPH HOSMER |1.0 |L 2|L 48|D 49|L 43|L 45|H |U |"
## [2] " MI | 15057092 / R: 1175 ->1125 | |W |B |W |B |B | | |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 64 | BEN LI |1.0 |L 22|D 30|L 31|D 49|L 46|L 42|L 54|"
## [5] " MI | 15006561 / R: 1163 ->1112 | |B |W |W |B |W |B |B |"
## [6] "-----------------------------------------------------------------------------------------"
I observe that the data follows a specific pattern. The first four lines contain header information rather than data, so they are excluded from the dataset. After that, each player's record repeats every three lines: one line with the player's details and round results, one line with the state and rating details, and a dashed separator line. The data is then organized into two matrices, one for player information and the other for their game statistics, to structure it like a more conventional dataset.
Each player has two lines in the file:
Line 1: Name, total points, and opponent numbers.
Line 2: State and pre-rating.
# Line 1 of each record: name, total points, and opponent numbers
edit_tournament_data <- matrix(unlist(tournament_data), byrow = TRUE)
# Records repeat every 3 lines; the first name line sits at position 5
m1 <- edit_tournament_data[
  seq(from = 5, to = length(edit_tournament_data), by = 3)
]
head(m1, n = 6L)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
Another way to extract the Line 1 player information (name, total points, and opponent numbers):
# Identify lines that contain player information: these are the lines that
# begin (after optional padding) with the numeric pair number.
player_lines <- tournament_data[grep("^\\s*\\d+", tournament_data)]

# Extract each player's name, total points, and opponent pair numbers.
# After splitting a player line on "|" the fields are:
#   [1] pair number, [2] player name, [3] total points, [4..] one per round.
players_info <- lapply(player_lines, function(line) {
  elements <- trimws(unlist(strsplit(line, "\\|")))
  # Every field after the first three is a round result such as "W 39".
  # Starting at field 6 would silently drop the opponents of rounds 1 and 2.
  round_fields <- tail(elements, -3)
  list(
    name = elements[2],
    total_points = as.numeric(elements[3]),
    # Rounds without an opponent number (e.g. "H" or "U") contain no digits
    # and therefore contribute nothing here.
    opponents = unlist(str_extract_all(round_fields, "\\d+"))
  )
})

# Convert to a data frame. vapply() fixes the type and length of each column
# even when there are no player lines at all.
players_df <- data.frame(
  Name = vapply(players_info, `[[`, character(1), "name"),
  Points = vapply(players_info, `[[`, numeric(1), "total_points"),
  # Store the opponent numbers as one comma-separated string per player
  Opponents = vapply(
    players_info,
    function(x) paste(x[["opponents"]], collapse = ","),
    character(1)
  ),
  stringsAsFactors = FALSE
)
head(players_df)
## Name Points Opponents
## 1 GARY HUA 6.0 39,21,18,14,7,12,4
## 2 DAKSHESH DARURI 6.0 63,58,4,17,16,20,7
## 3 ADITYA BAJAJ 6.0 8,61,25,21,11,13,12
## 4 PATRICK H SCHILLING 5.5 23,28,2,26,5,19,1
## 5 HANSHI ZUO 5.5 45,37,12,13,4,14,17
## 6 HANSEN SONG 5.0 34,29,11,35,10,27,21
The following code extracts Line 2: state and pre-rating.
# Line 2 of each record starts at position 6 and repeats every 3 lines
m2 <- edit_tournament_data[
  seq(from = 6, to = length(edit_tournament_data), by = 3)
]
head(m2, n = 6L)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
Another way to extract Line 2: State and pre-rating
# Find the second line of each player's entry (contains state & rating)
rating_lines <- tournament_data[grep("R:\\s*\\d+", tournament_data)]

# Extract state and pre-rating from each rating line.
# After splitting on "|": [1] state, [2] "<USCF ID> / R: <pre> -><post>".
ratings_info <- lapply(rating_lines, function(line) {
  elements <- trimws(unlist(strsplit(line, "\\|")))
  list(
    # First two characters of the first field are the state code
    state = substr(elements[1], 1, 2),
    # Use the same "R:\\s*\\d+" pattern that selected the line, then keep only
    # the digits. A lookbehind fixed to exactly one space would return NA for
    # any rating preceded by zero or several spaces.
    pre_rating = as.numeric(
      str_extract(str_extract(elements[2], "R:\\s*\\d+"), "\\d+")
    )
  )
})

# Add to the data frame; vapply() guarantees one value of the right type
# per rating line.
players_df$State <- vapply(ratings_info, `[[`, character(1), "state")
players_df$Pre_Rating <- vapply(ratings_info, `[[`, numeric(1), "pre_rating")
Now that the data is better organized, the individual fields can be extracted.
# Work on plain character vectors from here on
m1 <- as.character(m1)
m2 <- as.character(m2)

# Pair number: the first run of digits on the name line
ID <- as.numeric(str_extract(m1, "\\d+"))

# Player name: take up to 33 characters starting at the first letter, keep the
# part that ends in a run of two or more spaces, then trim the padding
Name <- str_trim(
  str_extract(
    str_extract(m1, "[A-Za-z].{1,32}"),
    ".+\\s{2,}"
  )
)

# State: the first pair of consecutive uppercase letters on the rating line
State <- str_extract(m2, "[A-Z]{2}")

# Total points: the first number written with a single decimal digit
TotalNumberofPoints <- as.numeric(str_extract(m1, "\\d+\\.\\d"))

# Pre-rating: isolate the text from "R:" through a following "-", then read
# the first run of up to four digits inside it
PreRating <- as.numeric(
  str_extract(str_extract(m2, "R:.{8,}-"), "\\d{1,4}")
)

# Opponent numbers: find each round result (one uppercase letter, two or more
# spaces, digits) and then keep only the digits.
# NOTE(review): the inner call returns a matrix; passing a matrix to the outer
# call appears to flatten it, so `Rounds` may not keep one row per player --
# confirm the resulting shape before relying on it.
Rounds <- str_extract_all(
  str_extract_all(m1, "[A-Z]\\s{2,}\\d+", simplify = TRUE),
  "\\d+",
  simplify = TRUE
)
Compute the average pre-rating of each player's opponents using the `players_df` data frame built above.
# Check the structure of the dataframe before applying the function.
# str() prints the structure itself and returns NULL, so wrapping it in
# print() would only add a stray "NULL" line to the output.
str(players_df) # Ensure 'Opponents' column exists
## 'data.frame': 64 obs. of 5 variables:
## $ Name : chr "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" "PATRICK H SCHILLING" ...
## $ Points : num 6 6 6 5.5 5.5 5 5 5 5 5 ...
## $ Opponents : chr "18,14,7,12,4" "4,17,16,20,7" "25,21,11,13,12" "2,26,5,19,1" ...
## $ State : chr "ON" "MI" "MI" "MI" ...
## $ Pre_Rating: num 1794 1553 1384 1716 1655 ...
# Compute the average pre-rating of a player's opponents.
#
# Arguments:
#   opponents   A single string of comma-separated opponent pair numbers
#               (e.g. "39,21,18"); "" or NA means no opponents are recorded.
#   players_df  A data frame with a numeric `Pre_Rating` column. Opponent
#               numbers are used as row positions, so row i must hold the
#               player whose pair number is i.
#
# Returns:
#   The mean opponent pre-rating rounded to one decimal place, or NA_real_
#   when the opponent string is missing, empty or unparsable, or when none of
#   the listed opponents has a usable rating.
calculate_avg_opponent_rating <- function(opponents, players_df) {
  # Test for NA first so the comparison with "" never sees a missing value
  if (is.na(opponents) || opponents == "") {
    return(NA_real_)
  }

  # Convert the opponent list (stored as a string) into a numeric vector
  opponent_numbers <- as.numeric(unlist(strsplit(opponents, ",")))
  if (length(opponent_numbers) == 0 || any(is.na(opponent_numbers))) {
    return(NA_real_)
  }

  # match() maps any number outside 1..nrow(players_df) to NA instead of
  # indexing out of range
  row_index <- match(opponent_numbers, seq_len(nrow(players_df)))
  opponent_ratings <- players_df$Pre_Rating[row_index]

  # Drop unknown ratings; with nothing left, report NA rather than the NaN
  # that mean() of an empty vector would give
  opponent_ratings <- opponent_ratings[!is.na(opponent_ratings)]
  if (length(opponent_ratings) == 0) {
    return(NA_real_)
  }

  round(mean(opponent_ratings), 1)
}
# Compute the average opponent pre-rating for every player; the result is a
# list with one element per entry of the 'Opponents' column
avg_ratings <- lapply(
  players_df$Opponents,
  calculate_avg_opponent_rating,
  players_df = players_df
)
# Sanity-check the result
print(length(avg_ratings)) # Should print the number of players (64)
## [1] 64
print(head(avg_ratings)) # Check the first few elements
## [[1]]
## [1] 1647.6
##
## [[2]]
## [1] 1638.6
##
## [[3]]
## [1] 1669.8
##
## [[4]]
## [1] 1629
##
## [[5]]
## [1] 1656.8
##
## [[6]]
## [1] 1526
# Attach the averages only when there is exactly one per player
if (length(avg_ratings) == nrow(players_df)) {
  players_df$Avg_Opp_Pre_Rating <- unlist(avg_ratings)
} else {
  warning("The number of calculated ratings does not match the number of players!")
}
# Check the updated dataframe. str() prints the structure itself and returns
# NULL, so wrapping it in print() would only add a stray "NULL" line.
str(players_df)
## 'data.frame': 64 obs. of 6 variables:
## $ Name : chr "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" "PATRICK H SCHILLING" ...
## $ Points : num 6 6 6 5.5 5.5 5 5 5 5 5 ...
## $ Opponents : chr "18,14,7,12,4" "4,17,16,20,7" "25,21,11,13,12" "2,26,5,19,1" ...
## $ State : chr "ON" "MI" "MI" "MI" ...
## $ Pre_Rating : num 1794 1553 1384 1716 1655 ...
## $ Avg_Opp_Pre_Rating: num 1648 1639 1670 1629 1657 ...
# Drop the helper 'Opponents' column when present; warn if it is already gone
if (!("Opponents" %in% colnames(players_df))) {
  warning("Opponents column does not exist!")
} else {
  players_df <- select(players_df, -Opponents)
}
# Save the final table as a CSV file
write_csv(players_df, "ChessTournament_results.csv")