Project 1 Data 607

Project 1

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents. For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605. 1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.

Loading Data

# Location of the raw tournament text file on GitHub
file_path <- "https://raw.githubusercontent.com/Badigun/Data-607-Assignments/refs/heads/main/chess%20tournament%20file.txt"

# Read the file into a character vector, one element per line
tournament_data <- read_lines(file = file_path)

# Preview the first six lines to confirm the layout
head(tournament_data, n = 6L)

## [1] "-----------------------------------------------------------------------------------------" 
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [4] "-----------------------------------------------------------------------------------------" 
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

# View the last few lines of the data
tail(tournament_data)

## [1] "   63 | THOMAS JOSEPH HOSMER            |1.0  |L   2|L  48|D  49|L  43|L  45|H    |U    |"
## [2] "   MI | 15057092 / R: 1175   ->1125     |     |W    |B    |W    |B    |B    |     |     |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] "   64 | BEN LI                          |1.0  |L  22|D  30|L  31|D  49|L  46|L  42|L  54|"
## [5] "   MI | 15006561 / R: 1163   ->1112     |     |B    |W    |W    |B    |W    |B    |B    |"
## [6] "-----------------------------------------------------------------------------------------"

Edit Player information in the Data

The data follows a specific pattern. The first four lines contain header information rather than data, so they are excluded. After that, each player's details and game statistics repeat every three lines (two data lines followed by a separator line). The data is therefore organized into two character vectors, one holding the first line of each player record and the other holding the second line, to structure it like a more conventional dataset.

Each player occupies two lines in the file:

Line 1: Name, total points, and opponent numbers.

Line 2: State and pre-rating.

# Hold the raw lines in a one-column matrix so they can be picked by position
edit_tournament_data <- matrix(unlist(tournament_data), byrow = TRUE)

# First line of every player record (name, total points, opponent numbers):
# start at line 5 and take every third line from there
m1 <- edit_tournament_data[
  seq(from = 5, to = length(edit_tournament_data), by = 3)
]
head(m1)

## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [3] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [4] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
## [5] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"
## [6] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"

Another way to extract player information for Line 1: Name, total points, and opponent numbers

# Identify the first line of each player record: these are the lines that
# begin with optional whitespace followed by the numeric pair number.
player_lines <- tournament_data[grep("^\\s*\\d+", tournament_data)]

# Extract each player's name, total points, and opponent pair numbers.
# After splitting a line on "|" the fields are:
#   [1] pair number, [2] player name, [3] total points,
#   [4] onward: one field per round ("W  39", "D  12", "H", ...).
players_info <- lapply(player_lines, function(line) {
  elements <- trimws(unlist(strsplit(line, "\\|")))

  name <- elements[2]
  total_points <- as.numeric(elements[3])

  # Round fields start at position 4, so drop only the three leading fields.
  # (Starting at 6 would silently discard the opponents of rounds 1 and 2.)
  # Negative indexing is also safe when a line has fewer than four fields.
  round_fields <- elements[-(1:3)]

  # Rounds without an opponent contain no digits and contribute nothing.
  opponents <- unlist(str_extract_all(round_fields, "\\d+"))

  list(name, total_points, opponents)
})

# Assemble one row per player. vapply() pins the type of every column, and
# the opponent numbers are collapsed into a comma-separated string.
players_df <- data.frame(
  Name = vapply(players_info, `[[`, character(1), 1),
  Points = vapply(players_info, `[[`, numeric(1), 2),
  Opponents = vapply(
    players_info,
    function(x) paste(x[[3]], collapse = ","),
    character(1)
  ),
  stringsAsFactors = FALSE
)
head(players_df)

##                  Name Points      Opponents
## 1            GARY HUA    6.0   18,14,7,12,4
## 2     DAKSHESH DARURI    6.0   4,17,16,20,7
## 3        ADITYA BAJAJ    6.0 25,21,11,13,12
## 4 PATRICK H SCHILLING    5.5    2,26,5,19,1
## 5          HANSHI ZUO    5.5  12,13,4,14,17
## 6         HANSEN SONG    5.0 11,35,10,27,21

The following code extracts Line 2 of each player record: state and pre-rating.

# Second line of every player record (state and ratings):
# start at line 6 and take every third line from there
m2 <- edit_tournament_data[
  seq(from = 6, to = length(edit_tournament_data), by = 3)
]
head(m2)

## [1] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [2] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [3] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [4] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
## [5] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "   OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |"

Another way to extract Line 2: State and pre-rating

# Find the second line of each player's record (contains state and rating).
rating_lines <- tournament_data[grep("R:\\s*\\d+", tournament_data)]

# Extract the state and the pre-tournament rating from each of those lines.
ratings_info <- lapply(rating_lines, function(line) {
  elements <- trimws(unlist(strsplit(line, "\\|")))

  # After trimming, the first field starts with the two-letter state code.
  state <- substr(elements[1], 1, 2)

  # Use the same "R:" + optional whitespace + digits pattern as the line
  # filter above, then keep only the digits. A lookbehind requiring exactly
  # one space would return NA whenever the rating is padded with more than
  # one space (e.g. a shorter, right-aligned number).
  rating_field <- str_extract(elements[2], "R:\\s*\\d+")
  pre_rating <- as.numeric(str_extract(rating_field, "\\d+"))

  list(state, pre_rating)
})

# The rating lines must pair up one-to-one with the rows of players_df,
# otherwise the new columns would be attached to the wrong players.
stopifnot(length(ratings_info) == nrow(players_df))

# Add to DataFrame
players_df$State <- vapply(ratings_info, `[[`, character(1), 1)
players_df$Pre_Rating <- vapply(ratings_info, `[[`, numeric(1), 2)

Capturing The Data

Now that the data is better organized, the individual fields can be captured.

# Make sure m1 and m2 are plain character vectors
m1 <- as.character(m1)
m2 <- as.character(m2)

# Extract the ID (first run of digits on the line)
ID <- as.numeric(str_extract(m1, "\\d+"))

# Starting at the first letter, take that letter plus up to 32 following
# characters; this window covers the name and its trailing padding
Name <- str_extract(m1, "[A-Za-z].{1,32}")

# Keep everything up to the last run of two or more spaces, then trim, which
# leaves just the name
Name <- str_trim(str_extract(Name, ".+\\s{2,}"))

# Extract the state (first two consecutive uppercase letters) from m2
State <- str_extract(m2, "[A-Z]{2}")

# Extract the total number of points (first number with one decimal digit)
TotalNumberofPoints <- as.numeric(str_extract(m1, "\\d+\\.\\d"))

# Extract the pre-rating section: "R:", at least 8 characters, then "-"
PreRating <- str_extract(m2, "R:.{8,}-")

# The first run of up to four digits in that section is the pre-rating
PreRating <- as.numeric(str_extract(PreRating, "\\d{1,4}"))

# Extract every played game (1 letter, 2+ spaces, opponent number).
# simplify = TRUE returns a character matrix with one row per player and one
# column per matched game, padded with "" where a player has fewer matches.
Rounds <- str_extract_all(m1, "[A-Z]\\s{2,}\\d+", simplify = TRUE)

# Keep only the opponent number of each game. str_extract() treats the matrix
# as a flat (column-major) vector, so the result is reshaped back to one row
# per player; padding cells become NA. Calling str_extract_all(simplify =
# TRUE) here instead would collapse everything into a single column and lose
# track of which numbers belong to which player.
Rounds <- matrix(
  str_extract(Rounds, "\\d+"),
  nrow = nrow(Rounds),
  ncol = ncol(Rounds)
)

Compute Average Opponent Pre-Rating

Compute the average pre-rating of each player's opponents using the players_df data frame built above.

# Check the structure of the dataframe before applying the function and
# ensure the 'Opponents' column exists. str() prints its summary itself and
# returns NULL, so wrapping it in print() would only add a stray "NULL".
str(players_df)

## 'data.frame':    64 obs. of  5 variables:
##  $ Name      : chr  "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" "PATRICK H SCHILLING" ...
##  $ Points    : num  6 6 6 5.5 5.5 5 5 5 5 5 ...
##  $ Opponents : chr  "18,14,7,12,4" "4,17,16,20,7" "25,21,11,13,12" "2,26,5,19,1" ...
##  $ State     : chr  "ON" "MI" "MI" "MI" ...
##  $ Pre_Rating: num  1794 1553 1384 1716 1655 ...
## NULL

# Function to compute the average pre-rating of opponents
#' Average pre-tournament rating of a player's opponents
#'
#' @param opponents Length-one character string of comma-separated opponent
#'   pair numbers (e.g. "39,21,18"). "" or NA means there are no opponents.
#' @param players_df Data frame with a numeric `Pre_Rating` column. Opponent
#'   numbers are used as row positions into this data frame.
#' @return The mean opponent pre-rating rounded to one decimal place, or
#'   `NA_real_` when the input is missing or invalid, or when none of the
#'   opponents has a known rating.
calculate_avg_opponent_rating <- function(opponents, players_df) {
  # Test is.na() first so the string comparison never sees a missing value.
  if (is.na(opponents) || opponents == "") {
    return(NA_real_)
  }

  # Convert the comma-separated string into a numeric vector.
  opponent_numbers <- as.numeric(unlist(strsplit(opponents, ",")))

  # Any token that is not a number makes the whole entry invalid.
  if (length(opponent_numbers) == 0 || anyNA(opponent_numbers)) {
    return(NA_real_)
  }

  # match() against the valid row positions turns out-of-range numbers into
  # NA instead of letting them index the column in unexpected ways.
  row_idx <- match(opponent_numbers, seq_len(nrow(players_df)))
  opponent_ratings <- players_df$Pre_Rating[row_idx]

  # mean(..., na.rm = TRUE) over nothing but NA would give NaN; report a
  # proper missing value instead, consistent with the guards above.
  if (all(is.na(opponent_ratings))) {
    return(NA_real_)
  }

  round(mean(opponent_ratings, na.rm = TRUE), 1)
}

# Compute the average opponent rating for every entry of 'Opponents';
# players_df is forwarded to the function as its lookup table
avg_ratings <- lapply(
  players_df$Opponents,
  calculate_avg_opponent_rating,
  players_df = players_df
)

# Sanity check: there should be one result per player (64)
print(length(avg_ratings))

## [1] 64

print(head(avg_ratings))    # Spot-check the first few computed averages

## [[1]]
## [1] 1647.6
## 
## [[2]]
## [1] 1638.6
## 
## [[3]]
## [1] 1669.8
## 
## [[4]]
## [1] 1629
## 
## [[5]]
## [1] 1656.8
## 
## [[6]]
## [1] 1526

# Attach the averages as a new column only when there is exactly one value
# per player; otherwise leave the data frame untouched and warn
if (length(avg_ratings) != nrow(players_df)) {
  warning("The number of calculated ratings does not match the number of players!")
} else {
  players_df$Avg_Opp_Pre_Rating <- unlist(avg_ratings)
}

# Check the updated dataframe. str() prints its summary itself and returns
# NULL, so wrapping it in print() would only add a stray "NULL".
str(players_df)

## 'data.frame':    64 obs. of  6 variables:
##  $ Name              : chr  "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" "PATRICK H SCHILLING" ...
##  $ Points            : num  6 6 6 5.5 5.5 5 5 5 5 5 ...
##  $ Opponents         : chr  "18,14,7,12,4" "4,17,16,20,7" "25,21,11,13,12" "2,26,5,19,1" ...
##  $ State             : chr  "ON" "MI" "MI" "MI" ...
##  $ Pre_Rating        : num  1794 1553 1384 1716 1655 ...
##  $ Avg_Opp_Pre_Rating: num  1648 1639 1670 1629 1657 ...
## NULL

# Drop the helper 'Opponents' column when it is present; warn when it is not
if (!("Opponents" %in% colnames(players_df))) {
  warning("Opponents column does not exist!")
} else {
  players_df <- select(players_df, -Opponents)
}

Writing/Saving Data to CSV

# Write the final per-player table to a CSV file in the working directory
write_csv(players_df, "ChessTournament_results.csv")