# Load the necessary library
library(stringr)
library(readr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Download the raw tournament cross-table. The first three lines of the file
# are skipped and no header row is assumed, so the text lands in the default
# column V1.
chess <- read.delim(
  "https://raw.githubusercontent.com/LwinShwe/607Project1/main/tournament.txt",
  header = FALSE,
  skip = 3,
  stringsAsFactors = FALSE
)
# Preview the first rows to check how the text was read in
head(chess)
## V1
## 1 -----------------------------------------------------------------------------------------
## 2 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 3 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## 4 -----------------------------------------------------------------------------------------
## 5 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## 6 MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
The raw data is cleaned by blanking out the hyphen-only separator lines, dropping empty strings, and stripping the “-” hyphens from the text in the V1 column. A regular expression is used to extract the player names from the V1 column; it matches at least a first and a last name, and empty strings are removed. In addition, the player states are retrieved by matching two capital letters followed by a space and a pipe symbol “|”. The total number of points is extracted with a pattern that captures decimal numbers, and blank entries are removed.
# Extract the player names BEFORE the hyphens are stripped from V1, so that a
# hyphenated name (e.g. "SMITH-JONES") keeps its hyphen instead of being fused
# into a single word. A name is two or more words separated by single
# whitespace characters; a word may contain internal hyphens. The matches are
# unlisted line by line, which keeps them in file order and leaves no empty
# padding strings to filter out.
PlayerNames <- unlist(str_extract_all(
  chess$V1,
  paste0(
    "\\w+(?:-\\w+)*",                 # first word
    "(?:[[:space:]]\\w+(?:-\\w+)*)+"  # one or more further words
  )
))
# Remove hyphens ("-") and newlines from the V1 column; this blanks out the
# separator lines and turns the "->" between ratings into ">".
chess$V1 <- str_replace_all(chess$V1, "-|\\n", "")
head(PlayerNames)
## [1] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ"
## [4] "PATRICK H SCHILLING" "HANSHI ZUO" "HANSEN SONG"
# Player states: grab every two-capital-letter code that is immediately
# followed by one whitespace character and a pipe, cut that trailing
# separator off again, and discard any blank entries.
PlayerStates <- str_remove(
  unlist(str_extract_all(chess$V1, "[A-Z][A-Z][[:space:]]\\|")),
  "[[:space:]]\\|"
)
PlayerStates <- PlayerStates[nzchar(PlayerStates)]
head(PlayerStates)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
# Total points: pull the decimal numbers (digits, a dot, digits) out of V1.
# With simplify = TRUE, lines without a match are padded with "" in the
# returned character matrix; those blanks are dropped, leaving a plain
# character vector.
TotalPoints <- str_extract_all(
  chess$V1,
  "\\b\\d+\\.\\d+\\b",
  simplify = TRUE
)
TotalPoints <- TotalPoints[nzchar(TotalPoints)]
head(TotalPoints)
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0"
# Player pre-ratings: the run of digits that follows the literal "R:" (after
# optional whitespace). Anything trailing the digits, such as a provisional
# suffix like "P17", is not part of the capture group.
# str_match() returns one row per input line: column 1 is the whole match,
# column 2 the captured digits, and NA where the line has no "R:" field.
# Matching the literal "R:" (rather than the character class [R:]) avoids
# picking up name fragments, and ratings of any number of digits are kept.
PreRatings <- str_match(chess$V1, "R:[[:space:]]*(\\d+)")[, 2]
PreRatings <- as.numeric(PreRatings[!is.na(PreRatings)])
head(PreRatings)
## [1] 1794 1553 1384 1716 1655 1686
# Extract the per-round result fields: a run of "|<letter><spaces><digits>|"
# cells, where the digits (the opponent's pair number) may be absent.
OpponentData <- str_extract_all(
  chess$V1,
  "([\\|][A-Z]([[:space:]]+)\\d*[\\|])([A-Z]([[:space:]]+)\\d*[\\|])*",
  simplify = TRUE
)
# Pull the opponent pair numbers out of each run into a character matrix with
# one row per input line and one column per number found ("" pads short rows).
Opponents <- str_extract_all(OpponentData[, ], "\\d+", simplify = TRUE)
# Keep only the rows that contain at least one opponent number. drop = FALSE
# keeps the result a matrix even if a single row survives, so nrow() and
# row indexing keep working downstream.
# NOTE(review): rows are later matched to players by position, so a player
# with no numbered opponent at all would be dropped here and shift the rows
# that follow -- confirm this cannot occur in the input.
Opponents <- Opponents[
  rowSums(Opponents == "") != ncol(Opponents), ,
  drop = FALSE
]
# Average pre-rating of each player's opponents, one value per row of
# Opponents. vapply() over seq_len() builds the result in one pass and is
# safe when Opponents has zero rows (1:nrow() would iterate over 1 and 0).
AvgPreRaing <- vapply(
  seq_len(nrow(Opponents)),
  function(row) {
    # Opponent pair numbers for this row, skipping the "" padding cells
    indexes <- as.integer(Opponents[row, Opponents[row, ] != ""])
    # Pair numbers are used as positions in PreRatings; the mean ignores NA
    # and is NaN when the row has no opponents
    mean(PreRatings[indexes], na.rm = TRUE)
  },
  numeric(1)
)
# Display average opponent ratings
head(AvgPreRaing)
## [1] 1605.286 1469.286 1563.571 1573.571 1500.857 1518.714
# Create the Tournament Results data frame, one row per player.
# All five vectors must have the same length; otherwise data.frame() could
# silently recycle a shorter vector and misalign players with their values.
stopifnot(
  length(unique(lengths(list(
    PlayerNames, PlayerStates, TotalPoints, PreRatings, AvgPreRaing
  )))) == 1L
)
# With the default check.names = TRUE, data.frame() converts these labels to
# syntactic column names (e.g. "Player Name" becomes Player.Name).
Results <- data.frame(
  "Player Name" = PlayerNames,
  "Player State" = PlayerStates,
  "Total Number of Points" = TotalPoints,
  "Player's Pre-Rating" = PreRatings,
  "Average Pre Chess Rating of Opponents" = AvgPreRaing
)
# Write the CSV (row names included, the write.csv() default). No third
# positional argument is passed: it would reach write.table() as
# append = TRUE, so every re-run would append another copy of the table to
# the existing file and raise an "appending column names to file" warning.
write.csv(Results, "game-results.csv")
The “Results” data frame has 64 observations and 5 variables, and it is exported to the file game-results.csv.