607-Project1

Loading the given Text File from raw data in GitHub repository

# Load the necessary library
library(stringr)
library(readr)
library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Read the tournament file straight from the GitHub repository. The first
# three lines are skipped and no header row is used, so columns get the
# default names (V1, ...).
tournament_url <- "https://raw.githubusercontent.com/LwinShwe/607Project1/main/tournament.txt"
chess <- read.delim(
  tournament_url,
  skip = 3,
  header = FALSE,
  stringsAsFactors = FALSE
)

# Preview the first few rows of the data
head(chess)

##                                                                                          V1
## 1 -----------------------------------------------------------------------------------------
## 2     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 3    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 4 -----------------------------------------------------------------------------------------
## 5     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## 6    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |

Organizing Data

The raw data is cleaned by removing extra lines, eliminating empty lines, and stripping the hyphen (“-”) characters from the text in the V1 column. A regular expression that matches at least a first and a last name is used to extract the player names from the V1 column, and empty strings are removed. In addition, player states are retrieved with a pattern that matches two capital letters followed by a space and a pipe symbol (“|”). The total number of points is extracted with a pattern that captures all decimal numbers, and blank entries are removed.

# Strip every hyphen (and any embedded newline) from the V1 column so the
# dashed separator rows collapse to empty strings
chess$V1 <- str_replace_all(chess$V1, "-|\\n", "")

# Extract player names with a regular expression: two or more words, each
# separated by a single whitespace character. Lines without a match only
# contribute "" padding to the match matrix, which is discarded.
name_pattern <- "\\w+[[:space:]]\\w+([[:space:]]\\w+)*"
name_matches <- str_extract_all(chess$V1, name_pattern, simplify = TRUE)
PlayerNames <- name_matches[name_matches != ""]
head(PlayerNames)

## [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
## [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"

# Extract player states: the two capital letters that open a line (after
# optional leading whitespace) and are followed by one whitespace character
# and a pipe symbol, e.g. "ON |". Only the letters are captured, so no
# separate step is needed to strip the pipe. Anchoring at the start of the
# line keeps a player name that happens to end one space before a pipe from
# being mistaken for a state.
state_matches <- str_match(chess$V1, "^[[:space:]]*([A-Z]{2})[[:space:]]\\|")

# Column 2 holds the captured group; lines without a state yield NA
PlayerStates <- state_matches[, 2]
PlayerStates <- PlayerStates[!is.na(PlayerStates)]
head(PlayerStates)

## [1] "ON" "MI" "MI" "MI" "MI" "OH"

# Total points: every decimal number (digits, a dot, digits) found in the V1
# column. Lines without a match only contribute "" padding to the match
# matrix, which is discarded.
points_matches <- str_extract_all(chess$V1, "\\b\\d+\\.\\d+\\b", simplify = TRUE)
TotalPoints <- points_matches[points_matches != ""]
head(TotalPoints)

## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0"

# Extract each player's pre-rating: the run of digits that follows the literal
# "R:" marker and any whitespace. Matching the literal "R:" (rather than the
# character class [R:], which accepts a lone "R" or ":") means the capture can
# only come from the rating field, and capturing just the leading digits leaves
# out any provisional suffix such as "P17". Ratings of any digit count are kept.
rating_matches <- str_match(chess$V1, "R:[[:space:]]*([[:digit:]]+)")

# Column 2 holds the captured digits; lines without a rating yield NA
PreRatings <- rating_matches[, 2]
PreRatings <- as.numeric(PreRatings[!is.na(PreRatings)])
head(PreRatings)

## [1] 1794 1553 1384 1716 1655 1686

# Extract the opponent strings: the run of pipe-delimited round cells, each a
# capital letter followed by whitespace and an optional opponent number.
# The result is a character matrix with one row per line of V1, padded with "".
opponent_pattern <- "([\\|][A-Z]([[:space:]]+)\\d*[\\|])([A-Z]([[:space:]]+)\\d*[\\|])*"
OpponentData <- str_extract_all(chess$V1, opponent_pattern, simplify = TRUE)

# Pull the individual opponent indexes out of each line's round cells into a
# matrix (one row per line, one column per opponent number, "" padding).
# Only the first match column is used so rows stay aligned with the lines.
Opponents <- str_extract_all(OpponentData[, 1], "\\d+", simplify = TRUE)

# Keep only the rows that contain at least one opponent index. drop = FALSE
# keeps the result a matrix even when a single row remains.
has_opponent <- rowSums(Opponents != "") > 0
Opponents <- Opponents[has_opponent, , drop = FALSE]

Calculate Average Pre Chess Rating of Opponents

# Average pre-rating of each player's opponents.
# Each row of Opponents holds the opponent indexes for one player, padded with
# "" where there is no index; an index is used directly as a position in
# PreRatings. vapply over seq_len() builds the result in one pass (no vector
# growing inside a loop) and returns an empty vector when Opponents has no rows.
AvgPreRaing <- vapply(
  seq_len(nrow(Opponents)),
  function(row) {
    ids <- Opponents[row, ]
    ids <- as.integer(ids[ids != ""]) # drop the "" padding, then convert
    # na.rm = TRUE ignores indexes that have no corresponding pre-rating
    mean(PreRatings[ids], na.rm = TRUE)
  },
  numeric(1)
)

# Display average opponent ratings
head(AvgPreRaing)

## [1] 1605.286 1469.286 1563.571 1573.571 1500.857 1518.714

Build a cleaned and organized data frame of “tournament.txt” given in the project 1

# Every extracted vector must describe the same set of players. data.frame()
# silently recycles vectors whose lengths divide evenly, so a misaligned
# extraction is rejected here instead of producing a wrong table.
field_lengths <- c(
  length(PlayerNames),
  length(PlayerStates),
  length(TotalPoints),
  length(PreRatings),
  length(AvgPreRaing)
)
if (length(unique(field_lengths)) != 1L) {
  stop(
    "Extracted fields differ in length: ",
    paste(field_lengths, collapse = ", "),
    call. = FALSE
  )
}

# Create the Tournament Results data frame, one row per player.
# stringsAsFactors = FALSE keeps the text columns as character on every R version.
Results <- data.frame(
  "Player Name" = PlayerNames,
  "Player State" = PlayerStates,
  "Total Number of Points" = TotalPoints,
  "Player's Pre-Rating" = PreRatings,
  "Average Pre Chess Rating of Opponents" = AvgPreRaing,
  stringsAsFactors = FALSE
)

# Write the results to CSV, replacing any existing file.
# The previous call passed a positional TRUE, which is taken as write.table's
# `append` argument and triggered the "appending column names to file"
# warning; it is omitted so each run writes a fresh file.
write.csv(Results, file = "game-results.csv")

## Warning in utils::write.table(Results, "game-results.csv", TRUE, col.names =
## NA, : appending column names to file

The “Results” data frame has 64 observations and 5 variables, and it is exported as the file game-results.csv.

607-Project1

Lwin Shwe

2023-09-22

Loading the given Text File from raw data in GitHub repository

Organizing Data

Calculate Average Pre Chess Rating of Opponents

Build a cleaned and organized data frame of “tournament.txt” given in the project 1