Data 607 Project 1

In Project 1 for Data 607, a text file with chess tournament results is given. For the assignment, the text file first had to be transformed into organized data that could then be written to a .csv file.

Load libraries.

library(RCurl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ ggplot2 3.3.6     ✔ readr   2.1.2
## ✔ tibble  3.1.7     ✔ purrr   0.3.4
## ✔ tidyr   1.2.0     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::complete() masks RCurl::complete()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ dplyr::lag()      masks stats::lag()

Reading from a url text file.

# Raw GitHub copy of the tournament cross-table.
url <- "https://raw.githubusercontent.com/melbow2424/Data-607--Project-1/main/tournamentinfo.txt"
# read_tsv() finds no tab delimiters in this file, so it returns a one-column
# character tibble; the first line of dashes is consumed as the column name.
tournament <- read_tsv(file = url)

## Rows: 195 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): -------------------------------------------------------------------...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Show the first rows of the raw, single-column tibble.
print(x = tournament)

## # A tibble: 195 × 1
##    ---------------------------------------------------------------------------…¹
##    <chr>                                                                        
##  1 Pair | Player Name                     |Total|Round|Round|Round|Round|Round|…
##  2 Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |…
##  3 ----------------------------------------------------------------------------…
##  4 1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  …
##  5 ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B …
##  6 ----------------------------------------------------------------------------…
##  7 2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  …
##  8 MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W …
##  9 ----------------------------------------------------------------------------…
## 10 3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  …
## # … with 185 more rows, and abbreviated variable name
## #   ¹`-----------------------------------------------------------------------------------------`
## # ℹ Use `print(n = ...)` to see more rows

Here, I went from text file info right into finding the info needed for the .csv file.

The matches returned by the regex functions had to be un-listed because, when a list is added to the data frame, the list contents are printed as header information.

# Getting State:
# Match the two-letter code that sits directly before a " |" separator.
# The text column is pulled out with [[1]] because stringr expects a character
# vector; handing it the whole tibble forces a coercion (with a warning) and
# runs the regex on a deparsed copy of the data.
url_state <- str_extract_all(tournament[[1]], "([A-Z][A-Z])\\s\\|")
# Flatten the per-line list of matches into one vector for the data frame.
state <- unlist(url_state)

# Getting Total Number of Points:
# Match every "digit.digit" token (e.g. "6.0"). The text column is passed as
# a character vector via [[1]] so stringr does not have to coerce the tibble
# (which raises a warning and searches a deparsed copy of the data).
url_points <- str_extract_all(tournament[[1]], "\\d\\.\\d")
# Flatten the per-line list of matches into one vector for the data frame.
points <- unlist(url_points)

# Getting Player's Pre-Rating:
# Match "R", any one character, whitespace, then the rating digits
# (e.g. "R: 1794"). The text column is passed as a character vector via [[1]]
# so stringr does not have to coerce the tibble (which raises a warning and
# searches a deparsed copy of the data).
url_rating <- str_extract_all(tournament[[1]], "R.\\s+\\d+")
# Flatten the per-line list of matches into one vector for the data frame.
rating <- unlist(url_rating)

Isolating the 64 player names required more data extraction and removal than the other info needed in the csv file.

# Getting Player Name Only:
# Pull every run of up to four upper-case words joined by spaces or hyphens.
# The text column is passed as a character vector via [[1]] so stringr does
# not have to coerce the tibble (which raises a warning and searches a
# deparsed copy of the data).
url_extract <- str_extract_all(
  tournament[[1]],
  "(\\s[A-Z]+\\s[A-Z]+)?((\\s|-)[A-Z]+)?(\\s|-)[A-Z]+"
)
name_tokens <- unlist(url_extract)

# The pattern also picks up non-name fragments (" R", " P", " N", " USCF ID").
# Strip the first occurrence of each, in this order, so those fragments
# collapse to empty strings.
# NOTE: the removal is not anchored, so player names containing one of these
# fragments lose a letter as well (e.g. " PATRICK" becomes "ATRICK"); the
# name-cleaning step later in the script restores them and relies on exactly
# this output.
for (fragment in c(" R", " P", " N", " USCF ID")) {
  name_tokens <- str_remove(name_tokens, pattern = fragment)
}

# Keep only the non-empty entries, i.e. the player names.
remove_url_last <- name_tokens[name_tokens != ""]

Once the column information was isolated, it was placed into a data frame.

# Assemble the four extracted vectors into one data frame, one row per player.
df <- data.frame(
  player_name = remove_url_last,
  state = state,
  points = points,
  rating = rating
)
# Preview the first rows.
head(df)

##          player_name state points  rating
## 1           GARY HUA  ON |    6.0 R: 1794
## 2    DAKSHESH DARURI  MI |    6.0 R: 1553
## 3       ADITYA BAJAJ  MI |    6.0 R: 1384
## 4 ATRICK H SCHILLING  MI |    5.5 R: 1716
## 5         HANSHI ZUO  MI |    5.5 R: 1655
## 6        HANSEN SONG  OH |    5.0 R: 1686

As shown, there was information that needed correcting in the data frame. In the player names column, some letters were missing because of how the cleaning of the text file was approached in the beginning. The state column had the extra element " |", which needed to be removed, and in the rating column the element "R: " needed to be removed.

# Cleaning State by getting rid of " |".
# Remove the pipe together with the whitespace in front of it; stripping only
# the pipe would leave a trailing space on every state code.
df_state_cleaning <- str_remove(df$state, pattern = "\\s*\\|")

# Cleaning Rating by getting rid of "R: ".
# The extraction pattern allows more than one whitespace character before the
# digits, so remove all of it; a single "\\s" could leave a leading space on
# the rating.
df_rate_cleaning <- str_remove(df$rating, pattern = "R.\\s+")

# Cleaning Names in Player Names
# Each extracted name still carries the leading whitespace matched by the
# extraction pattern, so trim it before patching.
df_player_trimmed <- str_trim(df$player_name)

# Restore the letters that the earlier " R" / " P" / " N" removal clipped out
# of real names. Names are regex patterns, values are their replacements.
# Patterns for a clipped first letter are anchored with "^" so they can never
# match inside a name that is already correct.
name_fixes <- c(
  "^ATRICK" = "PATRICK",
  "JOSHUAHILIP" = "JOSHUA PHILIP",
  "AMIYATOSHWNANANDAM" = "AMIYATOSH PWNANANDAM",
  "MIKEIKITIN" = "MIKE NIKITIN",
  "ANVITAO" = "ANVIT RAO",
  "^ONALD GRZEGORCZYK" = "RONALD GRZEGORCZYK",
  "DIPANKAROY" = "DIPANKAR ROY",
  "MICHAEL ALDRICH" = "MICHAEL R ALDRICH",
  "^ISHI SHETTY" = "RISHI SHETTY",
  "JOEL HENDON" = "JOEL R HENDON",
  "MARISAICCI" = "MARISA RICCI",
  # NOTE(review): not in the original list; presumably clipped the same way
  # as the other names starting with "R". It is a no-op if the name is
  # absent -- confirm against the full output.
  "^OBERT GLEN VASEY" = "ROBERT GLEN VASEY"
)
df_player_clean_11 <- str_replace_all(df_player_trimmed, name_fixes)

Completed clean data frame:

# Rebuild the data frame from the cleaned name, state and rating vectors;
# the points vector is reused exactly as it was extracted.
df <- data.frame(
  player_name = df_player_clean_11,
  state = df_state_cleaning,
  points = points,
  rating = df_rate_cleaning
)

# Preview the first rows of the cleaned data.
head(df)

##           player_name state points rating
## 1            GARY HUA   ON     6.0   1794
## 2     DAKSHESH DARURI   MI     6.0   1553
## 3        ADITYA BAJAJ   MI     6.0   1384
## 4 PATRICK H SCHILLING   MI     5.5   1716
## 5          HANSHI ZUO   MI     5.5   1655
## 6         HANSEN SONG   OH     5.0   1686

To finish off the assignment, the data frame was then written into a .csv file.

# Write the cleaned results to disk. row.names = FALSE keeps the automatic
# row index out of the file, so the CSV holds only the four data columns.
write.csv(df, file = "ChessResults.csv", row.names = FALSE)

Data 607 Project 1

Melissa Bowman

2022-09-25