Introduction

In this project, the primary focus was on wrangling chess tournament data using regular expressions. The initial step involved pulling the raw data from GitHub, where the data was stored. Following this, the data underwent a series of transformations to ensure it was in a usable format. Notable attention was paid to extracting and organizing crucial information, such as the pre-ratings of players and their respective opponents. This step was particularly important, as it laid the foundation for the subsequent analysis. As the project neared its conclusion, a final dataframe was created, which included essential columns such as Player.Name, Player.State, Total.Pts, Pre.Rating, and Avg_Opp_PreRating. The dataframe was streamlined to eliminate any unnecessary columns, ensuring it was concise and focused. To ensure accessibility and data preservation, the final dataframe was stored in two formats: a MySQL database table provided ease of access for future analysis, while a CSV file served as a backup.

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## Warning: package 'RMySQL' was built under R version 4.3.2

Read data from the github

raw_data <- readLines("https://raw.githubusercontent.com/mandiemannz/data-607--Fall-18/master/project1txt", warn = F)
#raw_data

Remove all boarders with dash lines

#replace all dashes 
player_data = str_replace_all(raw_data,"-","") 
#player_data

Convert raw data into R frame and drop unnecessary columns

#read.delim function is used to read dato into R and connection object is created to interface to string "player data"
cleaned_data = read.delim(textConnection(player_data),header = F, sep = "|", stringsAsFactors = F)
# There is nothing in the column 11 so decided to drop that column
cleaned_data = cleaned_data[,-11]
#cleaned_data

Cleanup first two columns and concatenate rest characters to build up a header

#remove all white spaces from row 1 and 2 and concatenate them in 1
header <- make.names(str_sub(paste(trimws(as.character(cleaned_data[1,])), trimws(as.character(cleaned_data[2,]))), 1, 11))

Header

# Convert all rows excluding into tibble 
cleaned_data <- as_tibble(cleaned_data[-c(1, 2), ])

# Assign the head earlier made using make.names function.
names(cleaned_data) <- header
#cleaned_data

Each Player has covered 2 rows, extract State and player ID

# Identify indices for players and state ranks
player_id <- seq(1, nrow(cleaned_data), 2)
state_rank_id <- seq(2, nrow(cleaned_data), 2)

# Select rows for players and state ranks according to the plaeyer_id and state_rank vector earlier made
players <- cleaned_data[player_id, ]
state_ranks <- (select(cleaned_data[state_rank_id, ],State = Pair.Num, Pre.Rating = Player.Name))
# Combine the dataframes
combined_data <- cbind(players, state_ranks)

# Convert the combined dataframe to a tibble
players_tibble <- as_tibble(combined_data)

# Print the first 5 rows of the tibble
print(players_tibble[1:5, ])

## # A tibble: 5 × 12
##   Pair.Num Player.Name Total.Pts Round.1 Round.2 Round.3 Round.4 Round.5 Round.6
##   <chr>    <chr>       <chr>     <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
## 1 "    1 " " GARY HUA… "6.0  "   W  39   W  21   W  18   W  14   W   7   D  12  
## 2 "    2 " " DAKSHESH… "6.0  "   W  63   W  58   L   4   W  17   W  16   W  20  
## 3 "    3 " " ADITYA B… "6.0  "   L   8   W  61   W  25   W  21   W  11   W  13  
## 4 "    4 " " PATRICK … "5.5  "   W  23   D  28   W   2   W  26   D   5   W  19  
## 5 "    5 " " HANSHI Z… "5.5  "   W  45   W  37   D  12   D  13   D   4   W  14  
## # ℹ 3 more variables: Round.7 <chr>, State <chr>, Pre.Rating <chr>

Clean the value of pre.rating column to keep only the pre_rating value.

players_tibble <- players_tibble %>%
  mutate(Pre.Rating = str_replace(Pre.Rating, "[0-9]\\s+/R:\\s+|^[P].+|>.+", "")) %>%
  mutate(Pre.Rating = as.integer(str_extract(Pre.Rating, "\\d+")))
#players_tibble

Fix formats and types of the Total points, Pair.Num, and Player.Name columns

players_tibble <- players_tibble %>%
  mutate(Total.Pts = as.numeric(Total.Pts),
         Pair.Num = as.character(Pair.Num),  # Ensure Pair.Num is treated as character
         Pair.Num = parse_number(Pair.Num),
         Player.Name = trimws(Player.Name))
head(players_tibble)

## # A tibble: 6 × 12
##   Pair.Num Player.Name Total.Pts Round.1 Round.2 Round.3 Round.4 Round.5 Round.6
##      <dbl> <chr>           <dbl> <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
## 1        1 GARY HUA          6   W  39   W  21   W  18   W  14   W   7   D  12  
## 2        2 DAKSHESH D…       6   W  63   W  58   L   4   W  17   W  16   W  20  
## 3        3 ADITYA BAJ…       6   L   8   W  61   W  25   W  21   W  11   W  13  
## 4        4 PATRICK H …       5.5 W  23   D  28   W   2   W  26   D   5   W  19  
## 5        5 HANSHI ZUO        5.5 W  45   W  37   D  12   D  13   D   4   W  14  
## 6        6 HANSEN SONG       5   W  34   D  29   L  11   W  35   D  10   W  27  
## # ℹ 3 more variables: Round.7 <chr>, State <chr>, Pre.Rating <int>

Arrange Opponent Pair-Num to corresponding player

# Gathering and mutating
players_tibble <- players_tibble %>%
    # The gather function will stalk all round column names as key and value of those columns gonna be the key
  gather("Round","Oppo_PNum", 4:10) %>% # Round 1 to Round 7
  mutate(Oppo_PNum = parse_number(Oppo_PNum))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Oppo_PNum = parse_number(Oppo_PNum)`.
## Caused by warning:
## ! 40 parsing failures.
## row col expected actual
##  37  -- a number      B
##  44  -- a number      B
##  53  -- a number      H
##  56  -- a number      H
## 123  -- a number      B
## ... ... ........ ......
## See problems(...) for more details.

# create a new column for opponent pre rating
playerRatings = select(players_tibble, Pair.Num,OppoP_Rating = Pre.Rating) %>% distinct()

# Now join the opponent pre rating column to players_tibble dataframe.
players_tibble = left_join(players_tibble, playerRatings, by = c("Oppo_PNum" = "Pair.Num"))
# Print players_tibble after gathering and mutating
head(players_tibble)

## # A tibble: 6 × 8
##   Pair.Num Player.Name   Total.Pts State Pre.Rating Round Oppo_PNum OppoP_Rating
##      <dbl> <chr>             <dbl> <chr>      <int> <chr>     <dbl>        <int>
## 1        1 GARY HUA            6   "   …   15445895 Roun…        39     12923035
## 2        2 DAKSHESH DAR…       6   "   …   14598900 Roun…        63     15057092
## 3        3 ADITYA BAJAJ        6   "   …   14959604 Roun…         8     15142253
## 4        4 PATRICK H SC…       5.5 "   …   12616049 Roun…        23     15030142
## 5        5 HANSHI ZUO          5.5 "   …   14601533 Roun…        45     15372807
## 6        6 HANSEN SONG         5   "   …   15055204 Roun…        34     15051807

Find the Average of opponent Player’s pre-rating

players_tibble = players_tibble %>% 
  group_by(Player.Name) %>% 
  mutate(Avg_Opp_PreRating = round(mean(OppoP_Rating, na.rm=T)))

final_df <- players_tibble %>% 
  select(Player.Name, Player.State = State, Total.Pts, Pre.Rating, Avg_Opp_PreRating) %>% 
  ungroup() %>% 
  distinct()

head(final_df)

## # A tibble: 6 × 5
##   Player.Name         Player.State Total.Pts Pre.Rating Avg_Opp_PreRating
##   <chr>               <chr>            <dbl>      <int>             <dbl>
## 1 GARY HUA            "   ON "           6     15445895          12333625
## 2 DAKSHESH DARURI     "   MI "           6     14598900          12663102
## 3 ADITYA BAJAJ        "   MI "           6     14959604          14177344
## 4 PATRICK H SCHILLING "   MI "           5.5   12616049          14936182
## 5 HANSHI ZUO          "   MI "           5.5   14601533          13095983
## 6 HANSEN SONG         "   OH "           5     15055204          14525725

Connect to MySQL Databse

lamadb <- dbConnect(RMySQL::MySQL(), 
                 user = "chhiring.lama65", 
                 password = passwd, 
                 dbname = "chhiring.lama65",
                 host = "cunydata607sql.mysql.database.azure.com")

Due to limited privileges, I am unable to create the schema

#sql_statement <- "CREATE SCHEMA IF NOT EXISTS chess_tournament"

Create a table named “chess_tournament”

dbWriteTable(lamadb, "chess_tournament", final_df, overwrite = TRUE)

## [1] TRUE

List tables

dbListTables(lamadb)

## [1] "chess_tournament" "movie_ratings"

Disconnect from the Databse

dbDisconnect(lamadb)

## [1] TRUE

Create a csv file from this R dataframe

write.csv(players_tibble, "C:\\Users\\Chhiring\\iCloudDrive\\CUNY_MS\\DATA607\\Project_1\\project1.csv", row.names = FALSE)

Project_1

Chhiring Lama

2024-02-12

Project 1

Introduction