Project01

Introduction:

Saved on Kaylie’s public GitHub is a .txt file with chess tournament results. This R Markdown takes that file, cleans it, transforms it, summarizes it, and exports the result as a .csv file. The final file will have the following columns: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents. That file does not have headers.

Load necessary libraries

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Read the text file, clean it, and save as a data frame

Reading the text file:

#Read the txt file and save as object
data_read <- read_lines(
          file = "https://raw.githubusercontent.com/evanskaylie/DATA607/main/Project01PlayerInfo",
          skip = 4
        )

Delimit and clean the data:

#Split the data on the row delimiter "------"
data_column <- str_split(
                  data_read, 
                  pattern = "-----", 
                  simplify = TRUE
                )

#Remove empty rows
data_column <- data_column[!data_column == ""]

#Convert character column to a data frame
df <- as.data.frame(data_column)

#Separate the column with the | delimiter
df <- df |>
  separate(
    col = data_column, 
    into = paste0("col", 1:10), 
    sep = "\\|", 
    convert = TRUE
  )

## Warning: Expected 10 pieces. Additional pieces discarded in 128 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

## Warning: Expected 10 pieces. Missing pieces filled with `NA` in 64 rows [129, 130, 131,
## 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147,
## 148, ...].

#Show the data frame
head(df)

##     col1                              col2  col3  col4  col5  col6  col7  col8
## 1     1   GARY HUA                         6.0   W  39 W  21 W  18 W  14 W   7
## 2    ON   15445895 / R: 1794   ->1817      N:2   W     B     W     B     W    
## 3     2   DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17 W  16
## 4    MI   14598900 / R: 1553   ->1663      N:2   B     W     B     W     B    
## 5     3   ADITYA BAJAJ                     6.0   L   8 W  61 W  25 W  21 W  11
## 6    MI   14959604 / R: 1384   ->1640      N:2   W     B     W     B     W    
##    col9 col10
## 1 D  12 D   4
## 2 B     W    
## 3 W  20 W   7
## 4 W     B    
## 5 W  13 W  12
## 6 B     W

The above data frame is not tidy. That is to say there are multiple rows that contain a single observation. Exactly 2 rows contain each observation. One solution is to widen the data frame and shorten it, both by exactly twice and half as much, respectively.

Now take each even row, and glue it to the end of the odd row above it to create a tidy data frame:

#Identify every other row using seq()
rows_to_keep <- seq(2, nrow(df), by = 2)

#Select the rows to keep as a data frame
df_to_add <- df[rows_to_keep, ]

#Rename the columns of df_to_add to avoid conflicts
colnames(df_to_add) <- paste0(colnames(df_to_add), "_ext")

# Bind the selected rows to the row above
df <- bind_cols(df[-rows_to_keep, ], df_to_add)

#Keep only the 64 cells that contain values
df <- df[1:64,]

#Preview of data
head(df)

##     col1                              col2  col3  col4  col5  col6  col7  col8
## 1     1   GARY HUA                         6.0   W  39 W  21 W  18 W  14 W   7
## 2     2   DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17 W  16
## 3     3   ADITYA BAJAJ                     6.0   L   8 W  61 W  25 W  21 W  11
## 4     4   PATRICK H SCHILLING              5.5   W  23 D  28 W   2 W  26 D   5
## 5     5   HANSHI ZUO                       5.5   W  45 W  37 D  12 D  13 D   4
## 6     6   HANSEN SONG                      5.0   W  34 D  29 L  11 W  35 D  10
##    col9 col10 col1_ext                          col2_ext col3_ext col4_ext
## 1 D  12 D   4      ON   15445895 / R: 1794   ->1817         N:2      W    
## 2 W  20 W   7      MI   14598900 / R: 1553   ->1663         N:2      B    
## 3 W  13 W  12      MI   14959604 / R: 1384   ->1640         N:2      W    
## 4 W  19 D   1      MI   12616049 / R: 1716   ->1744         N:2      W    
## 5 W  14 W  17      MI   14601533 / R: 1655   ->1690         N:2      B    
## 6 W  27 W  21      OH   15055204 / R: 1686   ->1687         N:3      W    
##   col5_ext col6_ext col7_ext col8_ext col9_ext col10_ext
## 1    B        W        B        W        B         W    
## 2    W        B        W        B        W         B    
## 3    B        W        B        W        B         W    
## 4    B        W        B        W        B         B    
## 5    W        B        W        B        W         B    
## 6    B        W        B        B        W         B

Now the data is clean enough to start the analysis.

Get information for the summarized data frame

In order, the below chunks will collect data frames and character vectors that give the following information: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

1: Player’s Name

#Save player names as a character vector
player_name <- df$col2

#Find all the alphanumeric values in the row
player_name <- str_extract_all(player_name, "[[:alnum:]]+[[:space:]]?")

#Flatten the alphanumeric values and remove extra space
player_name <- player_name |>
  sapply(paste, collapse = "") |>
  str_sub(end = -2)

#Preview of data
head(player_name)

## [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
## [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"

2: Player’s State

#Save player states as a character vector
player_state <- df$col1_ext

#Find all the alphanumeric values in the row
player_state <- str_extract_all(player_state, "[[:alnum:]]+")

#Flatten the alphanumeric values 
player_state <- player_state |>
  sapply(paste, collapse = "")

#Preview of data
head(player_state)

## [1] "ON" "MI" "MI" "MI" "MI" "OH"

3: Total Number of Points

#Save player total points as an number vector of doubles
player_points <- as.double(df$col3)

#Preview of data
head(player_points)

## [1] 6.0 6.0 6.0 5.5 5.5 5.0

4: Player’s Pre-Rating

#Save entire row as character vector
player_pre_rating <- df$col2_ext

##Get the data for pre-rating from the rows
player_pre_rating <- str_extract_all(player_pre_rating, "[:]\\s+\\d\\d\\d+")

##Remove extra characters, leaving only the rating
player_pre_rating <- str_extract_all(player_pre_rating, "\\d+")

##Reformat to character vector rather than list
player_pre_rating <- unlist(player_pre_rating)

#Preview of data
head(player_pre_rating)

## [1] "1794" "1553" "1384" "1716" "1655" "1686"

5: Average Pre Chess Rating of Opponents

Because of the direction taken on this R Markdown, this section is going to be a bit more convoluted than the above chunks. It will be separated by stage of the process.

Save the data for who a competitor had as opponents in the tournament

#Save each round's number as character vector
round_opponents <- paste(
  df$col4,
  df$col5,
  df$col6,
  df$col7,
  df$col8,
  df$col9,
  df$col10
  )

##Get the data for pre-rating from the rows
round_opponents <- str_extract_all(round_opponents, "\\d+")

#Pad missing values as NA so data frame conversion works
max_length <- max(lengths(round_opponents))

round_opponents <- lapply(round_opponents, function(x) {
  length(x) <- max_length
  x
})

#Check on the data
head(round_opponents)

## [[1]]
## [1] "39" "21" "18" "14" "7"  "12" "4" 
## 
## [[2]]
## [1] "63" "58" "4"  "17" "16" "20" "7" 
## 
## [[3]]
## [1] "8"  "61" "25" "21" "11" "13" "12"
## 
## [[4]]
## [1] "23" "28" "2"  "26" "5"  "19" "1" 
## 
## [[5]]
## [1] "45" "37" "12" "13" "4"  "14" "17"
## 
## [[6]]
## [1] "34" "29" "11" "35" "10" "27" "21"

Pivot the data frame, so the data can be combined with opponent pre-chess rates

#Save the opponent data as a data frame
opponent_indices <- data.frame(round_opponents)

#Name the columns for pivoting
names(opponent_indices)[1:64] <- c("01","02","03","04","05","06","07","08","09",10:64)

#Add row names to pivot the data
row_names_col <- str_c("Round ", 1:7)

#Add row name column to data frame
opponent_indices <- bind_cols(Round = str_c("Round ", 1:7), opponent_indices)

#Pivot the data frame
opponent_indices <- opponent_indices |>
  pivot_longer(
    cols = !starts_with("Round"),
    names_to = "Competitor_Index",
    values_to = "Opponent_Index"
  )

#Check on the data
head(opponent_indices)

## # A tibble: 6 × 3
##   Round   Competitor_Index Opponent_Index
##   <chr>   <chr>            <chr>         
## 1 Round 1 01               39            
## 2 Round 1 02               63            
## 3 Round 1 03               8             
## 4 Round 1 04               23            
## 5 Round 1 05               45            
## 6 Round 1 06               34

This chunk shows R’s VLOOKUP (excel) equivalent to get the opponent rating

#Add index column to player pre rating
opponent_pre_rate <- data.frame(
                      Opponent_Index = c(1:64),
                      rate = player_pre_rating
                    )

#Glue opponent rate to the data frame
opponent_pre_rate <- merge(opponent_indices, opponent_pre_rate, by = "Opponent_Index")

Calculating the average pre chess rating of opponents for each competitor

#Convert the rate column to numeric for calculation
opponent_pre_rate[, 4] <- sapply(opponent_pre_rate[,4], as.numeric)

#Take the average rate, grouped by competitor
player_avg_opp_rtg <- opponent_pre_rate |>
  group_by(Competitor_Index) |>
  summarise(mean = mean(rate))
  
#player_avg_opp_rtg <- player_avg_opp_rtg |>
#  order(decreasing = FALSE)
  
player_avg_opp_rtg[order(player_avg_opp_rtg$Competitor_Index),]

## # A tibble: 64 × 2
##    Competitor_Index  mean
##    <chr>            <dbl>
##  1 01               1605.
##  2 02               1469.
##  3 03               1564.
##  4 04               1574.
##  5 05               1501.
##  6 06               1519.
##  7 07               1372.
##  8 08               1468.
##  9 09               1523.
## 10 10               1554.
## # ℹ 54 more rows

#Check on the data
head(player_avg_opp_rtg)

## # A tibble: 6 × 2
##   Competitor_Index  mean
##   <chr>            <dbl>
## 1 01               1605.
## 2 02               1469.
## 3 03               1564.
## 4 04               1574.
## 5 05               1501.
## 6 06               1519.

Prepare and export the .csv file

Concatenate the character vectors into a results object

#Gluing the necessary objects together
player_results <- paste(
  player_name, ",", 
  player_state, ",", 
  player_points, ",",
  player_pre_rating, ",",
  round(player_avg_opp_rtg$mean, digits = 0)
     #Rounding the average to match project guidelines
  )

#Preview data
head(player_results)

## [1] "GARY HUA , ON , 6 , 1794 , 1605"             
## [2] "DAKSHESH DARURI , MI , 6 , 1553 , 1469"      
## [3] "ADITYA BAJAJ , MI , 6 , 1384 , 1564"         
## [4] "PATRICK H SCHILLING , MI , 5.5 , 1716 , 1574"
## [5] "HANSHI ZUO , MI , 5.5 , 1655 , 1501"         
## [6] "HANSEN SONG , OH , 5 , 1686 , 1519"

Export as .csv file that saves to the user’s current working directory

#Writing the results to a csv file
write.csv(player_results, file = "player_results.csv", row.names = F)