Project 1 - Data Analysis

library(tidyverse)

Load and preview chess data

# Read the raw tournament cross table from disk
cross_table <- read.delim(file = "tournamentinfo.txt")

# Report the underlying storage type of the loaded object
typeof(cross_table)

## [1] "list"

# Show the first 12 rows of the raw table
head(cross_table, n = 12)

##    X.........................................................................................
## 1   Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| 
## 2   Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | 
## 3   -----------------------------------------------------------------------------------------
## 4       1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|
## 5      ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 6   -----------------------------------------------------------------------------------------
## 7       2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|
## 8      MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |
## 9   -----------------------------------------------------------------------------------------
## 10      3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|
## 11     MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |
## 12  -----------------------------------------------------------------------------------------

Helper function: Remove whitespace

This function removes leading and/or trailing whitespace from a string. It is used in the main conversion function, to_data_frame.

# Trim leading and trailing whitespace from every field of a split line.
#
# Args:
#   line_split: A character vector of fields, or a list of such vectors
#     (e.g. the result of str_split()); a list is flattened first.
#
# Returns:
#   An unnamed character vector with one trimmed entry per input field, in
#   the original order (length zero when there are no fields).
remove_ws <- function(line_split) {
  # Flatten so both list and plain-vector inputs are handled the same way
  fields <- as.character(unlist(line_split, use.names = FALSE))
  # trimws() is vectorised, so every field keeps its own position; indexing the
  # result by the field text would merge fields that have identical content
  # (e.g. repeated "W    " colour cells) and shift the positions after them.
  trimws(fields)
}

Helper function: Extract score

This function extracts the prerating score of the player from a string via regex and is used in the main conversion function, to_data_frame. The regular expression looks for a 3-4 digit number following "R:" (e.g. "R: 1393"). The following lines remove "R:" from the string, trim the whitespace, and convert to an integer.

# Extract a player's pre-tournament rating from a raw field.
#
# The rating is the run of digits that follows "R:", with any amount of
# whitespace (including none) in between, e.g. "R: 1794", "R:  955P11".
#
# Args:
#   raw_string: Character vector of raw fields such as
#     "15445895 / R: 1794   ->1817".
#
# Returns:
#   An integer vector the same length as raw_string; NA where no rating
#   is found.
extract_score <- function(raw_string) {
  raw_string <- as.character(raw_string)
  # Locate "R:" + optional whitespace + digits in each element
  pos <- regexpr("R:[[:space:]]*[0-9]+", raw_string)
  found <- !is.na(pos) & pos > 0
  # Start from all-NA so elements without a rating stay missing
  rating <- rep(NA_integer_, length(raw_string))
  # regmatches() returns only the matched elements; strip the "R:" prefix and
  # the padding so that just the digits are converted
  matched <- regmatches(raw_string, pos)
  rating[found] <- as.integer(sub("^R:[[:space:]]*", "", matched))
  rating
}

Helper function: Extract opponent

This function extracts the player number from a string via regex and is used in the main conversion function, to_data_frame. The regular expression looks for a 1-2 digit number. Note, the regular expression will need to be updated if player numbers exceed two digits.

# Extract the opponent's player number from a round field such as "W  39".
#
# The first run of digits in each element is returned in full, so player
# numbers of any length are supported.
#
# Args:
#   raw_string: Character vector of round fields.
#
# Returns:
#   A character vector the same length as raw_string holding the digits found;
#   NA where a field contains no digits (e.g. a bye such as "H    ").
extract_opponent <- function(raw_string) {
  raw_string <- as.character(raw_string)
  # Position of the first digit run in each element (-1 when absent)
  pos <- regexpr("[0-9]+", raw_string)
  found <- !is.na(pos) & pos > 0
  # Start from all-NA so fields without a number stay missing
  opponent <- rep(NA_character_, length(raw_string))
  # regmatches() returns only the matched elements, in order
  opponent[found] <- regmatches(raw_string, pos)
  opponent
}

Clean and convert data into dataframe

This is the main data cleaning and conversion function which at a high-level creates a dataframe with four out of the five attributes needed for this project, plus the player's opponents numbers. First, the function initiates a dataframe with dummy data that will be removed later. Second, the function loops through the dataframe in 3 row increments. While in the loop, various data is extracted and cleaned, then added to the dataframe.

# Convert the raw cross table into a data frame with one row per player.
#
# The raw table holds one text line per row in its first column. Player records
# start at row 4 and repeat every 3 rows: two pipe-separated data lines followed
# by a dashed separator line.
#
# Args:
#   cross_table: Data frame whose first column contains the raw text lines.
#
# Returns:
#   A data frame with columns name, state, points, prerating and opponents
#   (the opponents' player numbers joined with ", "; rounds without an
#   opponent appear as "NA"). It has zero rows when no complete player record
#   is present.
to_data_frame <- function(cross_table) {
   raw_lines <- as.character(cross_table[[1]])
   n_lines <- length(raw_lines)
   # A record needs rows i and i + 1, so the last usable start is n_lines - 1;
   # guarding the short case avoids seq() failing on a decreasing range
   if (n_lines >= 5) {
      starts <- seq(4, n_lines - 1, by = 3)
   } else {
      starts <- integer(0)
   }

   # Preallocate one slot per player instead of growing a table row by row
   n_players <- length(starts)
   name <- character(n_players)
   state <- character(n_players)
   points <- numeric(n_players)
   prerating <- numeric(n_players)
   opponents <- character(n_players)

   for (k in seq_along(starts)) {
      i <- starts[k]
      # Split the two lines for each player using pipes as the separator
      line_split_one <- str_split(raw_lines[i], "\\|")
      line_split_two <- str_split(raw_lines[i + 1], "\\|")
      # Remove whitespace
      line_clean_one <- remove_ws(line_split_one)
      line_clean_two <- remove_ws(line_split_two)

      # Round results are every field after pair number, name and total points
      round_fields <- line_split_one[[1]][-(1:3)]
      n_fields <- length(round_fields)
      # The closing pipe leaves a blank trailing piece, which is not a round
      if (n_fields > 0 && !nzchar(trimws(round_fields[n_fields]))) {
         round_fields <- round_fields[-n_fields]
      }

      name[k] <- str_to_title(line_clean_one[[2]], locale = "en")
      state[k] <- line_clean_two[[1]]
      points[k] <- as.numeric(line_clean_one[[3]])
      prerating[k] <- extract_score(line_clean_two[[2]])
      # paste() renders a missing opponent as the text "NA"
      opponents[k] <- paste(extract_opponent(round_fields), collapse = ", ")
   }

   # Returned visibly, with default 1..n row names
   data.frame(name = name, state = state, points = points,
              prerating = prerating, opponents = opponents,
              stringsAsFactors = FALSE)
}

# Parse the raw cross table into one row per player, then show the first 12
df_chess <- cross_table %>% to_data_frame()
head(df_chess, n = 12)

##                        name state points prerating                  opponents
## 1                  Gary Hua    ON    6.0      1794   39, 21, 18, 14, 7, 12, 4
## 2           Dakshesh Daruri    MI    6.0      1553   63, 58, 4, 17, 16, 20, 7
## 3              Aditya Bajaj    MI    6.0      1384  8, 61, 25, 21, 11, 13, 12
## 4       Patrick H Schilling    MI    5.5      1716    23, 28, 2, 26, 5, 19, 1
## 5                Hanshi Zuo    MI    5.5      1655  45, 37, 12, 13, 4, 14, 17
## 6               Hansen Song    OH    5.0      1686 34, 29, 11, 35, 10, 27, 21
## 7         Gary Dee Swathell    MI    5.0      1649    57, 46, 13, 11, 1, 9, 2
## 8          Ezekiel Houghton    MI    5.0      1641   3, 32, 14, 9, 47, 28, 19
## 9               Stefano Lee    ON    5.0      1411   25, 18, 59, 8, 26, 7, 20
## 10                Anvit Rao    MI    5.0      1365  16, 19, 55, 31, 6, 25, 18
## 11 Cameron William Mc Leman    MI    4.5      1712    38, 56, 6, 7, 3, 34, 26
## 12           Kenneth J Tack    MI    4.5      1663    42, 33, 5, 38, NA, 1, 3

Preparation for calculating average pre chess rating of opponents

Add player_num column and create opp_pre (average opponent prerating score) column with temp data.

# Turn the row index into an explicit player_num column (kept as character,
# exactly as returned by rownames())
df_chess <- cbind(player_num = rownames(df_chess), df_chess)
# Reset row names to a plain sequence; seq_len() avoids the 1:0 pitfall of
# 1:nrow() when the table is empty
rownames(df_chess) <- seq_len(nrow(df_chess))
# Placeholder for the average opponent prerating, filled in afterwards
df_chess["opp_pre"] <- 0

Calculate average pre chess rating of opponents

This function calculates the average pre chess rating of opponents for each player. While looping through the dataframe one row at a time, each opponents cell is split into a vector of player numbers, which is used to filter for only the rows containing the current player's opponents. The preratings of those opponents are summed, then divided by the number of games played (i.e. byes and forfeits, recorded as NAs, are not included). Note, R's base round function follows the IEC 60559 standard and rounds exact .5 values to the nearest even integer (e.g. 1500.5 becomes 1500, while 1501.5 becomes 1502); anything above .5 is rounded up.

# Compute each player's average opponent pre-tournament rating.
#
# Args:
#   df_chess: Data frame with columns player_num, prerating and opponents,
#     where opponents is a ", "-separated string of player numbers and rounds
#     without an opponent are written as "NA".
#
# Returns:
#   df_chess with the opp_pre column set to the mean prerating of the listed
#   opponents, rounded to a whole number. The column is created if it does not
#   exist yet. A player with no listed opponents gets NaN.
cal_avg_pre_opp_rating <- function(df_chess) {
  n_players <- nrow(df_chess)
  opp_pre <- numeric(n_players)
  for (i in seq_len(n_players)) {
    # Split opponents string by comma
    tokens <- strsplit(as.character(df_chess$opponents[i]), ",\\s")[[1]]
    tokens <- trimws(tokens)
    # Drop byes/forfeits up front so the numeric conversion does not warn
    tokens <- tokens[!is.na(tokens) & nzchar(tokens) & tokens != "NA"]
    opp_ids <- as.numeric(tokens)
    # Look up one row per game played; opponent numbers that are not in the
    # table are ignored
    opp_rows <- match(opp_ids, df_chess$player_num)
    opp_rows <- opp_rows[!is.na(opp_rows)]
    opp_ratings <- df_chess$prerating[opp_rows]
    # Average rounded to the nearest full point
    opp_pre[i] <- round(sum(opp_ratings) / length(opp_ratings), digits = 0)
  }
  # Assign by name so the result never lands in an unrelated column
  df_chess$opp_pre <- opp_pre
  df_chess
}

# Fill in the average opponent prerating for every player
df_final <- df_chess %>% cal_avg_pre_opp_rating()

Write to CSV

# Drop the helper columns (player_num and opponents) that are not part of the
# final output, then show the first 12 rows
df_final <- df_final %>% select(-c(player_num, opponents))
head(df_final, n = 12)

##                        name state points prerating opp_pre
## 1                  Gary Hua    ON    6.0      1794    1605
## 2           Dakshesh Daruri    MI    6.0      1553    1469
## 3              Aditya Bajaj    MI    6.0      1384    1564
## 4       Patrick H Schilling    MI    5.5      1716    1574
## 5                Hanshi Zuo    MI    5.5      1655    1501
## 6               Hansen Song    OH    5.0      1686    1519
## 7         Gary Dee Swathell    MI    5.0      1649    1372
## 8          Ezekiel Houghton    MI    5.0      1641    1468
## 9               Stefano Lee    ON    5.0      1411    1523
## 10                Anvit Rao    MI    5.0      1365    1554
## 11 Cameron William Mc Leman    MI    4.5      1712    1468
## 12           Kenneth J Tack    MI    4.5      1663    1506

# Save the final player table to disk without row names
write.csv(x = df_final, file = "chess_players_final.csv", row.names = FALSE)

This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.