DATA 607 - Project 1

Loading the Text File

raw <- readLines(url("https://raw.githubusercontent.com/ShanaFarber/cuny-sps/master/DATA_607/project1/player_stats.txt"))

## Warning in
## readLines(url("https://raw.githubusercontent.com/ShanaFarber/cuny-sps/master/DATA_607/project1/player_stats.txt")):
## incomplete final line found on
## 'https://raw.githubusercontent.com/ShanaFarber/cuny-sps/master/DATA_607/project1/player_stats.txt'

raw[0:10]

##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------"

In order to figure out what values to extract, I used str_view and str_view_all to look for patterns within the string to determine the correct regex to use to extract the data within each string.

########## check for player names ##########
str_view_all(raw[-c(1,2,3,4)], '\\w+\\s\\w+(\\s\\w+)?(\\s\\w+)?') # accounts for up to four names

str_view_all(raw[-c(1,2,3,4)], '([A-Z])+\\s([A-Z](\\s)?)*([A-Z])+') # accounts for more possible names

########## check for total score ##########
str_view_all(raw, '\\d\\.\\d')

########## check for pre_ratings ##########
str_view(raw[-c(1,2,3,4)], '\\d+(P\\d+)?(\\s+)?\\->(\\s+)?\\d+(P\\d+)?') # gets both the pre and post rating together

str_view(raw[-c(1,2,3,4)], '\\d+(P\\d+)?(\\s+)?\\->') # just the pre rating

########## check for states ##########
str_view_all(raw[-c(1,2,3,4)], '[A-Z][A-Z]\\s\\|')

########## check for rounds ##########
str_view_all(str_remove(raw, '^\\s+[A-Z].+'), '[WLBDXU]\\s+(\\d+)?\\|')

Extracting the Information

# player names have at least first and last name and maybe middle initial or multiple middle names
player_names <- unlist(str_extract_all(raw[-c(1,2,3,4)], '([A-Z])+\\s([A-Z](\\s)?)*([A-Z])+'))

# totals are the only numbers with a decimal
totals <- unlist(str_extract_all(raw, '\\d\\.\\d'))

# ratings in the format of number -> number
# just need the part before -> for the pre-rating
pre_ratings <- unlist(str_extract_all(raw[-c(1,2,3,4)], '\\d+(P\\d+)?(\\s+)?\\->'))

# remove the arrow and any trailing 'P..'
pre_ratings <- str_remove(pre_ratings, '(P\\d+)?(\\s+)?->')

# states have two upper case letters
# first extract all values with two upper case letters, a space, and a pipe so as not to get any values other than the states
# then extract the states from the list
states <- unlist(str_extract_all(raw[-c(1,2,3,4)], '[A-Z][A-Z]\\s\\|'))

states <- str_remove(states, '\\s\\|')

Creating the Data Frame

I combined the extracted information into a data frame chess_stats.

chess_stats <- data.frame("player_name" = player_names,
                          "player_state" = states,
                          "total" = totals,
                          "pre_rating" = pre_ratings)

# recast numeric columns
chess_stats$total <- as.numeric(chess_stats$total)
chess_stats$pre_rating <- as.numeric(chess_stats$pre_rating)

Now I needed to calculate the average pre-rating of opponents for each player:

In order to calculate the average pre-rating of opponents for each player, I needed to add in the information about their opponents. To do this, I extracted the information for the results of each round from the raw data. I then made a seven column matrix from that data so I would have a data frame of each player’s results for each of the seven rounds.

# each round has either W, L, D with a number, or H, U, X, or B
rounds <- unlist(str_extract_all(unlist((str_extract_all(str_remove(raw, '^\\s+[A-Z].+'), '\\|[WLDXUHB]\\s+(\\d+)?'))), '[WLDXUHB]\\s+(\\d+)?'))

num_players <- length(player_names)

num_rounds <- length(rounds)/num_players
  
# create a seven column matrix from rounds to have a data frame of the results of each round for each player
rounds <- matrix(rounds, byrow=T, ncol=num_rounds)

I then created a new data frame combining the chess_stats table with the rounds matrix. I used pivot_longer to expand the data frame so I could filter out rows for games that a player did not play. I also created a column just for each opponent’s player ID.

# combine the stats with the results of each round and pivot_longer
full_stats <- chess_stats %>%
  cbind(rounds) %>%
  pivot_longer(cols = c("1", "2", "3", "4", "5", "6", "7"),
               names_to = "round",
               values_to = "results")

# filtering out rounds a player did not play
full_stats <- full_stats %>%
  filter(str_detect(full_stats$results, '[WLD]')) # only keep rows where the result was a win/loss/draw i.e. games where there was an opponent

# add a column for opponent player_id from each round
full_stats <- full_stats %>%
  mutate("opp_player_id" = as.integer(str_extract(results, "\\d+")))

I created a separate data frame of each player’s ID number and their pre-rating and I used inner_join to add the pre-rating for each opponent for each round. I then used summarize to calculate the average pre-rating of opponents for each player and I joined that to the initial data frame chess_stats.

opp_pre_ratings <- data.frame("opp_player_id" = c(1:64),
                          "opp_pre_rating" = chess_stats$pre_rating)

avg_opp_pre_ratings <- full_stats %>%
  left_join(opp_pre_ratings, on = "opp_player_id") %>%
  group_by(player_name) %>%
  summarize("avg_opp_pre_rating" = round(mean(opp_pre_rating), 0))

## Joining, by = "opp_player_id"

chess_stats <- chess_stats %>%
  left_join(avg_opp_pre_ratings, on = "player_name")

## Joining, by = "player_name"

knitr::kable(head(chess_stats))

player_name	player_state	total	pre_rating	avg_opp_pre_rating
GARY HUA	ON	6.0	1794	1605
DAKSHESH DARURI	MI	6.0	1553	1469
ADITYA BAJAJ	MI	6.0	1384	1564
PATRICK H SCHILLING	MI	5.5	1716	1574
HANSHI ZUO	MI	5.5	1655	1501
HANSEN SONG	OH	5.0	1686	1519

Exporting to CSV

write.csv(chess_stats, "C:/Temp/player_stats.csv", row.names = FALSE) # use 'C:/Temp' or replace with desired file path

DATA 607 - Project 1

Shoshana Farber

February 19, 2023

Loading the Text File

Extracting the Information

Creating the Data Frame

Exporting to CSV