raw <- readLines(url("https://raw.githubusercontent.com/ShanaFarber/cuny-sps/master/DATA_607/project1/player_stats.txt"))
## Warning in
## readLines(url("https://raw.githubusercontent.com/ShanaFarber/cuny-sps/master/DATA_607/project1/player_stats.txt")):
## incomplete final line found on
## 'https://raw.githubusercontent.com/ShanaFarber/cuny-sps/master/DATA_607/project1/player_stats.txt'
raw[0:10]
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
In order to figure out what values to extract, I used
str_view and str_view_all to look for patterns
within the string to determine the correct regex to use to extract the
data within each string.
########## check for player names ##########
str_view_all(raw[-c(1,2,3,4)], '\\w+\\s\\w+(\\s\\w+)?(\\s\\w+)?') # accounts for up to four names
str_view_all(raw[-c(1,2,3,4)], '([A-Z])+\\s([A-Z](\\s)?)*([A-Z])+') # accounts for more possible names
########## check for total score ##########
str_view_all(raw, '\\d\\.\\d')
########## check for pre_ratings ##########
str_view(raw[-c(1,2,3,4)], '\\d+(P\\d+)?(\\s+)?\\->(\\s+)?\\d+(P\\d+)?') # gets both the pre and post rating together
str_view(raw[-c(1,2,3,4)], '\\d+(P\\d+)?(\\s+)?\\->') # just the pre rating
########## check for states ##########
str_view_all(raw[-c(1,2,3,4)], '[A-Z][A-Z]\\s\\|')
########## check for rounds ##########
str_view_all(str_remove(raw, '^\\s+[A-Z].+'), '[WLBDXU]\\s+(\\d+)?\\|')
# player names have at least first and last name and maybe middle initial or multiple middle names
player_names <- unlist(str_extract_all(raw[-c(1,2,3,4)], '([A-Z])+\\s([A-Z](\\s)?)*([A-Z])+'))
# totals are the only numbers with a decimal
totals <- unlist(str_extract_all(raw, '\\d\\.\\d'))
# ratings in the format of number -> number
# just need the part before -> for the pre-rating
pre_ratings <- unlist(str_extract_all(raw[-c(1,2,3,4)], '\\d+(P\\d+)?(\\s+)?\\->'))
# remove the arrow and any trailing 'P..'
pre_ratings <- str_remove(pre_ratings, '(P\\d+)?(\\s+)?->')
# states have two upper case letters
# first extract all values with two upper case letters, a space, and a pipe so as not to get any values other than the states
# then extract the states from the list
states <- unlist(str_extract_all(raw[-c(1,2,3,4)], '[A-Z][A-Z]\\s\\|'))
states <- str_remove(states, '\\s\\|')
I combined the extracted information into a data frame
chess_stats.
chess_stats <- data.frame("player_name" = player_names,
"player_state" = states,
"total" = totals,
"pre_rating" = pre_ratings)
# recast numeric columns
chess_stats$total <- as.numeric(chess_stats$total)
chess_stats$pre_rating <- as.numeric(chess_stats$pre_rating)
Now I needed to calculate the average pre-rating of opponents for each player:
In order to calculate the average pre-rating of opponents for each player, I needed to add in the information about their opponents. To do this, I extracted the information for the results of each round from the raw data. I then made a seven column matrix from that data so I would have a data frame of each player’s results for each of the seven rounds.
# each round has either W, L, D with a number, or H, U, X, or B
rounds <- unlist(str_extract_all(unlist((str_extract_all(str_remove(raw, '^\\s+[A-Z].+'), '\\|[WLDXUHB]\\s+(\\d+)?'))), '[WLDXUHB]\\s+(\\d+)?'))
num_players <- length(player_names)
num_rounds <- length(rounds)/num_players
# create a seven column matrix from rounds to have a data frame of the results of each round for each player
rounds <- matrix(rounds, byrow=T, ncol=num_rounds)
I then created a new data frame combining the
chess_stats table with the rounds matrix. I
used pivot_longer to expand the data frame so I could
filter out rows for games that a player did not play. I also created a
column just for each opponent’s player ID.
# combine the stats with the results of each round and pivot_longer
full_stats <- chess_stats %>%
cbind(rounds) %>%
pivot_longer(cols = c("1", "2", "3", "4", "5", "6", "7"),
names_to = "round",
values_to = "results")
# filtering out rounds a player did not play
full_stats <- full_stats %>%
filter(str_detect(full_stats$results, '[WLD]')) # only keep rows where the result was a win/loss/draw i.e. games where there was an opponent
# add a column for opponent player_id from each round
full_stats <- full_stats %>%
mutate("opp_player_id" = as.integer(str_extract(results, "\\d+")))
I created a separate data frame of each player’s ID number and their
pre-rating and I used inner_join to add the pre-rating for
each opponent for each round. I then used summarize to
calculate the average pre-rating of opponents for each player and I
joined that to the initial data frame chess_stats.
opp_pre_ratings <- data.frame("opp_player_id" = c(1:64),
"opp_pre_rating" = chess_stats$pre_rating)
avg_opp_pre_ratings <- full_stats %>%
left_join(opp_pre_ratings, on = "opp_player_id") %>%
group_by(player_name) %>%
summarize("avg_opp_pre_rating" = round(mean(opp_pre_rating), 0))
## Joining, by = "opp_player_id"
chess_stats <- chess_stats %>%
left_join(avg_opp_pre_ratings, on = "player_name")
## Joining, by = "player_name"
knitr::kable(head(chess_stats))
| player_name | player_state | total | pre_rating | avg_opp_pre_rating |
|---|---|---|---|---|
| GARY HUA | ON | 6.0 | 1794 | 1605 |
| DAKSHESH DARURI | MI | 6.0 | 1553 | 1469 |
| ADITYA BAJAJ | MI | 6.0 | 1384 | 1564 |
| PATRICK H SCHILLING | MI | 5.5 | 1716 | 1574 |
| HANSHI ZUO | MI | 5.5 | 1655 | 1501 |
| HANSEN SONG | OH | 5.0 | 1686 | 1519 |
write.csv(chess_stats, "C:/Temp/player_stats.csv", row.names = FALSE) # use 'C:/Temp' or replace with desired file path