library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
Loading of the tournamentinfo text file
data<-readLines('https://raw.githubusercontent.com/metis-macys-66898/data_607_sp2020/master/tournamentinfo.txt', warn = 'F')
data<-data[data != strrep("-", 90-1)]
#head(data)
#length(data)
data<-data[3:130]
head(data)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [4] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [5] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [6] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
The way that we’re set up is we started to import the text file using readLines as a character vector. Then I’m going to use regexp to parse it into individual character vectors which are the variables that we can use to construct a final dataframe.
# Capturing names like CAMERON WILLIAM MC LEMAN and SEAN M MC CORMICK
# assuming the name could max out the field length allowed, I do not expect a space to be matched in the end for 2-word, 3-word, 4-word names.
name_regex<-"[0-9] \\| [a-zA-Z]+\\s[a-zA-Z]+\\s?([a-zA-Z]+\\s?)*[a-zA-Z]*"
# removing the leading and trailing space along with the leading non-named related string that I wanted to remove
name<- trimws(str_remove_all(unlist(str_extract_all(data, name_regex)), "\\d\\s\\|"))
name
## [1] "GARY HUA" "DAKSHESH DARURI"
## [3] "ADITYA BAJAJ" "PATRICK H SCHILLING"
## [5] "HANSHI ZUO" "HANSEN SONG"
## [7] "GARY DEE SWATHELL" "EZEKIEL HOUGHTON"
## [9] "STEFANO LEE" "ANVIT RAO"
## [11] "CAMERON WILLIAM MC LEMAN" "KENNETH J TACK"
## [13] "TORRANCE HENRY JR" "BRADLEY SHAW"
## [15] "ZACHARY JAMES HOUGHTON" "MIKE NIKITIN"
## [17] "RONALD GRZEGORCZYK" "DAVID SUNDEEN"
## [19] "DIPANKAR ROY" "JASON ZHENG"
## [21] "DINH DANG BUI" "EUGENE L MCCLURE"
## [23] "ALAN BUI" "MICHAEL R ALDRICH"
## [25] "LOREN SCHWIEBERT" "MAX ZHU"
## [27] "GAURAV GIDWANI" "SOFIA ADINA STANESCU"
## [29] "CHIEDOZIE OKORIE" "GEORGE AVERY JONES"
## [31] "RISHI SHETTY" "JOSHUA PHILIP MATHEWS"
## [33] "JADE GE" "MICHAEL JEFFERY THOMAS"
## [35] "JOSHUA DAVID LEE" "SIDDHARTH JHA"
## [37] "AMIYATOSH PWNANANDAM" "BRIAN LIU"
## [39] "JOEL R HENDON" "FOREST ZHANG"
## [41] "KYLE WILLIAM MURPHY" "JARED GE"
## [43] "ROBERT GLEN VASEY" "JUSTIN D SCHILLING"
## [45] "DEREK YAN" "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT" "DANIEL KHAIN"
## [49] "MICHAEL J MARTIN" "SHIVAM JHA"
## [51] "TEJAS AYYAGARI" "ETHAN GUO"
## [53] "JOSE C YBARRA" "LARRY HODGE"
## [55] "ALEX KONG" "MARISA RICCI"
## [57] "MICHAEL LU" "VIRAJ MOHILE"
## [59] "SEAN M MC CORMICK" "JULIA SHEN"
## [61] "JEZZEL FARKAS" "ASHWIN BALAJI"
## [63] "THOMAS JOSEPH HOSMER" "BEN LI"
state_regex<-"\\s{2}[A-Z]{2}"
states<-unlist(str_extract_all(data, state_regex))
states
## [1] " ON" " MI" " MI" " MI" " MI" " OH" " MI" " MI" " ON" " MI"
## [11] " MI" " MI" " MI" " MI" " MI" " MI" " MI" " MI" " MI" " MI"
## [21] " ON" " MI" " ON" " MI" " MI" " ON" " MI" " MI" " MI" " ON"
## [31] " MI" " ON" " MI" " MI" " MI" " MI" " MI" " MI" " MI" " MI"
## [41] " MI" " MI" " MI" " MI" " MI" " MI" " MI" " MI" " MI" " MI"
## [51] " MI" " MI" " MI" " MI" " MI" " MI" " MI" " MI" " MI" " MI"
## [61] " ON" " MI" " MI" " MI"
points_regex<-"\\d\\.\\d"
points<-unlist(str_extract_all(data, points_regex))
points
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" "4.5" "4.5"
## [13] "4.5" "4.5" "4.5" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0"
## [25] "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5"
## [37] "3.5" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "2.5" "2.5"
## [49] "2.5" "2.5" "2.5" "2.5" "2.0" "2.0" "2.0" "2.0" "2.0" "2.0" "2.0" "1.5"
## [61] "1.5" "1.0" "1.0" "1.0"
pre_rating_regex<-"R:\\s+[0-9]{3,}"
pre_rating<-str_remove_all(unlist(str_extract_all(data, pre_rating_regex)), "R:\\s")
pre_rating
## [1] "1794" "1553" "1384" "1716" "1655" "1686" "1649" "1641" "1411" "1365"
## [11] "1712" "1663" "1666" "1610" "1220" "1604" "1629" "1600" "1564" "1595"
## [21] "1563" "1555" "1363" "1229" "1745" "1579" "1552" "1507" "1602" "1522"
## [31] "1494" "1441" "1449" "1399" "1438" "1355" " 980" "1423" "1436" "1348"
## [41] "1403" "1332" "1283" "1199" "1242" " 377" "1362" "1382" "1291" "1056"
## [51] "1011" " 935" "1393" "1270" "1186" "1153" "1092" " 917" " 853" " 967"
## [61] " 955" "1530" "1175" "1163"
uscf_regex<-"[0-9]{8}"
USCF_ID<-unlist(str_extract_all(data, uscf_regex))
USCF_ID
## [1] "15445895" "14598900" "14959604" "12616049" "14601533" "15055204"
## [7] "11146376" "15142253" "14954524" "14150362" "12581589" "12681257"
## [13] "15082995" "10131499" "15619130" "10295068" "10297702" "11342094"
## [19] "14862333" "14529060" "15495066" "12405534" "15030142" "13469010"
## [25] "12486656" "15131520" "14476567" "14882954" "15323285" "12577178"
## [31] "15131618" "14073750" "14691842" "15051807" "14601397" "14773163"
## [37] "15489571" "15108523" "12923035" "14892710" "15761443" "14462326"
## [43] "14101068" "15323504" "15372807" "15490981" "12533115" "14369165"
## [49] "12531685" "14773178" "15205474" "14918803" "12578849" "12836773"
## [55] "15412571" "14679887" "15113330" "14700365" "12841036" "14579262"
## [61] "15771592" "15219542" "15057092" "15006561"
# length(uscf_id)
# this will match anything that is W, L, or D, but only exclude other types from the two ends of the bar (|)
games_regex<-"\\|[W|L|D].+[0-9]{1,2}\\|"
# Having this games_regex_removed ensures we do not take in any B, H, or U
games_regex_removed <- "\\|[^(W|L|D)]\\s{4}"
games<- unlist(str_extract_all(data, games_regex))
games<-str_remove_all(games, games_regex_removed)
# str_extract_all with simplify defaulted to F gives us a list of character vectors, which can then be unlisted to collapse to 64 elements
games<-strsplit(unlist(str_extract_all(data, games_regex)), "\\|")
# after splitting on string literal bar (|), we should end with 64 elements in the list
length(games)
## [1] 64
# creating the list of games for all 64 players
for (i in 1:length(games)){
games[[i]]<-games[[i]][2:length(games[[i]])]
}
To answer the question for getting the average of the Pre Chess Rating of Opponents for players 1 and 37 in the Intro YouTube video, we create a function to get the opponent_scores needed to create the data frame for the final output. Need to use extra caution here as most of the variables are defaulted to factors. I need to explicitly change it back to chracter, numeric, or integers. Note that while writeLines has its merits, I do think read.csv is another standard approach where you can set stringsAsFactors = F to take care of the Factors issues. The key here is to replace the output of opponent_score, i.e. game_opponent_rank (denoted as a 1-to-2-digit string), with the pre_rating of the opponents using the right lookup mechanisms that can be achieved with for loops and basic dataframe operations. populate_opp_scores is the function that gets that job done.
opponent_score <- function(game_num) {
game_outcome <- rep(NA,64)
game_opponent_rank <- rep(NA, 64)
for (i in 1:length(games)){
# this pulls out the 1-character W, L, D
game_outcome[i]<-str_extract(games[[i]][game_num],"[A-Za-z]{1}")
# This gives me the opponent rank of a given game i
game_opponent_rank[i]<-str_extract(games[[i]][game_num],"[0-9]{1,2}")
}
result <- list(game_outcome, game_opponent_rank)
return (result)
}
#opponent_score(1)[[2]]
df <- data.frame(USCF_ID,
name,
states,
points,
pre_rating,
opponent_score(1)[[2]],
opponent_score(2)[[2]],
opponent_score(3)[[2]],
opponent_score(4)[[2]],
opponent_score(5)[[2]],
opponent_score(6)[[2]],
opponent_score(7)[[2]]
)
#df
#head(df,2)
#Renaming opponent_score for the 7 columns
names(df) <- c("USCF_ID", "name", "states", "points", "pre_rating", "opp_rating_1", "opp_rating_2", "opp_rating_3", "opp_rating_4", "opp_rating_5", "opp_rating_6", "opp_rating_7")
names(df)
## [1] "USCF_ID" "name" "states" "points" "pre_rating"
## [6] "opp_rating_1" "opp_rating_2" "opp_rating_3" "opp_rating_4" "opp_rating_5"
## [11] "opp_rating_6" "opp_rating_7"
#str(df)
# points scored needed to be in numeric
df$points<-as.numeric(as.character(df$points))
# USCF_ID along with all the opponent scores, and pre_Ratings are supposed to be all integers
cols <- c(names(df[, 5:12]), "USCF_ID")
df[,cols] <- apply(df[,cols], 2, function(x) as.integer(as.character(x)))
#str(df)
#df
char_cols <- c("name", "states")
df[,char_cols] <- apply(df[,char_cols], 2, function(x) as.character(x))
#col_names <- names(df[,6:12])
opp_ratings<- c('opp_rating_1', 'opp_rating_2', 'opp_rating_3', 'opp_rating_4', 'opp_rating_5', 'opp_rating_6', 'opp_rating_7')
populate_opp_scores <- function (df, cols){
for (j in opp_ratings) {
for (i in 1:nrow(df)){
df[[j]][i]<- df$pre_rating[df[[j]][i]]
}
}
return (df)
}
df<- populate_opp_scores(df, opp_ratings)
#df
#df[37,]
# calculating row-level means using rowMeans
df$average_opponent_rating <- rowMeans(df[,opp_ratings], na.rm = T)
#df
# Do not need to use group_by if the dataset is large.
final_df <- select( df, -USCF_ID, -all_of(opp_ratings) ) %>% group_by(name, states, points, pre_rating) %>% arrange(desc(average_opponent_rating))
final_df
## # A tibble: 64 x 5
## # Groups: name, states, points, pre_rating [64]
## name states points pre_rating average_opponent_rating
## <chr> <chr> <dbl> <int> <dbl>
## 1 GARY HUA " ON" 6 1794 1605.
## 2 PATRICK H SCHILLING " MI" 5.5 1716 1574.
## 3 ADITYA BAJAJ " MI" 6 1384 1564.
## 4 ANVIT RAO " MI" 5 1365 1554.
## 5 BRIAN LIU " MI" 3 1423 1539.
## 6 STEFANO LEE " ON" 5 1411 1523.
## 7 SOFIA ADINA STANESCU " MI" 3.5 1507 1522.
## 8 HANSEN SONG " OH" 5 1686 1519.
## 9 BRADLEY SHAW " MI" 4.5 1610 1515
## 10 MAX ZHU " ON" 3.5 1579 1507.
## # … with 54 more rows
#head(final_df)
Writing out the file to csv to complete the task
write.csv(final_df, "project_1_DataFrame_DP.csv")
cur_dir<-getwd()
list.files(cur_dir)
## [1] "proj1_data607.html" "proj1_data607.Rmd"
## [3] "Project 1.pdf" "project_1_DataFrame_DP.csv"
## [5] "rsconnect" "tournamentinfo.txt"
Bonus Track:
Furthermore, I’d like to examine the tournament results by comparing the average_opponent_rating and pre_rating (pre-tournament rating) and see if there is an origin that has a bigger difference than the others. It turns out there are only 3 states, missouri (MI), Ohio (OH), and Ontario, Canada (ON) that we have data for. OH has only 1 count of non-NA. I’m going to exclude it from comparison. It turns out that ON has a higher net tournament results, defined by pre-tournament rating minus average opponent rating.
final_df2<-final_df
final_df2$tournament_results <- final_df$pre_rating - final_df$average_opponent_rating
#final_df2
boxplot( tournament_results ~ states, data = final_df2)

#str(final_df2)
final_df2 %>% group_by(states) %>% summarize(counts = n(), na.rm = T)
## # A tibble: 3 x 3
## states counts na.rm
## <chr> <int> <lgl>
## 1 " MI" 55 TRUE
## 2 " OH" 1 TRUE
## 3 " ON" 8 TRUE