library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidyr)

Loading of the tournamentinfo text file

data<-readLines('https://raw.githubusercontent.com/metis-macys-66898/data_607_sp2020/master/tournamentinfo.txt', warn = 'F')


data<-data[data != strrep("-", 90-1)]

#head(data)
#length(data)

data<-data[3:130]
head(data)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [3] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [4] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [5] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [6] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

The way that we’re set up is we started to import the text file using readLines as a character vector. Then I’m going to use regexp to parse it into individual character vectors which are the variables that we can use to construct a final dataframe.

# Capturing names like CAMERON WILLIAM MC LEMAN and SEAN M MC CORMICK
# assuming the name could max out the field length allowed, I do not expect a space to be matched in the end for 2-word, 3-word, 4-word names. 

name_regex<-"[0-9] \\| [a-zA-Z]+\\s[a-zA-Z]+\\s?([a-zA-Z]+\\s?)*[a-zA-Z]*"
# removing the leading and trailing space along with the leading non-named related string that I wanted to remove
name<- trimws(str_remove_all(unlist(str_extract_all(data, name_regex)), "\\d\\s\\|"))
name
##  [1] "GARY HUA"                 "DAKSHESH DARURI"         
##  [3] "ADITYA BAJAJ"             "PATRICK H SCHILLING"     
##  [5] "HANSHI ZUO"               "HANSEN SONG"             
##  [7] "GARY DEE SWATHELL"        "EZEKIEL HOUGHTON"        
##  [9] "STEFANO LEE"              "ANVIT RAO"               
## [11] "CAMERON WILLIAM MC LEMAN" "KENNETH J TACK"          
## [13] "TORRANCE HENRY JR"        "BRADLEY SHAW"            
## [15] "ZACHARY JAMES HOUGHTON"   "MIKE NIKITIN"            
## [17] "RONALD GRZEGORCZYK"       "DAVID SUNDEEN"           
## [19] "DIPANKAR ROY"             "JASON ZHENG"             
## [21] "DINH DANG BUI"            "EUGENE L MCCLURE"        
## [23] "ALAN BUI"                 "MICHAEL R ALDRICH"       
## [25] "LOREN SCHWIEBERT"         "MAX ZHU"                 
## [27] "GAURAV GIDWANI"           "SOFIA ADINA STANESCU"    
## [29] "CHIEDOZIE OKORIE"         "GEORGE AVERY JONES"      
## [31] "RISHI SHETTY"             "JOSHUA PHILIP MATHEWS"   
## [33] "JADE GE"                  "MICHAEL JEFFERY THOMAS"  
## [35] "JOSHUA DAVID LEE"         "SIDDHARTH JHA"           
## [37] "AMIYATOSH PWNANANDAM"     "BRIAN LIU"               
## [39] "JOEL R HENDON"            "FOREST ZHANG"            
## [41] "KYLE WILLIAM MURPHY"      "JARED GE"                
## [43] "ROBERT GLEN VASEY"        "JUSTIN D SCHILLING"      
## [45] "DEREK YAN"                "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT"              "DANIEL KHAIN"            
## [49] "MICHAEL J MARTIN"         "SHIVAM JHA"              
## [51] "TEJAS AYYAGARI"           "ETHAN GUO"               
## [53] "JOSE C YBARRA"            "LARRY HODGE"             
## [55] "ALEX KONG"                "MARISA RICCI"            
## [57] "MICHAEL LU"               "VIRAJ MOHILE"            
## [59] "SEAN M MC CORMICK"        "JULIA SHEN"              
## [61] "JEZZEL FARKAS"            "ASHWIN BALAJI"           
## [63] "THOMAS JOSEPH HOSMER"     "BEN LI"
state_regex<-"\\s{2}[A-Z]{2}"
states<-unlist(str_extract_all(data, state_regex))
states
##  [1] "  ON" "  MI" "  MI" "  MI" "  MI" "  OH" "  MI" "  MI" "  ON" "  MI"
## [11] "  MI" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI"
## [21] "  ON" "  MI" "  ON" "  MI" "  MI" "  ON" "  MI" "  MI" "  MI" "  ON"
## [31] "  MI" "  ON" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI"
## [41] "  MI" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI"
## [51] "  MI" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI" "  MI"
## [61] "  ON" "  MI" "  MI" "  MI"
points_regex<-"\\d\\.\\d"
points<-unlist(str_extract_all(data, points_regex))
points
##  [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" "4.5" "4.5"
## [13] "4.5" "4.5" "4.5" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0"
## [25] "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5"
## [37] "3.5" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "2.5" "2.5"
## [49] "2.5" "2.5" "2.5" "2.5" "2.0" "2.0" "2.0" "2.0" "2.0" "2.0" "2.0" "1.5"
## [61] "1.5" "1.0" "1.0" "1.0"
pre_rating_regex<-"R:\\s+[0-9]{3,}"
pre_rating<-str_remove_all(unlist(str_extract_all(data, pre_rating_regex)), "R:\\s")
pre_rating
##  [1] "1794" "1553" "1384" "1716" "1655" "1686" "1649" "1641" "1411" "1365"
## [11] "1712" "1663" "1666" "1610" "1220" "1604" "1629" "1600" "1564" "1595"
## [21] "1563" "1555" "1363" "1229" "1745" "1579" "1552" "1507" "1602" "1522"
## [31] "1494" "1441" "1449" "1399" "1438" "1355" " 980" "1423" "1436" "1348"
## [41] "1403" "1332" "1283" "1199" "1242" " 377" "1362" "1382" "1291" "1056"
## [51] "1011" " 935" "1393" "1270" "1186" "1153" "1092" " 917" " 853" " 967"
## [61] " 955" "1530" "1175" "1163"
uscf_regex<-"[0-9]{8}"
USCF_ID<-unlist(str_extract_all(data, uscf_regex))
USCF_ID
##  [1] "15445895" "14598900" "14959604" "12616049" "14601533" "15055204"
##  [7] "11146376" "15142253" "14954524" "14150362" "12581589" "12681257"
## [13] "15082995" "10131499" "15619130" "10295068" "10297702" "11342094"
## [19] "14862333" "14529060" "15495066" "12405534" "15030142" "13469010"
## [25] "12486656" "15131520" "14476567" "14882954" "15323285" "12577178"
## [31] "15131618" "14073750" "14691842" "15051807" "14601397" "14773163"
## [37] "15489571" "15108523" "12923035" "14892710" "15761443" "14462326"
## [43] "14101068" "15323504" "15372807" "15490981" "12533115" "14369165"
## [49] "12531685" "14773178" "15205474" "14918803" "12578849" "12836773"
## [55] "15412571" "14679887" "15113330" "14700365" "12841036" "14579262"
## [61] "15771592" "15219542" "15057092" "15006561"
# length(uscf_id)


# this will match anything that is W, L, or D, but only exclude other types from the two ends of the bar (|)
games_regex<-"\\|[W|L|D].+[0-9]{1,2}\\|"
# Having this games_regex_removed ensures we do not take in any B, H, or U
games_regex_removed <- "\\|[^(W|L|D)]\\s{4}"
games<- unlist(str_extract_all(data, games_regex))
games<-str_remove_all(games, games_regex_removed)
# str_extract_all with simplify defaulted to F gives us a list of character vectors, which can then be unlisted to collapse to 64 elements
games<-strsplit(unlist(str_extract_all(data, games_regex)), "\\|")
# after splitting on string literal bar (|), we should end with 64 elements in the list
length(games)
## [1] 64
# creating the list of games for all 64 players
for (i in 1:length(games)){
  games[[i]]<-games[[i]][2:length(games[[i]])]

}

To answer the question for getting the average of the Pre Chess Rating of Opponents for players 1 and 37 in the Intro YouTube video, we create a function to get the opponent_scores needed to create the data frame for the final output. Need to use extra caution here as most of the variables are defaulted to factors. I need to explicitly change it back to chracter, numeric, or integers. Note that while writeLines has its merits, I do think read.csv is another standard approach where you can set stringsAsFactors = F to take care of the Factors issues. The key here is to replace the output of opponent_score, i.e. game_opponent_rank (denoted as a 1-to-2-digit string), with the pre_rating of the opponents using the right lookup mechanisms that can be achieved with for loops and basic dataframe operations. populate_opp_scores is the function that gets that job done.

opponent_score <- function(game_num) {
  game_outcome <- rep(NA,64)
  game_opponent_rank <- rep(NA, 64)
  
for (i in 1:length(games)){
    # this pulls out the 1-character W, L, D 
    game_outcome[i]<-str_extract(games[[i]][game_num],"[A-Za-z]{1}") 
    # This gives me the opponent rank of a given game i
    game_opponent_rank[i]<-str_extract(games[[i]][game_num],"[0-9]{1,2}")
    
}
  result <- list(game_outcome, game_opponent_rank)
  return (result)
}

#opponent_score(1)[[2]]

df <- data.frame(USCF_ID, 
                 name, 
                 states,
                 points, 
                 pre_rating, 
                 opponent_score(1)[[2]], 
                 opponent_score(2)[[2]],
                 opponent_score(3)[[2]], 
                 opponent_score(4)[[2]],
                 opponent_score(5)[[2]], 
                 opponent_score(6)[[2]],
                 opponent_score(7)[[2]]
                )
#df
#head(df,2)
#Renaming opponent_score for the 7 columns 
names(df) <- c("USCF_ID", "name", "states", "points", "pre_rating", "opp_rating_1", "opp_rating_2", "opp_rating_3", "opp_rating_4", "opp_rating_5", "opp_rating_6", "opp_rating_7")
names(df)
##  [1] "USCF_ID"      "name"         "states"       "points"       "pre_rating"  
##  [6] "opp_rating_1" "opp_rating_2" "opp_rating_3" "opp_rating_4" "opp_rating_5"
## [11] "opp_rating_6" "opp_rating_7"
#str(df)
# points scored needed to be in numeric 
df$points<-as.numeric(as.character(df$points))
# USCF_ID along with all the opponent scores, and pre_Ratings are supposed to be all integers
cols <- c(names(df[, 5:12]), "USCF_ID")
df[,cols] <- apply(df[,cols], 2, function(x) as.integer(as.character(x)))

#str(df)
#df

char_cols <- c("name", "states")
df[,char_cols] <- apply(df[,char_cols], 2, function(x) as.character(x))

#col_names <- names(df[,6:12])
opp_ratings<- c('opp_rating_1', 'opp_rating_2', 'opp_rating_3', 'opp_rating_4', 'opp_rating_5', 'opp_rating_6', 'opp_rating_7')

populate_opp_scores <- function (df, cols){
for (j in opp_ratings) {
  for (i in 1:nrow(df)){
    df[[j]][i]<- df$pre_rating[df[[j]][i]]
  }
  
}
  return (df)
}



df<- populate_opp_scores(df, opp_ratings)
#df
#df[37,]

# calculating row-level means using rowMeans
df$average_opponent_rating <- rowMeans(df[,opp_ratings], na.rm = T)

#df
# Do not need to use group_by if the dataset is large. 
final_df <- select( df, -USCF_ID, -all_of(opp_ratings) ) %>% group_by(name, states, points, pre_rating) %>%  arrange(desc(average_opponent_rating))

final_df
## # A tibble: 64 x 5
## # Groups:   name, states, points, pre_rating [64]
##    name                 states points pre_rating average_opponent_rating
##    <chr>                <chr>   <dbl>      <int>                   <dbl>
##  1 GARY HUA             "  ON"    6         1794                   1605.
##  2 PATRICK H SCHILLING  "  MI"    5.5       1716                   1574.
##  3 ADITYA BAJAJ         "  MI"    6         1384                   1564.
##  4 ANVIT RAO            "  MI"    5         1365                   1554.
##  5 BRIAN LIU            "  MI"    3         1423                   1539.
##  6 STEFANO LEE          "  ON"    5         1411                   1523.
##  7 SOFIA ADINA STANESCU "  MI"    3.5       1507                   1522.
##  8 HANSEN SONG          "  OH"    5         1686                   1519.
##  9 BRADLEY SHAW         "  MI"    4.5       1610                   1515 
## 10 MAX ZHU              "  ON"    3.5       1579                   1507.
## # … with 54 more rows
#head(final_df) 

Writing out the file to csv to complete the task

write.csv(final_df, "project_1_DataFrame_DP.csv")
cur_dir<-getwd()
list.files(cur_dir)
## [1] "proj1_data607.html"         "proj1_data607.Rmd"         
## [3] "Project 1.pdf"              "project_1_DataFrame_DP.csv"
## [5] "rsconnect"                  "tournamentinfo.txt"

Bonus Track:

Furthermore, I’d like to examine the tournament results by comparing the average_opponent_rating and pre_rating (pre-tournament rating) and see if there is an origin that has a bigger difference than the others. It turns out there are only 3 states, missouri (MI), Ohio (OH), and Ontario, Canada (ON) that we have data for. OH has only 1 count of non-NA. I’m going to exclude it from comparison. It turns out that ON has a higher net tournament results, defined by pre-tournament rating minus average opponent rating.

final_df2<-final_df
final_df2$tournament_results <- final_df$pre_rating - final_df$average_opponent_rating
#final_df2


boxplot( tournament_results ~ states, data = final_df2)

#str(final_df2)
final_df2 %>%  group_by(states) %>% summarize(counts  = n(), na.rm = T)
## # A tibble: 3 x 3
##   states counts na.rm
##   <chr>   <int> <lgl>
## 1 "  MI"     55 TRUE 
## 2 "  OH"      1 TRUE 
## 3 "  ON"      8 TRUE