Background

The assignment tasks students with creating an R Markdown file that reads a text file of chess tournament results and produces a .csv file for further analysis.

Import and Clean Data

# stringr provides the str_* helpers used throughout this script.
library(stringr)

# Raw tournament cross-table, pinned to a specific commit so the input is stable.
chess_file <- "https://raw.githubusercontent.com/evanmclaughlin/ECM607/9e2ce75e6992b76c132019c38ce9c51eb0660570/tournamentinfo.txt"

chess_data <- readLines(chess_file, warn = FALSE)

# Drop the dashed separator lines, skip the first two remaining lines
# (presumably the column headers -- confirm against the raw file), and split
# every remaining line into its "|"-delimited fields.
chess_data <- chess_data[!str_detect(chess_data, "\\-----")]
# Negative indexing stays correct even when fewer than three lines remain,
# whereas 3:length(x) would count backwards in that case.
chess_data <- str_split(chess_data[-(1:2)], "\\|")

head(chess_data)
## [[1]]
##  [1] "    1 "                            " GARY HUA                        "
##  [3] "6.0  "                             "W  39"                            
##  [5] "W  21"                             "W  18"                            
##  [7] "W  14"                             "W   7"                            
##  [9] "D  12"                             "D   4"                            
## [11] ""                                 
## 
## [[2]]
##  [1] "   ON "                            " 15445895 / R: 1794   ->1817     "
##  [3] "N:2  "                             "W    "                            
##  [5] "B    "                             "W    "                            
##  [7] "B    "                             "W    "                            
##  [9] "B    "                             "W    "                            
## [11] ""                                 
## 
## [[3]]
##  [1] "    2 "                            " DAKSHESH DARURI                 "
##  [3] "6.0  "                             "W  63"                            
##  [5] "W  58"                             "L   4"                            
##  [7] "W  17"                             "W  16"                            
##  [9] "W  20"                             "W   7"                            
## [11] ""                                 
## 
## [[4]]
##  [1] "   MI "                            " 14598900 / R: 1553   ->1663     "
##  [3] "N:2  "                             "B    "                            
##  [5] "W    "                             "B    "                            
##  [7] "W    "                             "B    "                            
##  [9] "W    "                             "B    "                            
## [11] ""                                 
## 
## [[5]]
##  [1] "    3 "                            " ADITYA BAJAJ                    "
##  [3] "6.0  "                             "L   8"                            
##  [5] "W  61"                             "W  25"                            
##  [7] "W  21"                             "W  11"                            
##  [9] "W  13"                             "W  12"                            
## [11] ""                                 
## 
## [[6]]
##  [1] "   MI "                            " 14959604 / R: 1384   ->1640     "
##  [3] "N:2  "                             "W    "                            
##  [5] "B    "                             "W    "                            
##  [7] "B    "                             "W    "                            
##  [9] "B    "                             "W    "                            
## [11] ""

Extract appropriate data for subsetting

# str_extract_all() expects an atomic character vector. chess_data is a list
# (one character vector of fields per line), so flatten it once up front
# instead of letting stringr deparse each list element with a warning.
chess_fields <- unlist(chess_data)

#start with names
chess_ext <- str_extract_all(chess_fields, "[[:alpha:]-?[:alpha:]  ?]{2,}")
ext_flat <- unlist(chess_ext)
# Anything containing a run of three or more letters is treated as a name.
has_word <- str_detect(ext_flat, "[[:alpha:]]{3,}")
chess_names <- ext_flat[has_word]
#head(chess_names)

#next, extract state
# Two adjacent letters but no three-letter run: the two-letter code that
# leads each second row.
chess_state <- ext_flat[str_detect(ext_flat, "[[:alpha:]]{2}") & !has_word]
#head(chess_state)

#Total points next, but first extract all numerical data for subset
chess_num <- str_extract_all(chess_fields, "\\d{1,}+\\.?.?")
num_flat <- unlist(chess_num)
# Only the total-points field has the digit-dot-digit shape (e.g. "6.0").
points <- num_flat[str_detect(num_flat, "\\d\\.\\d")]

#head(points)

# now for pre-rating

rating <- str_extract_all(chess_fields, "(( \\:)|(\\>))?.?\\d{1,}P*\\.?")
rating_flat <- unlist(rating)
# Keep stand-alone 3-4 digit numbers (optionally suffixed with "P") and drop
# the ones introduced by ">", i.e. the value after the "->" arrow.
is_rating <- str_detect(rating_flat, "\\b\\d{3,4}P?\\b")
is_post_rating <- str_detect(rating_flat, "\\>.?\\b\\d{3,4}P?\\b")
pre_rating <- rating_flat[is_rating & !is_post_rating]
pre_rating <- str_replace_all(pre_rating, "P", "")
#head(pre_rating)

names <- str_trim(chess_names, "both")
state <- str_trim(chess_state, "both")
pre_rating <- str_trim(pre_rating, "both")

# cbind() silently recycles shorter vectors, so fail loudly if any extraction
# above missed or duplicated a player.
stopifnot(
  length(state) == length(names),
  length(points) == length(names),
  length(pre_rating) == length(names)
)

# Number players by position instead of assuming exactly 64 entrants.
id <- seq_along(names)
rankings <- as.data.frame(cbind(id, names, state, points, pre_rating))

head(rankings)
##   id               names state points pre_rating
## 1  1            GARY HUA    ON    6.0       1794
## 2  2     DAKSHESH DARURI    MI    6.0       1553
## 3  3        ADITYA BAJAJ    MI    6.0       1384
## 4  4 PATRICK H SCHILLING    MI    5.5       1716
## 5  5          HANSHI ZUO    MI    5.5       1655
## 6  6         HANSEN SONG    OH    5.0       1686

Now that we have our data extracted, we can extract the opponent data for the calculated field

#extract opponent data now, pulling in games played
# chess_data alternates between a player row and that player's state/rating
# row (see the head(chess_data) output), so the odd-numbered elements are the
# player rows. Selecting them by position keeps games_info aligned one-to-one
# with rankings even for a player who has no played games.
is_player_row <- seq_along(chess_data) %% 2 == 1
games_info <- vapply(
  chess_data[is_player_row],
  function(fields) {
    # A played round looks like "W  39": a W/D/L result followed by the
    # opponent's pair number. Other fields carry no number after the letter
    # and are skipped.
    played <- fields[str_detect(fields, "^[WDL]\\s*\\d+")]
    # One space-separated string of opponent numbers per player; "" when the
    # player faced nobody.
    paste(str_extract(played, "\\d+"), collapse = " ")
  },
  character(1)
)

head(games_info)
## [1] "39 21 18 14 7 12 4"   "63 58 4 17 16 20 7"   "8 61 25 21 11 13 12" 
## [4] "23 28 2 26 5 19 1"    "45 37 12 13 4 14 17"  "34 29 11 35 10 27 21"

Now that the data is cleaned, we can focus on calculating the average opponent rating

# rankings was assembled from a character matrix; normalise every column to
# plain character so the numeric conversions below behave the same whether or
# not the columns arrived as factors.
rankings[] <- lapply(rankings, as.character)

# Strip the ">" and "P" rating annotations from pre_rating ONLY. Running the
# substitution over every column also deleted the letter "P" from player
# names (e.g. "PATRICK" became "ATRICK") and from any other column holding it.
rankings$pre_rating <- gsub("[>P]", "", rankings$pre_rating)

#make calculation fields numeric
rankings$points <- as.numeric(rankings$points)
rankings$pre_rating <- as.numeric(rankings$pre_rating)

# For each player, look up the pre-tournament rating of every opponent listed
# in games_info and average them. A player with no opponents yields NaN.
matches <- vapply(
  seq_len(nrow(rankings)),
  function(i) {
    # games_info[i] is a string holding the opponents' pair numbers; those
    # numbers double as row indices into rankings.
    outcomes <- as.numeric(unlist(str_extract_all(games_info[i], "\\d+")))
    mean(rankings$pre_rating[outcomes])
  },
  numeric(1)
)

Now let's organize our data one more time before printing

# Attach each player's average opponent pre-rating as a new column.
rankings[["avg_rating"]] <- matches
# Preview the first rows of the finished table.
utils::head(rankings)
##   id              names state points pre_rating avg_rating
## 1  1           GARY HUA    ON    6.0       1794   1605.286
## 2  2    DAKSHESH DARURI    MI    6.0       1553   1469.286
## 3  3       ADITYA BAJAJ    MI    6.0       1384   1563.571
## 4  4 ATRICK H SCHILLING    MI    5.5       1716   1573.571
## 5  5         HANSHI ZUO    MI    5.5       1655   1500.857
## 6  6        HANSEN SONG    OH    5.0       1686   1518.714
# Persist the table (row names included) to the working directory.
utils::write.csv(x = rankings, file = "chess_final.csv", row.names = TRUE)