library(knitr)
# Chunk defaults: tidy the echoed code and pass width.cutoff = 60 as the tidy
# option.
opts_chunk$set(tidy = TRUE, tidy.opts = list(width.cutoff = 60))

load data and perform basic clean-up

library(stringr)
# Download the raw tournament table; header = FALSE keeps every line of the
# file as data.
tournament.raw <- read.csv(url("https://raw.githubusercontent.com/agCS/DATA607/master/tournamentinfo.txt"), 
    header = FALSE, stringsAsFactors = FALSE)
# after reviewing data using head() and str(), remove every character that is
# not alphanumeric, '|', space, '/', '.', or in the ':' to '>' range.
# NOTE(review): the whole data frame is handed to str_replace_all(), so this
# step relies on it being coerced to character -- confirm this is intended.
tour.clean <- str_replace_all(tournament.raw, pattern = "[^[:alnum:]\\| /:->.]", 
    replacement = "")
# split at '|'; 'simplify = TRUE' returns a character matrix instead of a list
tour.split <- str_split(tour.clean, "\\|", simplify = TRUE)
# the split leaves one extra element after the final '|'; drop it by position
# from the end rather than by a hard-coded index
tour.split <- tour.split[-length(tour.split)]
# each output row is built from 20 consecutive '|'-separated fields; fail
# loudly instead of letting matrix() recycle values if the input shape changes
fields.per.player <- 20
stopifnot(length(tour.split)%%fields.per.player == 0)
tour.final <- as.data.frame(matrix(tour.split, ncol = fields.per.player, 
    byrow = TRUE), stringsAsFactors = FALSE)
# remove the first row so it can be replaced with clean column names
tour.final <- tour.final[-1, ]
colnames(tour.final) <- c("Pair", "Name", "Total pts", "Round 1", 
    "Round 2", "Round 3", "Round 4", "Round 5", "Round 6", "Round 7", 
    "State", "UCSF / RTG (Pre->Post)", "Total pts.1", "Round 1.1", 
    "Round 2.1", "Round 3.1", "Round 4.1", "Round 5.1", "Round 6.1", 
    "Round 7.1")

extract pre-rating and add as a new column

# Step 1: capture the 'R: <3-4 digits>' fragment of the rating field as
# pre.rating. transform() is used here on purpose: it rebuilds the data frame,
# which is what turns the column names into syntactic ones (e.g. 'Round 1'
# becomes Round.1).
tour.final <- transform(tour.final, pre.rating = str_extract(`UCSF / RTG (Pre->Post)`, 
    "R: *[[:digit:]]{3,4}"))
# Step 2: keep only the digits of that fragment, stored as a number.
tour.final$pre.rating.adj <- as.numeric(str_extract(tour.final$pre.rating, 
    "\\d{3,4}"))

extract opponents and add as new columns

# The first run of digits in each round field is taken as the opponent's pair
# number; fields without digits yield NA.
opp.pattern <- "[[:digit:]]+"
round.cols <- paste0("Round.", 1:7)
opp.cols <- paste0("Opp", 1:7)
# Extract and convert all seven rounds in one pass: Round.k feeds Oppk, and
# the new columns are appended in order Opp1..Opp7 as numeric.
tour.final[opp.cols] <- lapply(tour.final[round.cols], function(round.result) {
    as.numeric(str_extract(round.result, opp.pattern))
})
# Pair must be numeric as well so it can be compared with the Opp columns.
tour.final$Pair <- as.numeric(tour.final$Pair)

create a column with opponents’ average pre-rating

# For each player, average the pre-ratings of the rows whose Pair number
# appears in that player's Opp1..Opp7 columns.
opp.cols <- paste0("Opp", 1:7)
# seq_len() keeps this correct for an empty table (1:0 would iterate twice).
avg <- vapply(seq_len(nrow(tour.final)), function(i) {
    opp.ids <- unlist(tour.final[i, opp.cols], use.names = FALSE)
    # rounds without an opponent are NA; drop them explicitly so they can
    # never match a row
    opp.ids <- opp.ids[!is.na(opp.ids)]
    # a player with no opponents at all gets NaN (mean of an empty vector)
    mean(tour.final$pre.rating.adj[tour.final$Pair %in% opp.ids])
}, numeric(1))
tour.final <- cbind(tour.final, avg)

create subset with final attributes

# Assemble the report table. cbind() on plain vectors yields a character
# matrix, so the numeric columns are stored as text; trimws() then strips the
# padding around every cell.
final <- trimws(cbind(
    "Player's Name" = tour.final$Name,
    "Player's State" = tour.final$State,
    "Total Number of Points" = tour.final$Total.pts,
    "Player's Pre-Rating" = tour.final$pre.rating.adj,
    "Average Pre Chess Rating of Opponents" = tour.final$avg
))
head(final)
##      Player's Name         Player's State Total Number of Points
## [1,] "GARY HUA"            "ON"           "6.0"                 
## [2,] "DAKSHESH DARURI"     "MI"           "6.0"                 
## [3,] "ADITYA BAJAJ"        "MI"           "6.0"                 
## [4,] "PATRICK H SCHILLING" "MI"           "5.5"                 
## [5,] "HANSHI ZUO"          "MI"           "5.5"                 
## [6,] "HANSEN SONG"         "OH"           "5.0"                 
##      Player's Pre-Rating Average Pre Chess Rating of Opponents
## [1,] "1794"              "1605.28571428571"                   
## [2,] "1553"              "1469.28571428571"                   
## [3,] "1384"              "1563.57142857143"                   
## [4,] "1716"              "1573.57142857143"                   
## [5,] "1655"              "1500.85714285714"                   
## [6,] "1686"              "1518.71428571429"

write .csv

# Write the table as comma-separated text with a header row, no row names and
# no quoting (the settings write.csv would apply).
# NOTE(review): the file name has no '.csv' extension -- confirm this is
# intended.
write.table(final, file = "Data607 Project 1", sep = ",", dec = ".", 
    qmethod = "double", quote = FALSE, row.names = FALSE, col.names = TRUE)