Github for homework: https://github.com/rweberc/Data607_Project1

Read in data

con <- file("https://raw.githubusercontent.com/rweberc/Data607_Project1/master/rawdata.txt", open="r")
line <- readLines(con)
close(con)

Remove dashed lines

line <- line[!str_detect(line, "^-")]
line[1:5]
## [1] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [2] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [3] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [4] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
## [5] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"

Create data.frame with a single column out of the indidivual ‘lines’

rawData <- data.frame(col1 = line[3:length(line)], stringsAsFactors = FALSE)

# Remove final pipe delimiter
rawData$col1 <- str_replace(rawData$col1, "\\| *$", "")
head(rawData)  
##                                                                                       col1
## 1     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4
## 2    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    
## 3     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7
## 4    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    
## 5     3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12
## 6    MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W

Save out vector of column names

colNames1 <- str_trim(unlist(str_split(line[1], "\\|")))
colNames2 <- str_trim(unlist(str_split(line[2], "\\|")))

colNames <- c(colNames1[1:2], paste(colNames1[3:10], colNames2[3:10], sep=" "))
colNames
##  [1] "Pair"        "Player Name" "Total Pts"   "Round 1"     "Round 2"    
##  [6] "Round 3"     "Round 4"     "Round 5"     "Round 6"     "Round 7"

Split the single column into several columns based on the pipe delimiter

splitDf <- separate(rawData, col1, into = colNames, sep = "\\|", remove = TRUE,
         convert = FALSE, extra = "warn", fill = "warn")
head(splitDf)
##     Pair                       Player Name Total Pts Round 1 Round 2
## 1     1   GARY HUA                             6.0     W  39   W  21
## 2    ON   15445895 / R: 1794   ->1817          N:2     W       B    
## 3     2   DAKSHESH DARURI                      6.0     W  63   W  58
## 4    MI   14598900 / R: 1553   ->1663          N:2     B       W    
## 5     3   ADITYA BAJAJ                         6.0     L   8   W  61
## 6    MI   14959604 / R: 1384   ->1640          N:2     W       B    
##   Round 3 Round 4 Round 5 Round 6 Round 7
## 1   W  18   W  14   W   7   D  12   D   4
## 2   W       B       W       B       W    
## 3   L   4   W  17   W  16   W  20   W   7
## 4   B       W       B       W       B    
## 5   W  25   W  21   W  11   W  13   W  12
## 6   W       B       W       B       W

Trim all columns

splitDf <- splitDf %>% mutate_all(str_trim)

Break this data.frame into two sets, even rows in one set, odd rows in another

topDf <- splitDf[c(TRUE, FALSE), ] # name and rounds data
bottomDf <- splitDf[c(FALSE, TRUE), 1:2] # state and ratings data

Parse state and ratings information

# Add column names for reference
colnames(bottomDf) <- c("PlayerState", "temp")

# Add row id to bottomDf
bottomDf$Pair <- str_trim(topDf$Pair)

# Parse pre/post rating
bottomDf$prePostRating <- str_replace(bottomDf$temp, ".*R: ", "")
bottomDf$prePostRating <- str_replace_all(bottomDf$prePostRating, "P[:digit:]+", "")
bottomDf$preRating <- as.numeric(str_extract(bottomDf$prePostRating, "[:digit:]+"))
bottomDf$postRating <- str_extract(bottomDf$prePostRating, "[:digit:]+$")

# Select off needed columns
bottomDf <- select(bottomDf, Pair, PlayerState, preRating)

head(bottomDf)
##    Pair PlayerState preRating
## 2     1          ON      1794
## 4     2          MI      1553
## 6     3          MI      1384
## 8     4          MI      1716
## 10    5          MI      1655
## 12    6          OH      1686

Parse topDf from “wide” to “long” format to deal with round information

‘Points’ information is parsed here for use to compare later against given “Total Pts”, as a check on the work

roundsDf <- gather(topDf, "Round", "temp", 4:10)
roundsDf$Result <- str_replace(roundsDf$temp, " .*", "")
roundsDf$Opponent <- str_trim(str_extract(roundsDf$temp, "[:digit:]+"))
roundsDf$Points <- ifelse(roundsDf$Result %in% c("W", "B", "X"), 1,
                             if_else(roundsDf$Result %in% c("D", "H"), .5,
                                     0))
head(roundsDf)
##   Pair         Player Name Total Pts   Round  temp Result Opponent Points
## 1    1            GARY HUA       6.0 Round 1 W  39      W       39      1
## 2    2     DAKSHESH DARURI       6.0 Round 1 W  63      W       63      1
## 3    3        ADITYA BAJAJ       6.0 Round 1 L   8      L        8      0
## 4    4 PATRICK H SCHILLING       5.5 Round 1 W  23      W       23      1
## 5    5          HANSHI ZUO       5.5 Round 1 W  45      W       45      1
## 6    6         HANSEN SONG       5.0 Round 1 W  34      W       34      1

Merge in pre-Ratings for opponents

# merge in preRates (note... cases where there was no opponent are dropped here)
roundsDf <- left_join(roundsDf, select(bottomDf, Pair, preRating), by=c("Opponent"="Pair"))

Calculate average opponent pre-rating, as well as a calculated total points

finalDf <- roundsDf %>% 
  group_by(Pair, `Player Name`, `Total Pts`) %>%
  summarize(totalPointsCalc = sum(Points, na.rm = TRUE),
            avgOppPreRating = round(mean(preRating, na.rm = TRUE), digits=0))

stopifnot(all(as.numeric(finalDf$`Total Pts`) == finalDf$totalPointsCalc))

Merge in State and pre-rating values

finalDf <- inner_join(bottomDf, 
                      select(finalDf, c(Pair, `Player Name`, `Total Pts`, avgOppPreRating)),
                      by="Pair")

Clean up column names and save out .csv file

finalDf <- finalDf %>%
  rename(PlayerName = `Player Name`, TotalPts = `Total Pts`, PlayerPreRating = `preRating`,
         AvgOppPreRating = avgOppPreRating) %>%
  select(PlayerName, PlayerState, TotalPts, PlayerPreRating, AvgOppPreRating)

head(finalDf)
##            PlayerName PlayerState TotalPts PlayerPreRating AvgOppPreRating
## 1            GARY HUA          ON      6.0            1794            1605
## 2     DAKSHESH DARURI          MI      6.0            1553            1469
## 3        ADITYA BAJAJ          MI      6.0            1384            1564
## 4 PATRICK H SCHILLING          MI      5.5            1716            1574
## 5          HANSHI ZUO          MI      5.5            1655            1501
## 6         HANSEN SONG          OH      5.0            1686            1519
write.csv(finalDf, file = "/Users/ryanweber/Desktop/CUNY/Data 607 Db/Projects/Project 1/chess.csv",
          row.names = FALSE)