Data607_Assignment03

Read in data
Remove dashed lines
Create data.frame with a single column out of the indidivual ‘lines’
Save out vector of column names
Split the single column into several columns based on the pipe delimiter
Trim all columns
Break this data.frame into two sets, even rows in one set, odd rows in another
Parse state and ratings information
Parse topDf from “wide” to “long” format to deal with round information
Merge in pre-Ratings for opponents
Calculate average opponent pre-rating, as well as a calculated total points
Merge in State and pre-rating values
Clean up column names and save out .csv file

Github for homework: https://github.com/rweberc/Data607_Project1

Read in data

con <- file("https://raw.githubusercontent.com/rweberc/Data607_Project1/master/rawdata.txt", open="r")
line <- readLines(con)
close(con)

Remove dashed lines

line <- line[!str_detect(line, "^-")]
line[1:5]

## [1] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [2] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [3] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [4] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
## [5] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"

Create data.frame with a single column out of the indidivual ‘lines’

rawData <- data.frame(col1 = line[3:length(line)], stringsAsFactors = FALSE)

# Remove final pipe delimiter
rawData$col1 <- str_replace(rawData$col1, "\\| *$", "")
head(rawData)

##                                                                                       col1
## 1     1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4
## 2    ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    
## 3     2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7
## 4    MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    
## 5     3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12
## 6    MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W

Save out vector of column names

colNames1 <- str_trim(unlist(str_split(line[1], "\\|")))
colNames2 <- str_trim(unlist(str_split(line[2], "\\|")))

colNames <- c(colNames1[1:2], paste(colNames1[3:10], colNames2[3:10], sep=" "))
colNames

##  [1] "Pair"        "Player Name" "Total Pts"   "Round 1"     "Round 2"    
##  [6] "Round 3"     "Round 4"     "Round 5"     "Round 6"     "Round 7"

Split the single column into several columns based on the pipe delimiter

splitDf <- separate(rawData, col1, into = colNames, sep = "\\|", remove = TRUE,
         convert = FALSE, extra = "warn", fill = "warn")
head(splitDf)

##     Pair                       Player Name Total Pts Round 1 Round 2
## 1     1   GARY HUA                             6.0     W  39   W  21
## 2    ON   15445895 / R: 1794   ->1817          N:2     W       B    
## 3     2   DAKSHESH DARURI                      6.0     W  63   W  58
## 4    MI   14598900 / R: 1553   ->1663          N:2     B       W    
## 5     3   ADITYA BAJAJ                         6.0     L   8   W  61
## 6    MI   14959604 / R: 1384   ->1640          N:2     W       B    
##   Round 3 Round 4 Round 5 Round 6 Round 7
## 1   W  18   W  14   W   7   D  12   D   4
## 2   W       B       W       B       W    
## 3   L   4   W  17   W  16   W  20   W   7
## 4   B       W       B       W       B    
## 5   W  25   W  21   W  11   W  13   W  12
## 6   W       B       W       B       W

Trim all columns

splitDf <- splitDf %>% mutate_all(str_trim)

Break this data.frame into two sets, even rows in one set, odd rows in another

topDf <- splitDf[c(TRUE, FALSE), ] # name and rounds data
bottomDf <- splitDf[c(FALSE, TRUE), 1:2] # state and ratings data

Parse state and ratings information

# Add column names for reference
colnames(bottomDf) <- c("PlayerState", "temp")

# Add row id to bottomDf
bottomDf$Pair <- str_trim(topDf$Pair)

# Parse pre/post rating
bottomDf$prePostRating <- str_replace(bottomDf$temp, ".*R: ", "")
bottomDf$prePostRating <- str_replace_all(bottomDf$prePostRating, "P[:digit:]+", "")
bottomDf$preRating <- as.numeric(str_extract(bottomDf$prePostRating, "[:digit:]+"))
bottomDf$postRating <- str_extract(bottomDf$prePostRating, "[:digit:]+$")

# Select off needed columns
bottomDf <- select(bottomDf, Pair, PlayerState, preRating)

head(bottomDf)

##    Pair PlayerState preRating
## 2     1          ON      1794
## 4     2          MI      1553
## 6     3          MI      1384
## 8     4          MI      1716
## 10    5          MI      1655
## 12    6          OH      1686

Parse topDf from “wide” to “long” format to deal with round information

‘Points’ information is parsed here for use to compare later against given “Total Pts”, as a check on the work

roundsDf <- gather(topDf, "Round", "temp", 4:10)
roundsDf$Result <- str_replace(roundsDf$temp, " .*", "")
roundsDf$Opponent <- str_trim(str_extract(roundsDf$temp, "[:digit:]+"))
roundsDf$Points <- ifelse(roundsDf$Result %in% c("W", "B", "X"), 1,
                             if_else(roundsDf$Result %in% c("D", "H"), .5,
                                     0))
head(roundsDf)

##   Pair         Player Name Total Pts   Round  temp Result Opponent Points
## 1    1            GARY HUA       6.0 Round 1 W  39      W       39      1
## 2    2     DAKSHESH DARURI       6.0 Round 1 W  63      W       63      1
## 3    3        ADITYA BAJAJ       6.0 Round 1 L   8      L        8      0
## 4    4 PATRICK H SCHILLING       5.5 Round 1 W  23      W       23      1
## 5    5          HANSHI ZUO       5.5 Round 1 W  45      W       45      1
## 6    6         HANSEN SONG       5.0 Round 1 W  34      W       34      1

Merge in pre-Ratings for opponents

# merge in preRates (note... cases where there was no opponent are dropped here)
roundsDf <- left_join(roundsDf, select(bottomDf, Pair, preRating), by=c("Opponent"="Pair"))

Calculate average opponent pre-rating, as well as a calculated total points

finalDf <- roundsDf %>% 
  group_by(Pair, `Player Name`, `Total Pts`) %>%
  summarize(totalPointsCalc = sum(Points, na.rm = TRUE),
            avgOppPreRating = round(mean(preRating, na.rm = TRUE), digits=0))

stopifnot(all(as.numeric(finalDf$`Total Pts`) == finalDf$totalPointsCalc))

Merge in State and pre-rating values

finalDf <- inner_join(bottomDf, 
                      select(finalDf, c(Pair, `Player Name`, `Total Pts`, avgOppPreRating)),
                      by="Pair")

Clean up column names and save out .csv file

finalDf <- finalDf %>%
  rename(PlayerName = `Player Name`, TotalPts = `Total Pts`, PlayerPreRating = `preRating`,
         AvgOppPreRating = avgOppPreRating) %>%
  select(PlayerName, PlayerState, TotalPts, PlayerPreRating, AvgOppPreRating)

head(finalDf)

##            PlayerName PlayerState TotalPts PlayerPreRating AvgOppPreRating
## 1            GARY HUA          ON      6.0            1794            1605
## 2     DAKSHESH DARURI          MI      6.0            1553            1469
## 3        ADITYA BAJAJ          MI      6.0            1384            1564
## 4 PATRICK H SCHILLING          MI      5.5            1716            1574
## 5          HANSHI ZUO          MI      5.5            1655            1501
## 6         HANSEN SONG          OH      5.0            1686            1519

write.csv(finalDf, file = "/Users/ryanweber/Desktop/CUNY/Data 607 Db/Projects/Project 1/chess.csv",
          row.names = FALSE)

Data607_Assignment03

Ryan Weber

February 18, 2018

Read in data

Remove dashed lines

Create data.frame with a single column out of the indidivual ‘lines’

Save out vector of column names

Split the single column into several columns based on the pipe delimiter

Trim all columns

Break this data.frame into two sets, even rows in one set, odd rows in another

Parse state and ratings information

Parse topDf from “wide” to “long” format to deal with round information

Merge in pre-Ratings for opponents

Calculate average opponent pre-rating, as well as a calculated total points

Merge in State and pre-rating values

Clean up column names and save out .csv file