Github for homework: https://github.com/rweberc/Data607_Project1
con <- file("https://raw.githubusercontent.com/rweberc/Data607_Project1/master/rawdata.txt", open="r")
line <- readLines(con)
close(con)
line <- line[!str_detect(line, "^-")]
line[1:5]
## [1] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [2] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [3] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [4] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [5] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
rawData <- data.frame(col1 = line[3:length(line)], stringsAsFactors = FALSE)
# Remove final pipe delimiter
rawData$col1 <- str_replace(rawData$col1, "\\| *$", "")
head(rawData)
## col1
## 1 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4
## 2 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W
## 3 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7
## 4 MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B
## 5 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12
## 6 MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W
colNames1 <- str_trim(unlist(str_split(line[1], "\\|")))
colNames2 <- str_trim(unlist(str_split(line[2], "\\|")))
colNames <- c(colNames1[1:2], paste(colNames1[3:10], colNames2[3:10], sep=" "))
colNames
## [1] "Pair" "Player Name" "Total Pts" "Round 1" "Round 2"
## [6] "Round 3" "Round 4" "Round 5" "Round 6" "Round 7"
splitDf <- separate(rawData, col1, into = colNames, sep = "\\|", remove = TRUE,
convert = FALSE, extra = "warn", fill = "warn")
head(splitDf)
## Pair Player Name Total Pts Round 1 Round 2
## 1 1 GARY HUA 6.0 W 39 W 21
## 2 ON 15445895 / R: 1794 ->1817 N:2 W B
## 3 2 DAKSHESH DARURI 6.0 W 63 W 58
## 4 MI 14598900 / R: 1553 ->1663 N:2 B W
## 5 3 ADITYA BAJAJ 6.0 L 8 W 61
## 6 MI 14959604 / R: 1384 ->1640 N:2 W B
## Round 3 Round 4 Round 5 Round 6 Round 7
## 1 W 18 W 14 W 7 D 12 D 4
## 2 W B W B W
## 3 L 4 W 17 W 16 W 20 W 7
## 4 B W B W B
## 5 W 25 W 21 W 11 W 13 W 12
## 6 W B W B W
splitDf <- splitDf %>% mutate_all(str_trim)
topDf <- splitDf[c(TRUE, FALSE), ] # name and rounds data
bottomDf <- splitDf[c(FALSE, TRUE), 1:2] # state and ratings data
# Add column names for reference
colnames(bottomDf) <- c("PlayerState", "temp")
# Add row id to bottomDf
bottomDf$Pair <- str_trim(topDf$Pair)
# Parse pre/post rating
bottomDf$prePostRating <- str_replace(bottomDf$temp, ".*R: ", "")
bottomDf$prePostRating <- str_replace_all(bottomDf$prePostRating, "P[:digit:]+", "")
bottomDf$preRating <- as.numeric(str_extract(bottomDf$prePostRating, "[:digit:]+"))
bottomDf$postRating <- str_extract(bottomDf$prePostRating, "[:digit:]+$")
# Select off needed columns
bottomDf <- select(bottomDf, Pair, PlayerState, preRating)
head(bottomDf)
## Pair PlayerState preRating
## 2 1 ON 1794
## 4 2 MI 1553
## 6 3 MI 1384
## 8 4 MI 1716
## 10 5 MI 1655
## 12 6 OH 1686
‘Points’ information is parsed here for use to compare later against given “Total Pts”, as a check on the work
roundsDf <- gather(topDf, "Round", "temp", 4:10)
roundsDf$Result <- str_replace(roundsDf$temp, " .*", "")
roundsDf$Opponent <- str_trim(str_extract(roundsDf$temp, "[:digit:]+"))
roundsDf$Points <- ifelse(roundsDf$Result %in% c("W", "B", "X"), 1,
if_else(roundsDf$Result %in% c("D", "H"), .5,
0))
head(roundsDf)
## Pair Player Name Total Pts Round temp Result Opponent Points
## 1 1 GARY HUA 6.0 Round 1 W 39 W 39 1
## 2 2 DAKSHESH DARURI 6.0 Round 1 W 63 W 63 1
## 3 3 ADITYA BAJAJ 6.0 Round 1 L 8 L 8 0
## 4 4 PATRICK H SCHILLING 5.5 Round 1 W 23 W 23 1
## 5 5 HANSHI ZUO 5.5 Round 1 W 45 W 45 1
## 6 6 HANSEN SONG 5.0 Round 1 W 34 W 34 1
# merge in preRates (note... cases where there was no opponent are dropped here)
roundsDf <- left_join(roundsDf, select(bottomDf, Pair, preRating), by=c("Opponent"="Pair"))
finalDf <- roundsDf %>%
group_by(Pair, `Player Name`, `Total Pts`) %>%
summarize(totalPointsCalc = sum(Points, na.rm = TRUE),
avgOppPreRating = round(mean(preRating, na.rm = TRUE), digits=0))
stopifnot(all(as.numeric(finalDf$`Total Pts`) == finalDf$totalPointsCalc))
finalDf <- inner_join(bottomDf,
select(finalDf, c(Pair, `Player Name`, `Total Pts`, avgOppPreRating)),
by="Pair")
finalDf <- finalDf %>%
rename(PlayerName = `Player Name`, TotalPts = `Total Pts`, PlayerPreRating = `preRating`,
AvgOppPreRating = avgOppPreRating) %>%
select(PlayerName, PlayerState, TotalPts, PlayerPreRating, AvgOppPreRating)
head(finalDf)
## PlayerName PlayerState TotalPts PlayerPreRating AvgOppPreRating
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519
write.csv(finalDf, file = "/Users/ryanweber/Desktop/CUNY/Data 607 Db/Projects/Project 1/chess.csv",
row.names = FALSE)