library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v readr 1.3.1
## v tibble 3.0.3 v purrr 0.3.4
## v tidyr 1.1.2 v forcats 0.5.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
In order to better reproduce the data, I stored the raw file on GitHub. I thought it was going to be rather straightforward to use the vertical line (pip) to parse the data… It was not that straightforward as it turns out. Using the read.table function was my best bet after going to the wrong road for a while with read.delim.
Quick Note: I had initially set the “header” argument as TRUE, but that created a lot of problems further down pretty quickly, and my troubleshooting led me to that argument. Switching it solved some things, but made others a bit more tricky!
chessURL <- "https://raw.githubusercontent.com/iscostello/Data607/master/chessMatches.txt"
chessMatchRaw <- read.table(file = chessURL, header = FALSE, skip = 1, sep = "|", fill = TRUE, stringsAsFactors = FALSE)
str(chessMatchRaw)
## 'data.frame': 195 obs. of 11 variables:
## $ V1 : chr " Pair " " Num " "-----------------------------------------------------------------------------------------" " 1 " ...
## $ V2 : chr " Player Name " " USCF ID / Rtg (Pre->Post) " "" " GARY HUA " ...
## $ V3 : chr "Total" " Pts " "" "6.0 " ...
## $ V4 : chr "Round" " 1 " "" "W 39" ...
## $ V5 : chr "Round" " 2 " "" "W 21" ...
## $ V6 : chr "Round" " 3 " "" "W 18" ...
## $ V7 : chr "Round" " 4 " "" "W 14" ...
## $ V8 : chr "Round" " 5 " "" "W 7" ...
## $ V9 : chr "Round" " 6 " "" "D 12" ...
## $ V10: chr "Round" " 7 " "" "D 4" ...
## $ V11: logi NA NA NA NA NA NA ...
Staring at the raw text data, I saw a problem with extracting just the two rows of data that I needed. At the beginning of this project I was pretty lousy with regex, but going over my notes from the R bridge I saw how I had used the yeoman function “seq.” The text file followed a pattern, every three rows, starting with the first and second row. Simple.
Each of the sets of rows was its own data set, and using “cbind” I smushed them together to form a combination set. Here is when I discovered having headers = TRUE was a problem.
chessMatch_PlayerRow <- chessMatchRaw[seq(1,195,3),]
chessMatch_StateRow <- chessMatchRaw[seq(2, 195,3),]
chessMatch_combo <- cbind(chessMatch_PlayerRow[1:10],chessMatch_StateRow[1:10])
head(chessMatch_combo)
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 Pair Player Name Total Round Round Round Round Round
## 4 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7
## 7 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16
## 10 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21 W 11
## 13 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26 D 5
## 16 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13 D 4
## V9 V10 V1 V2 V3 V4 V5 V6
## 1 Round Round Num USCF ID / Rtg (Pre->Post) Pts 1 2 3
## 4 D 12 D 4 ON 15445895 / R: 1794 ->1817 N:2 W B W
## 7 W 20 W 7 MI 14598900 / R: 1553 ->1663 N:2 B W B
## 10 W 13 W 12 MI 14959604 / R: 1384 ->1640 N:2 W B W
## 13 W 19 D 1 MI 12616049 / R: 1716 ->1744 N:2 W B W
## 16 W 14 W 17 MI 14601533 / R: 1655 ->1690 N:2 B W B
## V7 V8 V9 V10
## 1 4 5 6 7
## 4 B W B W
## 7 W B W B
## 10 B W B W
## 13 B W B B
## 16 W B W B
Without headers there were just blank columns, so I had to create my own.
colnames(chessMatch_combo)
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V1" "V2"
## [13] "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10"
Quick Note: I made a mistake by not including the “State” column right away in a subset of the data I made further down. I tacked it on at the first last minute. I tried to build it back into the sets, but it ended up breaking my average calculation for some reason. So this is how I crammed it in to be added to the eventual export.
names(chessMatch_combo)[1] <- "Pairing"
names(chessMatch_combo)[2] <- "PlayerName"
names(chessMatch_combo)[3] <- "TotalPoints"
names(chessMatch_combo)[4] <- "Round1"
names(chessMatch_combo)[5] <- "Round2"
names(chessMatch_combo)[6] <- "Round3"
names(chessMatch_combo)[7] <- "Round4"
names(chessMatch_combo)[8] <- "Round5"
names(chessMatch_combo)[9] <- "Round6"
names(chessMatch_combo)[10] <- "Round7"
names(chessMatch_combo)[12] <- "USCFID.PrePost"
names(chessMatch_combo)[11] <- "State"
colnames(chessMatch_combo)
## [1] "Pairing" "PlayerName" "TotalPoints" "Round1"
## [5] "Round2" "Round3" "Round4" "Round5"
## [9] "Round6" "Round7" "State" "USCFID.PrePost"
## [13] "V3" "V4" "V5" "V6"
## [17] "V7" "V8" "V9" "V10"
I ended up calling the below the “subset.” My intent was to have just the columns I needed to work with later on. I kept adding to this df until by the end it was 31 columns long.
chessMatch_subset <- chessMatch_combo[-1,c(1:10,12)]
head(chessMatch_subset, 10)
## Pairing PlayerName TotalPoints Round1 Round2 Round3
## 4 1 GARY HUA 6.0 W 39 W 21 W 18
## 7 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4
## 10 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25
## 13 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2
## 16 5 HANSHI ZUO 5.5 W 45 W 37 D 12
## 19 6 HANSEN SONG 5.0 W 34 D 29 L 11
## 22 7 GARY DEE SWATHELL 5.0 W 57 W 46 W 13
## 25 8 EZEKIEL HOUGHTON 5.0 W 3 W 32 L 14
## 28 9 STEFANO LEE 5.0 W 25 L 18 W 59
## 31 10 ANVIT RAO 5.0 D 16 L 19 W 55
## Round4 Round5 Round6 Round7 USCFID.PrePost
## 4 W 14 W 7 D 12 D 4 15445895 / R: 1794 ->1817
## 7 W 17 W 16 W 20 W 7 14598900 / R: 1553 ->1663
## 10 W 21 W 11 W 13 W 12 14959604 / R: 1384 ->1640
## 13 W 26 D 5 W 19 D 1 12616049 / R: 1716 ->1744
## 16 D 13 D 4 W 14 W 17 14601533 / R: 1655 ->1690
## 19 W 35 D 10 W 27 W 21 15055204 / R: 1686 ->1687
## 22 W 11 L 1 W 9 L 2 11146376 / R: 1649 ->1673
## 25 L 9 W 47 W 28 W 19 15142253 / R: 1641P17->1657P24
## 28 W 8 W 26 L 7 W 20 14954524 / R: 1411 ->1564
## 31 W 31 D 6 W 25 W 18 14150362 / R: 1365 ->1544
I learned a lot about RegEx in this section. I also learned you have to check for spaces when working with strings and text files.
My first task here is to break apart the USCFID.PrePost column into its three parts: the USCFID, Pre-score, and post-score using RegEx. The ID was easy, looking for 8 digits. And once I got the hang of it the pre and post scores came easily, referencing the String RegEx cheatsheet. I decided I wanted to break the round columns up to gather the Pairing IDs of the opponents, so that I could match on their prescores and swap them in to get the average. So I extracted those too.
Lastly, I got all the way to the end, matching the opponent IDs to the Pairing ID and kept coming up NULL. It was making me absolutely bonkers. I had accounted for the white spaces of the opponents by asking for just the digits (either 1 or 2). I assumed that the Pairing would already come free of any white space - that was a bad assumption. Simply creating a new ID column and matching on that solved my issue.
chessMatch_subset$USCFID <- str_extract_all(chessMatch_subset$USCFID.PrePost,"\\d{8}")
chessMatch_subset$USCFID <- as.character(chessMatch_subset$USCFID)
chessMatch_subset$preScore <- str_extract_all(chessMatch_subset$USCFID.PrePost,":\\s\\d{4}")
chessMatch_subset$preScore <- as.character(chessMatch_subset$preScore)
chessMatch_subset$postScore <- str_extract_all(chessMatch_subset$USCFID.PrePost,">\\d{4}")
chessMatch_subset$postScore <- as.character(chessMatch_subset$postScore)
chessMatch_subset$oppo1 <- str_extract_all(chessMatch_subset$Round1,"\\d+$")
chessMatch_subset$oppo1 <- as.character(chessMatch_subset$oppo1)
chessMatch_subset$oppo2 <- str_extract_all(chessMatch_subset$Round2,"\\d+$")
chessMatch_subset$oppo2 <- as.character(chessMatch_subset$oppo2)
chessMatch_subset$oppo3 <- str_extract_all(chessMatch_subset$Round3,"\\d+$")
chessMatch_subset$oppo3 <- as.character(chessMatch_subset$oppo3)
chessMatch_subset$oppo4 <- str_extract_all(chessMatch_subset$Round4,"\\d+$")
chessMatch_subset$oppo4 <- as.character(chessMatch_subset$oppo4)
chessMatch_subset$oppo5 <- str_extract_all(chessMatch_subset$Round5,"\\d+$")
chessMatch_subset$oppo5 <- as.character(chessMatch_subset$oppo5)
chessMatch_subset$oppo6 <- str_extract_all(chessMatch_subset$Round6,"\\d+$")
chessMatch_subset$oppo6 <- as.character(chessMatch_subset$oppo6)
chessMatch_subset$oppo7 <- str_extract_all(chessMatch_subset$Round7,"\\d+$")
chessMatch_subset$oppo7 <- as.character(chessMatch_subset$oppo7)
chessMatch_subset$ID <- str_extract_all(chessMatch_subset$Pairing, "\\d+")
chessMatch_subset$ID <- as.character(chessMatch_subset$ID)
head(chessMatch_subset,10)
## Pairing PlayerName TotalPoints Round1 Round2 Round3
## 4 1 GARY HUA 6.0 W 39 W 21 W 18
## 7 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4
## 10 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25
## 13 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2
## 16 5 HANSHI ZUO 5.5 W 45 W 37 D 12
## 19 6 HANSEN SONG 5.0 W 34 D 29 L 11
## 22 7 GARY DEE SWATHELL 5.0 W 57 W 46 W 13
## 25 8 EZEKIEL HOUGHTON 5.0 W 3 W 32 L 14
## 28 9 STEFANO LEE 5.0 W 25 L 18 W 59
## 31 10 ANVIT RAO 5.0 D 16 L 19 W 55
## Round4 Round5 Round6 Round7 USCFID.PrePost USCFID
## 4 W 14 W 7 D 12 D 4 15445895 / R: 1794 ->1817 15445895
## 7 W 17 W 16 W 20 W 7 14598900 / R: 1553 ->1663 14598900
## 10 W 21 W 11 W 13 W 12 14959604 / R: 1384 ->1640 14959604
## 13 W 26 D 5 W 19 D 1 12616049 / R: 1716 ->1744 12616049
## 16 D 13 D 4 W 14 W 17 14601533 / R: 1655 ->1690 14601533
## 19 W 35 D 10 W 27 W 21 15055204 / R: 1686 ->1687 15055204
## 22 W 11 L 1 W 9 L 2 11146376 / R: 1649 ->1673 11146376
## 25 L 9 W 47 W 28 W 19 15142253 / R: 1641P17->1657P24 15142253
## 28 W 8 W 26 L 7 W 20 14954524 / R: 1411 ->1564 14954524
## 31 W 31 D 6 W 25 W 18 14150362 / R: 1365 ->1544 14150362
## preScore postScore oppo1 oppo2 oppo3 oppo4 oppo5 oppo6 oppo7 ID
## 4 : 1794 >1817 39 21 18 14 7 12 4 1
## 7 : 1553 >1663 63 58 4 17 16 20 7 2
## 10 : 1384 >1640 8 61 25 21 11 13 12 3
## 13 : 1716 >1744 23 28 2 26 5 19 1 4
## 16 : 1655 >1690 45 37 12 13 4 14 17 5
## 19 : 1686 >1687 34 29 11 35 10 27 21 6
## 22 : 1649 >1673 57 46 13 11 1 9 2 7
## 25 : 1641 >1657 3 32 14 9 47 28 19 8
## 28 : 1411 >1564 25 18 59 8 26 7 20 9
## 31 : 1365 >1544 16 19 55 31 6 25 18 10
After breaking apart the pre and post scores, I removed their punctuation and made them into numeric form. For whatever reason, the post score was not coming up properly. Since I didn’t need it (thankfully), I moved on. Interesting to try to understand why I got this error. Individually they work fine, but together the second function breaks.
chessMatch_subset$preScore <- as.numeric(str_replace_all(chessMatch_subset$preScore, "[[:punct:]]",""))
## Warning: NAs introduced by coercion
chessMatch_subset$postScore <- as.numeric(str_replace_all(chessMatch_subset$postScore, "[[:punct:]]",""))
## Warning: NAs introduced by coercion
head(chessMatch_subset, 10)
## Pairing PlayerName TotalPoints Round1 Round2 Round3
## 4 1 GARY HUA 6.0 W 39 W 21 W 18
## 7 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4
## 10 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25
## 13 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2
## 16 5 HANSHI ZUO 5.5 W 45 W 37 D 12
## 19 6 HANSEN SONG 5.0 W 34 D 29 L 11
## 22 7 GARY DEE SWATHELL 5.0 W 57 W 46 W 13
## 25 8 EZEKIEL HOUGHTON 5.0 W 3 W 32 L 14
## 28 9 STEFANO LEE 5.0 W 25 L 18 W 59
## 31 10 ANVIT RAO 5.0 D 16 L 19 W 55
## Round4 Round5 Round6 Round7 USCFID.PrePost USCFID
## 4 W 14 W 7 D 12 D 4 15445895 / R: 1794 ->1817 15445895
## 7 W 17 W 16 W 20 W 7 14598900 / R: 1553 ->1663 14598900
## 10 W 21 W 11 W 13 W 12 14959604 / R: 1384 ->1640 14959604
## 13 W 26 D 5 W 19 D 1 12616049 / R: 1716 ->1744 12616049
## 16 D 13 D 4 W 14 W 17 14601533 / R: 1655 ->1690 14601533
## 19 W 35 D 10 W 27 W 21 15055204 / R: 1686 ->1687 15055204
## 22 W 11 L 1 W 9 L 2 11146376 / R: 1649 ->1673 11146376
## 25 L 9 W 47 W 28 W 19 15142253 / R: 1641P17->1657P24 15142253
## 28 W 8 W 26 L 7 W 20 14954524 / R: 1411 ->1564 14954524
## 31 W 31 D 6 W 25 W 18 14150362 / R: 1365 ->1544 14150362
## preScore postScore oppo1 oppo2 oppo3 oppo4 oppo5 oppo6 oppo7 ID
## 4 1794 NA 39 21 18 14 7 12 4 1
## 7 1553 NA 63 58 4 17 16 20 7 2
## 10 1384 NA 8 61 25 21 11 13 12 3
## 13 1716 NA 23 28 2 26 5 19 1 4
## 16 1655 NA 45 37 12 13 4 14 17 5
## 19 1686 NA 34 29 11 35 10 27 21 6
## 22 1649 NA 57 46 13 11 1 9 2 7
## 25 1641 NA 3 32 14 9 47 28 19 8
## 28 1411 NA 25 18 59 8 26 7 20 9
## 31 1365 NA 16 19 55 31 6 25 18 10
The crux of this project is in this section. I created a set of seven new columns to be filled by the scores of the opponents, based on their IDs. As I mentioned earlier, the original “Pairing” column gave me a big headache, but I overcame that with creating a new ID column altogether.
Also as I mentioned, I matched the State column as I forgot it at the beginning and couldn’t smoothly incorporate it without breaking more code.
oppoScores <- chessMatch_subset[c(22,2,13,15:21)]
chessMatch_subset$oppo1score <- oppoScores$preScore[match(chessMatch_subset$oppo1, oppoScores$ID)]
chessMatch_subset$oppo2score <- oppoScores$preScore[match(chessMatch_subset$oppo2, oppoScores$ID)]
chessMatch_subset$oppo3score <- oppoScores$preScore[match(chessMatch_subset$oppo3, oppoScores$ID)]
chessMatch_subset$oppo4score <- oppoScores$preScore[match(chessMatch_subset$oppo4, oppoScores$ID)]
chessMatch_subset$oppo5score <- oppoScores$preScore[match(chessMatch_subset$oppo5, oppoScores$ID)]
chessMatch_subset$oppo6score <- oppoScores$preScore[match(chessMatch_subset$oppo6, oppoScores$ID)]
chessMatch_subset$oppo7score <- oppoScores$preScore[match(chessMatch_subset$oppo7, oppoScores$ID)]
chessMatch_subset$State <- chessMatch_combo$State[match(chessMatch_subset$Pairing, chessMatch_combo$Pairing)]
head(chessMatch_subset)
## Pairing PlayerName TotalPoints Round1 Round2 Round3
## 4 1 GARY HUA 6.0 W 39 W 21 W 18
## 7 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4
## 10 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25
## 13 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2
## 16 5 HANSHI ZUO 5.5 W 45 W 37 D 12
## 19 6 HANSEN SONG 5.0 W 34 D 29 L 11
## Round4 Round5 Round6 Round7 USCFID.PrePost USCFID
## 4 W 14 W 7 D 12 D 4 15445895 / R: 1794 ->1817 15445895
## 7 W 17 W 16 W 20 W 7 14598900 / R: 1553 ->1663 14598900
## 10 W 21 W 11 W 13 W 12 14959604 / R: 1384 ->1640 14959604
## 13 W 26 D 5 W 19 D 1 12616049 / R: 1716 ->1744 12616049
## 16 D 13 D 4 W 14 W 17 14601533 / R: 1655 ->1690 14601533
## 19 W 35 D 10 W 27 W 21 15055204 / R: 1686 ->1687 15055204
## preScore postScore oppo1 oppo2 oppo3 oppo4 oppo5 oppo6 oppo7 ID oppo1score
## 4 1794 NA 39 21 18 14 7 12 4 1 1436
## 7 1553 NA 63 58 4 17 16 20 7 2 1175
## 10 1384 NA 8 61 25 21 11 13 12 3 1641
## 13 1716 NA 23 28 2 26 5 19 1 4 1363
## 16 1655 NA 45 37 12 13 4 14 17 5 1242
## 19 1686 NA 34 29 11 35 10 27 21 6 1399
## oppo2score oppo3score oppo4score oppo5score oppo6score oppo7score State
## 4 1563 1600 1610 1649 1663 1716 ON
## 7 NA 1716 1629 1604 1595 1649 MI
## 10 NA 1745 1563 1712 1666 1663 MI
## 13 1507 1553 1579 1655 1564 1794 MI
## 16 NA 1663 1666 1716 1610 1629 MI
## 19 1602 1712 1438 1365 1552 1563 OH
At first, I just used the “mean” function, but it seemed to take the mean of all the numbers, rather than the ones I wanted. After a bit of digging through the texts (“Text Mining with R”), reviewing my DATA 606 work, and the Internet, I stumbled on the “rowMean” function. I made certain to include NULL results, and since this function didn’t have a “trim” argument, I wrapped it in a “round” function to match the sigfigs of the prompt.
chessMatch_subset$oppoAvg <- round(rowMeans(chessMatch_subset[,grep("\\dscore$",names(chessMatch_subset))],na.rm = TRUE,),digits = 0)
head(chessMatch_subset)
## Pairing PlayerName TotalPoints Round1 Round2 Round3
## 4 1 GARY HUA 6.0 W 39 W 21 W 18
## 7 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4
## 10 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25
## 13 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2
## 16 5 HANSHI ZUO 5.5 W 45 W 37 D 12
## 19 6 HANSEN SONG 5.0 W 34 D 29 L 11
## Round4 Round5 Round6 Round7 USCFID.PrePost USCFID
## 4 W 14 W 7 D 12 D 4 15445895 / R: 1794 ->1817 15445895
## 7 W 17 W 16 W 20 W 7 14598900 / R: 1553 ->1663 14598900
## 10 W 21 W 11 W 13 W 12 14959604 / R: 1384 ->1640 14959604
## 13 W 26 D 5 W 19 D 1 12616049 / R: 1716 ->1744 12616049
## 16 D 13 D 4 W 14 W 17 14601533 / R: 1655 ->1690 14601533
## 19 W 35 D 10 W 27 W 21 15055204 / R: 1686 ->1687 15055204
## preScore postScore oppo1 oppo2 oppo3 oppo4 oppo5 oppo6 oppo7 ID oppo1score
## 4 1794 NA 39 21 18 14 7 12 4 1 1436
## 7 1553 NA 63 58 4 17 16 20 7 2 1175
## 10 1384 NA 8 61 25 21 11 13 12 3 1641
## 13 1716 NA 23 28 2 26 5 19 1 4 1363
## 16 1655 NA 45 37 12 13 4 14 17 5 1242
## 19 1686 NA 34 29 11 35 10 27 21 6 1399
## oppo2score oppo3score oppo4score oppo5score oppo6score oppo7score State
## 4 1563 1600 1610 1649 1663 1716 ON
## 7 NA 1716 1629 1604 1595 1649 MI
## 10 NA 1745 1563 1712 1666 1663 MI
## 13 1507 1553 1579 1655 1564 1794 MI
## 16 NA 1663 1666 1716 1610 1629 MI
## 19 1602 1712 1438 1365 1552 1563 OH
## oppoAvg
## 4 1605
## 7 1561
## 10 1665
## 13 1574
## 16 1588
## 19 1519
At this point, I’m pretty much home free. Created a much cleaner table and export code.
chessMatch_export <- chessMatch_subset[,c(2,30,3,13,31)]
head(chessMatch_export)
## PlayerName State TotalPoints preScore oppoAvg
## 4 GARY HUA ON 6.0 1794 1605
## 7 DAKSHESH DARURI MI 6.0 1553 1561
## 10 ADITYA BAJAJ MI 6.0 1384 1665
## 13 PATRICK H SCHILLING MI 5.5 1716 1574
## 16 HANSHI ZUO MI 5.5 1655 1588
## 19 HANSEN SONG OH 5.0 1686 1519
write.csv(chessMatch_export,"chessMatchSummary.csv")