I uploaded the .txt file to GitHub and imported it from there. After studying the data file I used the read_delim() method so I could set my own delimiter. I used ‘|’ as the delimiter, trimmed the leading and trailing white spaces from each field and skipped the first row of all hyphens so I could use the next row for column names.
Note: ‘-BELLU’ was cut off from the name ‘SOFIA ADINA STANESCU-BELLU’ during the initial import. I could never figure out why it was cut off or how to fix it.
chessdata <- read_delim("https://raw.githubusercontent.com/smithchad17/Class607/master/tournamentinfo.txt", delim = "|", trim_ws = T, skip = 1, col_names = T)
Removed the first row which was not needed and the last column which was empty
chessdata <- chessdata[-c(1),-dim(chessdata)[2]]
Filter rows where the player’s name is empty. This will eliminate the rows with hyphens
chessdata <- filter(chessdata, chessdata$`Player Name` != " ")
#Rename columns
names(chessdata) <- c("Pair", "Player's Name", "Total Points", "Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7")
chessdata
## # A tibble: 128 x 10
## Pair `Player's Name` `Total Points` Round1 Round2 Round3
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 GARY HUA 6.0 W 39 W 21 W 18
## 2 ON 15445895 / R: 1794 ->1817 N:2 W B W
## 3 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4
## 4 MI 14598900 / R: 1553 ->1663 N:2 B W B
## 5 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25
## 6 MI 14959604 / R: 1384 ->1640 N:2 W B W
## 7 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2
## 8 MI 12616049 / R: 1716 ->1744 N:2 W B W
## 9 5 HANSHI ZUO 5.5 W 45 W 37 D 12
## 10 MI 14601533 / R: 1655 ->1690 N:2 B W B
## # ... with 118 more rows, and 4 more variables: Round4 <chr>,
## # Round5 <chr>, Round6 <chr>, Round7 <chr>
Each column of data was pulled as a vector, then set as a column in a temporary data frame so it could be filtered using dplyr. The result was a data frame of one column.
Get Player’s Number
#Extract the rows that only have digits
playernumber <- str_extract(chessdata$Pair, "[[:digit:]]+")
##Add to a temporary data frame so it can be filtered
tempdata <- data.frame(playernumber, stringsAsFactors = F)
#Filter rows that are empty
playernumber <- filter(tempdata, playernumber != " ")
playernumber
## playernumber
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## 7 7
## 8 8
## 9 9
## 10 10
## 11 11
## 12 12
## 13 13
## 14 14
## 15 15
## 16 16
## 17 17
## 18 18
## 19 19
## 20 20
## 21 21
## 22 22
## 23 23
## 24 24
## 25 25
## 26 26
## 27 27
## 28 28
## 29 29
## 30 30
## 31 31
## 32 32
## 33 33
## 34 34
## 35 35
## 36 36
## 37 37
## 38 38
## 39 39
## 40 40
## 41 41
## 42 42
## 43 43
## 44 44
## 45 45
## 46 46
## 47 47
## 48 48
## 49 49
## 50 50
## 51 51
## 52 52
## 53 53
## 54 54
## 55 55
## 56 56
## 57 57
## 58 58
## 59 59
## 60 60
## 61 61
## 62 62
## 63 63
## 64 64
Get Player’s Name
#Extract the rows that only have letters and spaces
playername <- str_extract(chessdata$`Player's Name`, "[[:alpha:] ]+")
#Add to a temporary data frame so it can be filtered
tempdata <- data.frame(playername, stringsAsFactors = F)
#Filter rows that are empty
playername <- filter(tempdata, playername != " ")
playername
## playername
## 1 GARY HUA
## 2 DAKSHESH DARURI
## 3 ADITYA BAJAJ
## 4 PATRICK H SCHILLING
## 5 HANSHI ZUO
## 6 HANSEN SONG
## 7 GARY DEE SWATHELL
## 8 EZEKIEL HOUGHTON
## 9 STEFANO LEE
## 10 ANVIT RAO
## 11 CAMERON WILLIAM MC LEMAN
## 12 KENNETH J TACK
## 13 TORRANCE HENRY JR
## 14 BRADLEY SHAW
## 15 ZACHARY JAMES HOUGHTON
## 16 MIKE NIKITIN
## 17 RONALD GRZEGORCZYK
## 18 DAVID SUNDEEN
## 19 DIPANKAR ROY
## 20 JASON ZHENG
## 21 DINH DANG BUI
## 22 EUGENE L MCCLURE
## 23 ALAN BUI
## 24 MICHAEL R ALDRICH
## 25 LOREN SCHWIEBERT
## 26 MAX ZHU
## 27 GAURAV GIDWANI
## 28 SOFIA ADINA STANESCU
## 29 CHIEDOZIE OKORIE
## 30 GEORGE AVERY JONES
## 31 RISHI SHETTY
## 32 JOSHUA PHILIP MATHEWS
## 33 JADE GE
## 34 MICHAEL JEFFERY THOMAS
## 35 JOSHUA DAVID LEE
## 36 SIDDHARTH JHA
## 37 AMIYATOSH PWNANANDAM
## 38 BRIAN LIU
## 39 JOEL R HENDON
## 40 FOREST ZHANG
## 41 KYLE WILLIAM MURPHY
## 42 JARED GE
## 43 ROBERT GLEN VASEY
## 44 JUSTIN D SCHILLING
## 45 DEREK YAN
## 46 JACOB ALEXANDER LAVALLEY
## 47 ERIC WRIGHT
## 48 DANIEL KHAIN
## 49 MICHAEL J MARTIN
## 50 SHIVAM JHA
## 51 TEJAS AYYAGARI
## 52 ETHAN GUO
## 53 JOSE C YBARRA
## 54 LARRY HODGE
## 55 ALEX KONG
## 56 MARISA RICCI
## 57 MICHAEL LU
## 58 VIRAJ MOHILE
## 59 SEAN M MC CORMICK
## 60 JULIA SHEN
## 61 JEZZEL FARKAS
## 62 ASHWIN BALAJI
## 63 THOMAS JOSEPH HOSMER
## 64 BEN LI
Get Player’s State
#Extract the rows that only have letters
playerstate <- str_extract(chessdata$Pair, "[[:alpha:]]+")
#Add to a temporary data frame so it can be filtered
tempdata <- data.frame(playerstate, stringsAsFactors = F)
#Filter rows that are empty
playerstate <- filter(tempdata, playerstate != " ")
playerstate
## playerstate
## 1 ON
## 2 MI
## 3 MI
## 4 MI
## 5 MI
## 6 OH
## 7 MI
## 8 MI
## 9 ON
## 10 MI
## 11 MI
## 12 MI
## 13 MI
## 14 MI
## 15 MI
## 16 MI
## 17 MI
## 18 MI
## 19 MI
## 20 MI
## 21 ON
## 22 MI
## 23 ON
## 24 MI
## 25 MI
## 26 ON
## 27 MI
## 28 MI
## 29 MI
## 30 ON
## 31 MI
## 32 ON
## 33 MI
## 34 MI
## 35 MI
## 36 MI
## 37 MI
## 38 MI
## 39 MI
## 40 MI
## 41 MI
## 42 MI
## 43 MI
## 44 MI
## 45 MI
## 46 MI
## 47 MI
## 48 MI
## 49 MI
## 50 MI
## 51 MI
## 52 MI
## 53 MI
## 54 MI
## 55 MI
## 56 MI
## 57 MI
## 58 MI
## 59 MI
## 60 MI
## 61 ON
## 62 MI
## 63 MI
## 64 MI
Get Player’s Points
#Extract the rows that only have numbers before and after a period
playerpoints <- str_extract(chessdata$"Total Points", "\\d+.\\d+")
#Add to a temporary data frame so it can be filtered
tempdata <- data.frame(playerpoints, stringsAsFactors = F)
#Filter rows that are empty
playerpoints <- filter(tempdata, playerpoints != " ")
playerpoints
## playerpoints
## 1 6.0
## 2 6.0
## 3 6.0
## 4 5.5
## 5 5.5
## 6 5.0
## 7 5.0
## 8 5.0
## 9 5.0
## 10 5.0
## 11 4.5
## 12 4.5
## 13 4.5
## 14 4.5
## 15 4.5
## 16 4.0
## 17 4.0
## 18 4.0
## 19 4.0
## 20 4.0
## 21 4.0
## 22 4.0
## 23 4.0
## 24 4.0
## 25 3.5
## 26 3.5
## 27 3.5
## 28 3.5
## 29 3.5
## 30 3.5
## 31 3.5
## 32 3.5
## 33 3.5
## 34 3.5
## 35 3.5
## 36 3.5
## 37 3.5
## 38 3.0
## 39 3.0
## 40 3.0
## 41 3.0
## 42 3.0
## 43 3.0
## 44 3.0
## 45 3.0
## 46 3.0
## 47 2.5
## 48 2.5
## 49 2.5
## 50 2.5
## 51 2.5
## 52 2.5
## 53 2.0
## 54 2.0
## 55 2.0
## 56 2.0
## 57 2.0
## 58 2.0
## 59 2.0
## 60 1.5
## 61 1.5
## 62 1.0
## 63 1.0
## 64 1.0
Get Player’s Pre-Rating
#Extract the rows that only have numbers after a colon and before a hyphen
playerrating <- str_extract(chessdata$`Player's Name`, ":.+?(\\d+).+?-")
#Extract only the numbers
playerrating <- str_extract(playerrating, "\\d+")
#Add to a temporary data frame so it can be filtered
tempdata <- data.frame(playerrating, stringsAsFactors = F)
#Filter rows that are empty
playerrating <- filter(tempdata, playerrating != " ")
playerrating
## playerrating
## 1 1794
## 2 1553
## 3 1384
## 4 1716
## 5 1655
## 6 1686
## 7 1649
## 8 1641
## 9 1411
## 10 1365
## 11 1712
## 12 1663
## 13 1666
## 14 1610
## 15 1220
## 16 1604
## 17 1629
## 18 1600
## 19 1564
## 20 1595
## 21 1563
## 22 1555
## 23 1363
## 24 1229
## 25 1745
## 26 1579
## 27 1552
## 28 1507
## 29 1602
## 30 1522
## 31 1494
## 32 1441
## 33 1449
## 34 1399
## 35 1438
## 36 1355
## 37 980
## 38 1423
## 39 1436
## 40 1348
## 41 1403
## 42 1332
## 43 1283
## 44 1199
## 45 1242
## 46 377
## 47 1362
## 48 1382
## 49 1291
## 50 1056
## 51 1011
## 52 935
## 53 1393
## 54 1270
## 55 1186
## 56 1153
## 57 1092
## 58 917
## 59 853
## 60 967
## 61 955
## 62 1530
## 63 1175
## 64 1163
To find the amount of games played, I organized the data a little further. I only want to extract data from the most simplified data frame as possible so I removed the rows I didn’t need. To find the amount of games played I didn’t need the rows with the ratings so I removed those to create a subset of the original data frame.
chess.subset <- chessdata[grep('^[0-9]', chessdata$Pair), ]
chess.subset
## # A tibble: 64 x 10
## Pair `Player's Name` `Total Points` Round1 Round2 Round3 Round4
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14
## 2 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17
## 3 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21
## 4 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26
## 5 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13
## 6 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35
## 7 7 GARY DEE SWATHELL 5.0 W 57 W 46 W 13 W 11
## 8 8 EZEKIEL HOUGHTON 5.0 W 3 W 32 L 14 L 9
## 9 9 STEFANO LEE 5.0 W 25 L 18 W 59 W 8
## 10 10 ANVIT RAO 5.0 D 16 L 19 W 55 W 31
## # ... with 54 more rows, and 3 more variables: Round5 <chr>, Round6 <chr>,
## # Round7 <chr>
Looping through the columns Round1 - Round7 by row, while counting the amount of games played.
#Count how many games each player played
played <- c()
#Start with the first row and cycle through columns of Round1 through Round7
#If a opponent number is present, add it to the counter to find the amount of games played.
for(i in chess.subset$Pair){
j <- 4
count <- 0
while(j <= dim(chess.subset)[2]){
if(str_detect(chess.subset[as.integer(i),j], "\\d") == T) {count <- count + 1}
j <- j + 1
}
played <- c(played, count)
}
played #amount of games played per player
## [1] 7 7 7 7 7 7 7 7 7 7 7 6 7 7 7 5 7 7 7 7 7 6 7 7 7 7 6 7 6 7 7 7 7 7 7
## [36] 6 5 6 7 7 4 7 7 6 7 7 7 5 5 6 7 7 3 6 6 5 6 6 6 5 7 1 5 7
Start with the first row and cycle through columns of Round1 through Round7. Extracts the opponent number and compares it to the corresponding value in the ‘playerratings’ data frame. Adds the value which will be summed up and averaged
#Empty vector that will hold the average opponents rating
averages <- c()
#Start with the first row and cycle through columns of Round1 through Round7.
#Extracts the opponent number and compares it to the corresponding value in the
#'playerratings' data frame.
for(i in chess.subset$Pair){
j <- 4
ratings <- c()
while(j <= dim(chess.subset)[2]){
opp_num <- str_extract(chess.subset[as.integer(i),j], "\\d+")
#Appends the next opponent rating to the ratings vector
ratings <- c(ratings, playerrating[opp_num,])
j <- j + 1
}
#Computes the mean of the 'ratings' vector and appends the result to the 'average' vector
#Ignore the 'NA' results that were extracted when an opponent wasn't played that round.
averages <- c(averages, mean(as.integer(ratings), na.rm = T))
}
round(averages)
## [1] 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1506 1498 1515
## [15] 1484 1386 1499 1480 1426 1411 1470 1300 1214 1357 1363 1507 1222 1522
## [29] 1314 1144 1260 1379 1277 1375 1150 1388 1385 1539 1430 1391 1248 1150
## [43] 1107 1327 1152 1358 1392 1356 1286 1296 1356 1495 1345 1206 1406 1414
## [57] 1363 1391 1319 1330 1327 1186 1350 1263
Create a final data frame from the resulting data frames we previously found
columns <- c(playername, playerstate, playerpoints, playerrating, as.data.frame(played), as.data.frame(round(averages)))
finaldata <- data.frame(columns, stringsAsFactors = F)
names(finaldata) <- c("Player's Name", "Player's State", "Total Points", "Pre-Rating", "Games Played", "AVG Opponent Pre-Rating")
finaldata
## Player's Name Player's State Total Points Pre-Rating
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## 7 GARY DEE SWATHELL MI 5.0 1649
## 8 EZEKIEL HOUGHTON MI 5.0 1641
## 9 STEFANO LEE ON 5.0 1411
## 10 ANVIT RAO MI 5.0 1365
## 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712
## 12 KENNETH J TACK MI 4.5 1663
## 13 TORRANCE HENRY JR MI 4.5 1666
## 14 BRADLEY SHAW MI 4.5 1610
## 15 ZACHARY JAMES HOUGHTON MI 4.5 1220
## 16 MIKE NIKITIN MI 4.0 1604
## 17 RONALD GRZEGORCZYK MI 4.0 1629
## 18 DAVID SUNDEEN MI 4.0 1600
## 19 DIPANKAR ROY MI 4.0 1564
## 20 JASON ZHENG MI 4.0 1595
## 21 DINH DANG BUI ON 4.0 1563
## 22 EUGENE L MCCLURE MI 4.0 1555
## 23 ALAN BUI ON 4.0 1363
## 24 MICHAEL R ALDRICH MI 4.0 1229
## 25 LOREN SCHWIEBERT MI 3.5 1745
## 26 MAX ZHU ON 3.5 1579
## 27 GAURAV GIDWANI MI 3.5 1552
## 28 SOFIA ADINA STANESCU MI 3.5 1507
## 29 CHIEDOZIE OKORIE MI 3.5 1602
## 30 GEORGE AVERY JONES ON 3.5 1522
## 31 RISHI SHETTY MI 3.5 1494
## 32 JOSHUA PHILIP MATHEWS ON 3.5 1441
## 33 JADE GE MI 3.5 1449
## 34 MICHAEL JEFFERY THOMAS MI 3.5 1399
## 35 JOSHUA DAVID LEE MI 3.5 1438
## 36 SIDDHARTH JHA MI 3.5 1355
## 37 AMIYATOSH PWNANANDAM MI 3.5 980
## 38 BRIAN LIU MI 3.0 1423
## 39 JOEL R HENDON MI 3.0 1436
## 40 FOREST ZHANG MI 3.0 1348
## 41 KYLE WILLIAM MURPHY MI 3.0 1403
## 42 JARED GE MI 3.0 1332
## 43 ROBERT GLEN VASEY MI 3.0 1283
## 44 JUSTIN D SCHILLING MI 3.0 1199
## 45 DEREK YAN MI 3.0 1242
## 46 JACOB ALEXANDER LAVALLEY MI 3.0 377
## 47 ERIC WRIGHT MI 2.5 1362
## 48 DANIEL KHAIN MI 2.5 1382
## 49 MICHAEL J MARTIN MI 2.5 1291
## 50 SHIVAM JHA MI 2.5 1056
## 51 TEJAS AYYAGARI MI 2.5 1011
## 52 ETHAN GUO MI 2.5 935
## 53 JOSE C YBARRA MI 2.0 1393
## 54 LARRY HODGE MI 2.0 1270
## 55 ALEX KONG MI 2.0 1186
## 56 MARISA RICCI MI 2.0 1153
## 57 MICHAEL LU MI 2.0 1092
## 58 VIRAJ MOHILE MI 2.0 917
## 59 SEAN M MC CORMICK MI 2.0 853
## 60 JULIA SHEN MI 1.5 967
## 61 JEZZEL FARKAS ON 1.5 955
## 62 ASHWIN BALAJI MI 1.0 1530
## 63 THOMAS JOSEPH HOSMER MI 1.0 1175
## 64 BEN LI MI 1.0 1163
## Games Played AVG Opponent Pre-Rating
## 1 7 1605
## 2 7 1469
## 3 7 1564
## 4 7 1574
## 5 7 1501
## 6 7 1519
## 7 7 1372
## 8 7 1468
## 9 7 1523
## 10 7 1554
## 11 7 1468
## 12 6 1506
## 13 7 1498
## 14 7 1515
## 15 7 1484
## 16 5 1386
## 17 7 1499
## 18 7 1480
## 19 7 1426
## 20 7 1411
## 21 7 1470
## 22 6 1300
## 23 7 1214
## 24 7 1357
## 25 7 1363
## 26 7 1507
## 27 6 1222
## 28 7 1522
## 29 6 1314
## 30 7 1144
## 31 7 1260
## 32 7 1379
## 33 7 1277
## 34 7 1375
## 35 7 1150
## 36 6 1388
## 37 5 1385
## 38 6 1539
## 39 7 1430
## 40 7 1391
## 41 4 1248
## 42 7 1150
## 43 7 1107
## 44 6 1327
## 45 7 1152
## 46 7 1358
## 47 7 1392
## 48 5 1356
## 49 5 1286
## 50 6 1296
## 51 7 1356
## 52 7 1495
## 53 3 1345
## 54 6 1206
## 55 6 1406
## 56 5 1414
## 57 6 1363
## 58 6 1391
## 59 6 1319
## 60 5 1330
## 61 7 1327
## 62 1 1186
## 63 5 1350
## 64 7 1263
write.csv(finaldata, file = "ChessData.csv")