In this project, we were given a text file with chess tournament results where the information has some structure. My job was to create an R Markdown file that generates a .CSV file with the following information for all of the players:


Load data into R data frame

mydata <- read.table("https://raw.githubusercontent.com/ajbentley/cuny_ms_ds/master/607/tournamentinfo.txt", header = TRUE, skip = 1, sep = "|", fill=TRUE)

Plan of attack

Create vectors for each of the requested data (Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents), then merge them into a dataframe.

namevector <- mydata[['Player.Name']]

name <- namevector[seq(3, length(namevector), 3)]

dfs <- data.frame(name)
head(dfs)
##                                name
## 1  GARY HUA                        
## 2  DAKSHESH DARURI                 
## 3  ADITYA BAJAJ                    
## 4  PATRICK H SCHILLING             
## 5  HANSHI ZUO                      
## 6  HANSEN SONG
statevector <- mydata[['Pair']]

state <- statevector[seq(4, length(statevector), 3)]

dfs$state <- state
head(dfs)
##                                name  state
## 1  GARY HUA                            ON 
## 2  DAKSHESH DARURI                     MI 
## 3  ADITYA BAJAJ                        MI 
## 4  PATRICK H SCHILLING                 MI 
## 5  HANSHI ZUO                          MI 
## 6  HANSEN SONG                         OH
pointsvector <- mydata[['Total']]

points <- pointsvector[seq(3, length(statevector), 3)]

dfs$points <- points
head(dfs)
##                                name  state points
## 1  GARY HUA                            ON   6.0  
## 2  DAKSHESH DARURI                     MI   6.0  
## 3  ADITYA BAJAJ                        MI   6.0  
## 4  PATRICK H SCHILLING                 MI   5.5  
## 5  HANSHI ZUO                          MI   5.5  
## 6  HANSEN SONG                         OH   5.0
playerprerating <- namevector[seq(4, length(namevector), 3)]

plypredf <- data.frame(playerprerating)

plypredf$plypre <- sub('.* R: ', '', plypredf$playerprerating)

plypredf$plypre <- sub('(.{4})(.*)', '\\1', plypredf$plypre)

head(plypredf)
##                     playerprerating plypre
## 1  15445895 / R: 1794   ->1817        1794
## 2  14598900 / R: 1553   ->1663        1553
## 3  14959604 / R: 1384   ->1640        1384
## 4  12616049 / R: 1716   ->1744        1716
## 5  14601533 / R: 1655   ->1690        1655
## 6  15055204 / R: 1686   ->1687        1686
playernumber <- statevector[seq(3, length(statevector), 3)]
dfs$playernum <- as.numeric(playernumber)

dfs <- dfs[c("playernum", "name", "state", "points")]

dfs$playerrtg <- plypredf$plypre

head(dfs)
##   playernum                              name  state points playerrtg
## 1         1  GARY HUA                            ON   6.0        1794
## 2         2  DAKSHESH DARURI                     MI   6.0        1553
## 3         3  ADITYA BAJAJ                        MI   6.0        1384
## 4         4  PATRICK H SCHILLING                 MI   5.5        1716
## 5         5  HANSHI ZUO                          MI   5.5        1655
## 6         6  HANSEN SONG                         OH   5.0        1686

Now I’ve got to add columns with each competitor for each player

r1vector <- mydata[['Round']]
rnd1 <- r1vector[seq(3, length(r1vector), 3)]
rnd1 <- str_extract(rnd1, "\\d+")

r2vector <- mydata[['Round.1']]
rnd2 <- r2vector[seq(3, length(r2vector), 3)]
rnd2 <- str_extract(rnd2, "\\d+")

r3vector <- mydata[['Round.2']]
rnd3 <- r3vector[seq(3, length(r3vector), 3)]
rnd3 <- str_extract(rnd3, "\\d+")

r4vector <- mydata[['Round.3']]
rnd4 <- r4vector[seq(3, length(r4vector), 3)]
rnd4 <- str_extract(rnd4, "\\d+")

r5vector <- mydata[['Round.4']]
rnd5 <- r5vector[seq(3, length(r5vector), 3)]
rnd5 <- str_extract(rnd5, "\\d+")

r6vector <- mydata[['Round.5']]
rnd6 <- r6vector[seq(3, length(r6vector), 3)]
rnd6 <- str_extract(rnd6, "\\d+")

r7vector <- mydata[['Round.6']]
rnd7 <- r7vector[seq(3, length(r7vector), 3)]
rnd7 <- str_extract(rnd7, "\\d+")

dfs$rnd1 <- as.numeric(rnd1)
dfs$rnd2 <- as.numeric(rnd2)
dfs$rnd3 <- as.numeric(rnd3)
dfs$rnd4 <- as.numeric(rnd4)
dfs$rnd5 <- as.numeric(rnd5)
dfs$rnd6 <- as.numeric(rnd6)
dfs$rnd7 <- as.numeric(rnd7)

head(dfs)
##   playernum                              name  state points playerrtg rnd1
## 1         1  GARY HUA                            ON   6.0        1794   39
## 2         2  DAKSHESH DARURI                     MI   6.0        1553   63
## 3         3  ADITYA BAJAJ                        MI   6.0        1384    8
## 4         4  PATRICK H SCHILLING                 MI   5.5        1716   23
## 5         5  HANSHI ZUO                          MI   5.5        1655   45
## 6         6  HANSEN SONG                         OH   5.0        1686   34
##   rnd2 rnd3 rnd4 rnd5 rnd6 rnd7
## 1   21   18   14    7   12    4
## 2   58    4   17   16   20    7
## 3   61   25   21   11   13   12
## 4   28    2   26    5   19    1
## 5   37   12   13    4   14   17
## 6   29   11   35   10   27   21

Make relatinal df of just the playernumber and playerrating

df_pr <- data.frame(as.numeric(dfs$playernum), as.numeric(dfs$playerrtg))
head(df_pr)
##   as.numeric.dfs.playernum. as.numeric.dfs.playerrtg.
## 1                         1                      1794
## 2                         2                      1553
## 3                         3                      1384
## 4                         4                      1716
## 5                         5                      1655
## 6                         6                      1686

Now merge df_pr with dfs on dfs.playernum matching to each rnd column.

#merge the two dayparts on rnd1
dfsr1 <- merge(dfs, df_pr, by.x = "rnd1", by.y = "as.numeric.dfs.playernum.", all.x = TRUE)
dfsr1 <- merge(dfsr1, df_pr, by.x = "rnd2", by.y = "as.numeric.dfs.playernum.", all.x = TRUE, suffix = c(".x",".y"))

# have to clean up titles as we go along
colnames(dfsr1)[colnames(dfsr1)=="as.numeric.dfs.playerrtg..y"] <- "rnd1_oppscore"
colnames(dfsr1)[colnames(dfsr1)=="as.numeric.dfs.playerrtg..x"] <- "rnd2_oppscore"

dfsr1 <- merge(dfsr1, df_pr, by.x = "rnd3", by.y = "as.numeric.dfs.playernum.", all.x = TRUE, suffix = c(".x",".y"))
colnames(dfsr1)[colnames(dfsr1)=="as.numeric.dfs.playerrtg."] <- "rnd3_oppscore"

dfsr1 <- merge(dfsr1, df_pr, by.x = "rnd4", by.y = "as.numeric.dfs.playernum.", all.x = TRUE, suffix = c(".x",".y"))
colnames(dfsr1)[colnames(dfsr1)=="as.numeric.dfs.playerrtg."] <- "rnd4_oppscore"

dfsr1 <- merge(dfsr1, df_pr, by.x = "rnd5", by.y = "as.numeric.dfs.playernum.", all.x = TRUE, suffix = c(".x",".y"))
colnames(dfsr1)[colnames(dfsr1)=="as.numeric.dfs.playerrtg."] <- "rnd5_oppscore"

dfsr1 <- merge(dfsr1, df_pr, by.x = "rnd6", by.y = "as.numeric.dfs.playernum.", all.x = TRUE, suffix = c(".x",".y"))
colnames(dfsr1)[colnames(dfsr1)=="as.numeric.dfs.playerrtg."] <- "rnd6_oppscore"

dfsr1 <- merge(dfsr1, df_pr, by.x = "rnd7", by.y = "as.numeric.dfs.playernum.", all.x = TRUE, suffix = c(".x",".y"))
colnames(dfsr1)[colnames(dfsr1)=="as.numeric.dfs.playerrtg."] <- "rnd7_oppscore"

head(dfsr1)
##   rnd7 rnd6 rnd5 rnd4 rnd3 rnd2 rnd1 playernum
## 1    1   19    5   26    2   28   23         4
## 2    2    9    1   11   13   46   57         7
## 3    3    1   NA   38    5   33   42        12
## 4    4   12    7   14   18   21   39         1
## 5    5   22   23    2   26   41   48        17
## 6    6   39   40    3   47    1   43        21
##                                name  state points playerrtg rnd2_oppscore
## 1  PATRICK H SCHILLING                 MI   5.5        1716          1363
## 2  GARY DEE SWATHELL                   MI   5.0        1649          1092
## 3  KENNETH J TACK                      MI   4.5        1663          1332
## 4  GARY HUA                            ON   6.0        1794          1436
## 5  RONALD GRZEGORCZYK                  MI   4.0        1629          1382
## 6  DINH DANG BUI                       ON   4.0        1563          1283
##   rnd1_oppscore rnd3_oppscore rnd4_oppscore rnd5_oppscore rnd6_oppscore
## 1          1507          1553          1579          1655          1564
## 2           377          1666          1712          1794          1411
## 3          1449          1655          1423            NA          1794
## 4          1563          1600          1610          1649          1663
## 5          1403          1579          1553          1363          1555
## 6          1794          1362          1384          1348          1436
##   rnd7_oppscore
## 1          1794
## 2          1553
## 3          1384
## 4          1716
## 5          1655
## 6          1686

Add a column averaging the scores

dfsr1$avg_opponent_score <- rowMeans(dfsr1[c(13:19)], na.rm=TRUE)
head(dfsr1)
##   rnd7 rnd6 rnd5 rnd4 rnd3 rnd2 rnd1 playernum
## 1    1   19    5   26    2   28   23         4
## 2    2    9    1   11   13   46   57         7
## 3    3    1   NA   38    5   33   42        12
## 4    4   12    7   14   18   21   39         1
## 5    5   22   23    2   26   41   48        17
## 6    6   39   40    3   47    1   43        21
##                                name  state points playerrtg rnd2_oppscore
## 1  PATRICK H SCHILLING                 MI   5.5        1716          1363
## 2  GARY DEE SWATHELL                   MI   5.0        1649          1092
## 3  KENNETH J TACK                      MI   4.5        1663          1332
## 4  GARY HUA                            ON   6.0        1794          1436
## 5  RONALD GRZEGORCZYK                  MI   4.0        1629          1382
## 6  DINH DANG BUI                       ON   4.0        1563          1283
##   rnd1_oppscore rnd3_oppscore rnd4_oppscore rnd5_oppscore rnd6_oppscore
## 1          1507          1553          1579          1655          1564
## 2           377          1666          1712          1794          1411
## 3          1449          1655          1423            NA          1794
## 4          1563          1600          1610          1649          1663
## 5          1403          1579          1553          1363          1555
## 6          1794          1362          1384          1348          1436
##   rnd7_oppscore avg_opponent_score
## 1          1794           1573.571
## 2          1553           1372.143
## 3          1384           1506.167
## 4          1716           1605.286
## 5          1655           1498.571
## 6          1686           1470.429

Order in the data frame!

dfsr1 <- dfsr1[order(dfsr1$playernum),]
rownames(dfsr1) <- 1:nrow(dfsr1)

head(dfsr1)
##   rnd7 rnd6 rnd5 rnd4 rnd3 rnd2 rnd1 playernum
## 1    4   12    7   14   18   21   39         1
## 2    7   20   16   17    4   58   63         2
## 3   12   13   11   21   25   61    8         3
## 4    1   19    5   26    2   28   23         4
## 5   17   14    4   13   12   37   45         5
## 6   21   27   10   35   11   29   34         6
##                                name  state points playerrtg rnd2_oppscore
## 1  GARY HUA                            ON   6.0        1794          1436
## 2  DAKSHESH DARURI                     MI   6.0        1553          1175
## 3  ADITYA BAJAJ                        MI   6.0        1384          1641
## 4  PATRICK H SCHILLING                 MI   5.5        1716          1363
## 5  HANSHI ZUO                          MI   5.5        1655          1242
## 6  HANSEN SONG                         OH   5.0        1686          1399
##   rnd1_oppscore rnd3_oppscore rnd4_oppscore rnd5_oppscore rnd6_oppscore
## 1          1563          1600          1610          1649          1663
## 2           917          1716          1629          1604          1595
## 3           955          1745          1563          1712          1666
## 4          1507          1553          1579          1655          1564
## 5           980          1663          1666          1716          1610
## 6          1602          1712          1438          1365          1552
##   rnd7_oppscore avg_opponent_score
## 1          1716           1605.286
## 2          1649           1469.286
## 3          1663           1563.571
## 4          1794           1573.571
## 5          1629           1500.857
## 6          1563           1518.714

Time for cleanup!

#removing columns not needed
dfsr1 <- dfsr1[,-c(1:8)]
dfsr1 <- dfsr1[,-c(5:11)]
# making the average opponent score an integer, as intdicated in the assignment
dfsr1$avg_opponent_score <- as.integer(dfsr1$avg_opponent_score)

Finished product

dfsr1
##                                 name  state points playerrtg
## 1   GARY HUA                            ON   6.0        1794
## 2   DAKSHESH DARURI                     MI   6.0        1553
## 3   ADITYA BAJAJ                        MI   6.0        1384
## 4   PATRICK H SCHILLING                 MI   5.5        1716
## 5   HANSHI ZUO                          MI   5.5        1655
## 6   HANSEN SONG                         OH   5.0        1686
## 7   GARY DEE SWATHELL                   MI   5.0        1649
## 8   EZEKIEL HOUGHTON                    MI   5.0        1641
## 9   STEFANO LEE                         ON   5.0        1411
## 10  ANVIT RAO                           MI   5.0        1365
## 11  CAMERON WILLIAM MC LEMAN            MI   4.5        1712
## 12  KENNETH J TACK                      MI   4.5        1663
## 13  TORRANCE HENRY JR                   MI   4.5        1666
## 14  BRADLEY SHAW                        MI   4.5        1610
## 15  ZACHARY JAMES HOUGHTON              MI   4.5        1220
## 16  MIKE NIKITIN                        MI   4.0        1604
## 17  RONALD GRZEGORCZYK                  MI   4.0        1629
## 18  DAVID SUNDEEN                       MI   4.0        1600
## 19  DIPANKAR ROY                        MI   4.0        1564
## 20  JASON ZHENG                         MI   4.0        1595
## 21  DINH DANG BUI                       ON   4.0        1563
## 22  EUGENE L MCCLURE                    MI   4.0        1555
## 23  ALAN BUI                            ON   4.0        1363
## 24  MICHAEL R ALDRICH                   MI   4.0        1229
## 25  LOREN SCHWIEBERT                    MI   3.5        1745
## 26  MAX ZHU                             ON   3.5        1579
## 27  GAURAV GIDWANI                      MI   3.5        1552
## 28  SOFIA ADINA STANESCU-BELLU          MI   3.5        1507
## 29  CHIEDOZIE OKORIE                    MI   3.5        1602
## 30  GEORGE AVERY JONES                  ON   3.5        1522
## 31  RISHI SHETTY                        MI   3.5        1494
## 32  JOSHUA PHILIP MATHEWS               ON   3.5        1441
## 33  JADE GE                             MI   3.5        1449
## 34  MICHAEL JEFFERY THOMAS              MI   3.5        1399
## 35  JOSHUA DAVID LEE                    MI   3.5        1438
## 36  SIDDHARTH JHA                       MI   3.5        1355
## 37  AMIYATOSH PWNANANDAM                MI   3.5         980
## 38  BRIAN LIU                           MI   3.0        1423
## 39  JOEL R HENDON                       MI   3.0        1436
## 40  FOREST ZHANG                        MI   3.0        1348
## 41  KYLE WILLIAM MURPHY                 MI   3.0        1403
## 42  JARED GE                            MI   3.0        1332
## 43  ROBERT GLEN VASEY                   MI   3.0        1283
## 44  JUSTIN D SCHILLING                  MI   3.0        1199
## 45  DEREK YAN                           MI   3.0        1242
## 46  JACOB ALEXANDER LAVALLEY            MI   3.0         377
## 47  ERIC WRIGHT                         MI   2.5        1362
## 48  DANIEL KHAIN                        MI   2.5        1382
## 49  MICHAEL J MARTIN                    MI   2.5        1291
## 50  SHIVAM JHA                          MI   2.5        1056
## 51  TEJAS AYYAGARI                      MI   2.5        1011
## 52  ETHAN GUO                           MI   2.5         935
## 53  JOSE C YBARRA                       MI   2.0        1393
## 54  LARRY HODGE                         MI   2.0        1270
## 55  ALEX KONG                           MI   2.0        1186
## 56  MARISA RICCI                        MI   2.0        1153
## 57  MICHAEL LU                          MI   2.0        1092
## 58  VIRAJ MOHILE                        MI   2.0         917
## 59  SEAN M MC CORMICK                   MI   2.0         853
## 60  JULIA SHEN                          MI   1.5         967
## 61  JEZZEL FARKAS                       ON   1.5         955
## 62  ASHWIN BALAJI                       MI   1.0        1530
## 63  THOMAS JOSEPH HOSMER                MI   1.0        1175
## 64  BEN LI                              MI   1.0        1163
##    avg_opponent_score
## 1                1605
## 2                1469
## 3                1563
## 4                1573
## 5                1500
## 6                1518
## 7                1372
## 8                1468
## 9                1523
## 10               1554
## 11               1467
## 12               1506
## 13               1497
## 14               1515
## 15               1483
## 16               1385
## 17               1498
## 18               1480
## 19               1426
## 20               1410
## 21               1470
## 22               1300
## 23               1213
## 24               1357
## 25               1363
## 26               1506
## 27               1221
## 28               1522
## 29               1313
## 30               1144
## 31               1259
## 32               1378
## 33               1276
## 34               1375
## 35               1149
## 36               1388
## 37               1384
## 38               1539
## 39               1429
## 40               1390
## 41               1248
## 42               1149
## 43               1106
## 44               1327
## 45               1152
## 46               1357
## 47               1392
## 48               1355
## 49               1285
## 50               1296
## 51               1356
## 52               1494
## 53               1345
## 54               1206
## 55               1406
## 56               1414
## 57               1363
## 58               1391
## 59               1319
## 60               1330
## 61               1327
## 62               1186
## 63               1350
## 64               1263

Export to CSV

write.csv(dfsr1, file = "chess_data.csv")