Load data into R data frame
mydata <- read.table("https://raw.githubusercontent.com/ajbentley/cuny_ms_ds/master/607/tournamentinfo.txt", header = TRUE, skip = 1, sep = "|", fill=TRUE)
Create vectors for each of the requested data (Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents), then merge them into a dataframe.
namevector <- mydata[['Player.Name']]
name <- namevector[seq(3, length(namevector), 3)]
dfs <- data.frame(name)
head(dfs)
## name
## 1 GARY HUA
## 2 DAKSHESH DARURI
## 3 ADITYA BAJAJ
## 4 PATRICK H SCHILLING
## 5 HANSHI ZUO
## 6 HANSEN SONG
statevector <- mydata[['Pair']]
state <- statevector[seq(4, length(statevector), 3)]
dfs$state <- state
head(dfs)
## name state
## 1 GARY HUA ON
## 2 DAKSHESH DARURI MI
## 3 ADITYA BAJAJ MI
## 4 PATRICK H SCHILLING MI
## 5 HANSHI ZUO MI
## 6 HANSEN SONG OH
pointsvector <- mydata[['Total']]
points <- pointsvector[seq(3, length(statevector), 3)]
dfs$points <- points
head(dfs)
## name state points
## 1 GARY HUA ON 6.0
## 2 DAKSHESH DARURI MI 6.0
## 3 ADITYA BAJAJ MI 6.0
## 4 PATRICK H SCHILLING MI 5.5
## 5 HANSHI ZUO MI 5.5
## 6 HANSEN SONG OH 5.0
playerprerating <- namevector[seq(4, length(namevector), 3)]
plypredf <- data.frame(playerprerating)
plypredf$plypre <- sub('.* R: ', '', plypredf$playerprerating)
plypredf$plypre <- sub('(.{4})(.*)', '\\1', plypredf$plypre)
head(plypredf)
## playerprerating plypre
## 1 15445895 / R: 1794 ->1817 1794
## 2 14598900 / R: 1553 ->1663 1553
## 3 14959604 / R: 1384 ->1640 1384
## 4 12616049 / R: 1716 ->1744 1716
## 5 14601533 / R: 1655 ->1690 1655
## 6 15055204 / R: 1686 ->1687 1686
playernumber <- statevector[seq(3, length(statevector), 3)]
dfs$playernum <- as.numeric(playernumber)
dfs <- dfs[c("playernum", "name", "state", "points")]
dfs$playerrtg <- plypredf$plypre
head(dfs)
## playernum name state points playerrtg
## 1 1 GARY HUA ON 6.0 1794
## 2 2 DAKSHESH DARURI MI 6.0 1553
## 3 3 ADITYA BAJAJ MI 6.0 1384
## 4 4 PATRICK H SCHILLING MI 5.5 1716
## 5 5 HANSHI ZUO MI 5.5 1655
## 6 6 HANSEN SONG OH 5.0 1686
Now I’ve got to add columns with each competitor for each player
r1vector <- mydata[['Round']]
rnd1 <- r1vector[seq(3, length(r1vector), 3)]
rnd1 <- str_extract(rnd1, "\\d+")
r2vector <- mydata[['Round.1']]
rnd2 <- r2vector[seq(3, length(r2vector), 3)]
rnd2 <- str_extract(rnd2, "\\d+")
r3vector <- mydata[['Round.2']]
rnd3 <- r3vector[seq(3, length(r3vector), 3)]
rnd3 <- str_extract(rnd3, "\\d+")
r4vector <- mydata[['Round.3']]
rnd4 <- r4vector[seq(3, length(r4vector), 3)]
rnd4 <- str_extract(rnd4, "\\d+")
r5vector <- mydata[['Round.4']]
rnd5 <- r5vector[seq(3, length(r5vector), 3)]
rnd5 <- str_extract(rnd5, "\\d+")
r6vector <- mydata[['Round.5']]
rnd6 <- r6vector[seq(3, length(r6vector), 3)]
rnd6 <- str_extract(rnd6, "\\d+")
r7vector <- mydata[['Round.6']]
rnd7 <- r7vector[seq(3, length(r7vector), 3)]
rnd7 <- str_extract(rnd7, "\\d+")
dfs$rnd1 <- as.numeric(rnd1)
dfs$rnd2 <- as.numeric(rnd2)
dfs$rnd3 <- as.numeric(rnd3)
dfs$rnd4 <- as.numeric(rnd4)
dfs$rnd5 <- as.numeric(rnd5)
dfs$rnd6 <- as.numeric(rnd6)
dfs$rnd7 <- as.numeric(rnd7)
head(dfs)
## playernum name state points playerrtg rnd1
## 1 1 GARY HUA ON 6.0 1794 39
## 2 2 DAKSHESH DARURI MI 6.0 1553 63
## 3 3 ADITYA BAJAJ MI 6.0 1384 8
## 4 4 PATRICK H SCHILLING MI 5.5 1716 23
## 5 5 HANSHI ZUO MI 5.5 1655 45
## 6 6 HANSEN SONG OH 5.0 1686 34
## rnd2 rnd3 rnd4 rnd5 rnd6 rnd7
## 1 21 18 14 7 12 4
## 2 58 4 17 16 20 7
## 3 61 25 21 11 13 12
## 4 28 2 26 5 19 1
## 5 37 12 13 4 14 17
## 6 29 11 35 10 27 21
Make relatinal df of just the playernumber and playerrating
df_pr <- data.frame(as.numeric(dfs$playernum), as.numeric(dfs$playerrtg))
head(df_pr)
## as.numeric.dfs.playernum. as.numeric.dfs.playerrtg.
## 1 1 1794
## 2 2 1553
## 3 3 1384
## 4 4 1716
## 5 5 1655
## 6 6 1686
Now merge df_pr with dfs on dfs.playernum matching to each rnd column.
#merge the two dayparts on rnd1
dfsr1 <- merge(dfs, df_pr, by.x = "rnd1", by.y = "as.numeric.dfs.playernum.", all.x = TRUE)
dfsr1 <- merge(dfsr1, df_pr, by.x = "rnd2", by.y = "as.numeric.dfs.playernum.", all.x = TRUE, suffix = c(".x",".y"))
# have to clean up titles as we go along
colnames(dfsr1)[colnames(dfsr1)=="as.numeric.dfs.playerrtg..y"] <- "rnd1_oppscore"
colnames(dfsr1)[colnames(dfsr1)=="as.numeric.dfs.playerrtg..x"] <- "rnd2_oppscore"
dfsr1 <- merge(dfsr1, df_pr, by.x = "rnd3", by.y = "as.numeric.dfs.playernum.", all.x = TRUE, suffix = c(".x",".y"))
colnames(dfsr1)[colnames(dfsr1)=="as.numeric.dfs.playerrtg."] <- "rnd3_oppscore"
dfsr1 <- merge(dfsr1, df_pr, by.x = "rnd4", by.y = "as.numeric.dfs.playernum.", all.x = TRUE, suffix = c(".x",".y"))
colnames(dfsr1)[colnames(dfsr1)=="as.numeric.dfs.playerrtg."] <- "rnd4_oppscore"
dfsr1 <- merge(dfsr1, df_pr, by.x = "rnd5", by.y = "as.numeric.dfs.playernum.", all.x = TRUE, suffix = c(".x",".y"))
colnames(dfsr1)[colnames(dfsr1)=="as.numeric.dfs.playerrtg."] <- "rnd5_oppscore"
dfsr1 <- merge(dfsr1, df_pr, by.x = "rnd6", by.y = "as.numeric.dfs.playernum.", all.x = TRUE, suffix = c(".x",".y"))
colnames(dfsr1)[colnames(dfsr1)=="as.numeric.dfs.playerrtg."] <- "rnd6_oppscore"
dfsr1 <- merge(dfsr1, df_pr, by.x = "rnd7", by.y = "as.numeric.dfs.playernum.", all.x = TRUE, suffix = c(".x",".y"))
colnames(dfsr1)[colnames(dfsr1)=="as.numeric.dfs.playerrtg."] <- "rnd7_oppscore"
head(dfsr1)
## rnd7 rnd6 rnd5 rnd4 rnd3 rnd2 rnd1 playernum
## 1 1 19 5 26 2 28 23 4
## 2 2 9 1 11 13 46 57 7
## 3 3 1 NA 38 5 33 42 12
## 4 4 12 7 14 18 21 39 1
## 5 5 22 23 2 26 41 48 17
## 6 6 39 40 3 47 1 43 21
## name state points playerrtg rnd2_oppscore
## 1 PATRICK H SCHILLING MI 5.5 1716 1363
## 2 GARY DEE SWATHELL MI 5.0 1649 1092
## 3 KENNETH J TACK MI 4.5 1663 1332
## 4 GARY HUA ON 6.0 1794 1436
## 5 RONALD GRZEGORCZYK MI 4.0 1629 1382
## 6 DINH DANG BUI ON 4.0 1563 1283
## rnd1_oppscore rnd3_oppscore rnd4_oppscore rnd5_oppscore rnd6_oppscore
## 1 1507 1553 1579 1655 1564
## 2 377 1666 1712 1794 1411
## 3 1449 1655 1423 NA 1794
## 4 1563 1600 1610 1649 1663
## 5 1403 1579 1553 1363 1555
## 6 1794 1362 1384 1348 1436
## rnd7_oppscore
## 1 1794
## 2 1553
## 3 1384
## 4 1716
## 5 1655
## 6 1686
Add a column averaging the scores
dfsr1$avg_opponent_score <- rowMeans(dfsr1[c(13:19)], na.rm=TRUE)
head(dfsr1)
## rnd7 rnd6 rnd5 rnd4 rnd3 rnd2 rnd1 playernum
## 1 1 19 5 26 2 28 23 4
## 2 2 9 1 11 13 46 57 7
## 3 3 1 NA 38 5 33 42 12
## 4 4 12 7 14 18 21 39 1
## 5 5 22 23 2 26 41 48 17
## 6 6 39 40 3 47 1 43 21
## name state points playerrtg rnd2_oppscore
## 1 PATRICK H SCHILLING MI 5.5 1716 1363
## 2 GARY DEE SWATHELL MI 5.0 1649 1092
## 3 KENNETH J TACK MI 4.5 1663 1332
## 4 GARY HUA ON 6.0 1794 1436
## 5 RONALD GRZEGORCZYK MI 4.0 1629 1382
## 6 DINH DANG BUI ON 4.0 1563 1283
## rnd1_oppscore rnd3_oppscore rnd4_oppscore rnd5_oppscore rnd6_oppscore
## 1 1507 1553 1579 1655 1564
## 2 377 1666 1712 1794 1411
## 3 1449 1655 1423 NA 1794
## 4 1563 1600 1610 1649 1663
## 5 1403 1579 1553 1363 1555
## 6 1794 1362 1384 1348 1436
## rnd7_oppscore avg_opponent_score
## 1 1794 1573.571
## 2 1553 1372.143
## 3 1384 1506.167
## 4 1716 1605.286
## 5 1655 1498.571
## 6 1686 1470.429
Order in the data frame!
dfsr1 <- dfsr1[order(dfsr1$playernum),]
rownames(dfsr1) <- 1:nrow(dfsr1)
head(dfsr1)
## rnd7 rnd6 rnd5 rnd4 rnd3 rnd2 rnd1 playernum
## 1 4 12 7 14 18 21 39 1
## 2 7 20 16 17 4 58 63 2
## 3 12 13 11 21 25 61 8 3
## 4 1 19 5 26 2 28 23 4
## 5 17 14 4 13 12 37 45 5
## 6 21 27 10 35 11 29 34 6
## name state points playerrtg rnd2_oppscore
## 1 GARY HUA ON 6.0 1794 1436
## 2 DAKSHESH DARURI MI 6.0 1553 1175
## 3 ADITYA BAJAJ MI 6.0 1384 1641
## 4 PATRICK H SCHILLING MI 5.5 1716 1363
## 5 HANSHI ZUO MI 5.5 1655 1242
## 6 HANSEN SONG OH 5.0 1686 1399
## rnd1_oppscore rnd3_oppscore rnd4_oppscore rnd5_oppscore rnd6_oppscore
## 1 1563 1600 1610 1649 1663
## 2 917 1716 1629 1604 1595
## 3 955 1745 1563 1712 1666
## 4 1507 1553 1579 1655 1564
## 5 980 1663 1666 1716 1610
## 6 1602 1712 1438 1365 1552
## rnd7_oppscore avg_opponent_score
## 1 1716 1605.286
## 2 1649 1469.286
## 3 1663 1563.571
## 4 1794 1573.571
## 5 1629 1500.857
## 6 1563 1518.714
Time for cleanup!
#removing columns not needed
dfsr1 <- dfsr1[,-c(1:8)]
dfsr1 <- dfsr1[,-c(5:11)]
# making the average opponent score an integer, as intdicated in the assignment
dfsr1$avg_opponent_score <- as.integer(dfsr1$avg_opponent_score)
Finished product
dfsr1
## name state points playerrtg
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## 7 GARY DEE SWATHELL MI 5.0 1649
## 8 EZEKIEL HOUGHTON MI 5.0 1641
## 9 STEFANO LEE ON 5.0 1411
## 10 ANVIT RAO MI 5.0 1365
## 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712
## 12 KENNETH J TACK MI 4.5 1663
## 13 TORRANCE HENRY JR MI 4.5 1666
## 14 BRADLEY SHAW MI 4.5 1610
## 15 ZACHARY JAMES HOUGHTON MI 4.5 1220
## 16 MIKE NIKITIN MI 4.0 1604
## 17 RONALD GRZEGORCZYK MI 4.0 1629
## 18 DAVID SUNDEEN MI 4.0 1600
## 19 DIPANKAR ROY MI 4.0 1564
## 20 JASON ZHENG MI 4.0 1595
## 21 DINH DANG BUI ON 4.0 1563
## 22 EUGENE L MCCLURE MI 4.0 1555
## 23 ALAN BUI ON 4.0 1363
## 24 MICHAEL R ALDRICH MI 4.0 1229
## 25 LOREN SCHWIEBERT MI 3.5 1745
## 26 MAX ZHU ON 3.5 1579
## 27 GAURAV GIDWANI MI 3.5 1552
## 28 SOFIA ADINA STANESCU-BELLU MI 3.5 1507
## 29 CHIEDOZIE OKORIE MI 3.5 1602
## 30 GEORGE AVERY JONES ON 3.5 1522
## 31 RISHI SHETTY MI 3.5 1494
## 32 JOSHUA PHILIP MATHEWS ON 3.5 1441
## 33 JADE GE MI 3.5 1449
## 34 MICHAEL JEFFERY THOMAS MI 3.5 1399
## 35 JOSHUA DAVID LEE MI 3.5 1438
## 36 SIDDHARTH JHA MI 3.5 1355
## 37 AMIYATOSH PWNANANDAM MI 3.5 980
## 38 BRIAN LIU MI 3.0 1423
## 39 JOEL R HENDON MI 3.0 1436
## 40 FOREST ZHANG MI 3.0 1348
## 41 KYLE WILLIAM MURPHY MI 3.0 1403
## 42 JARED GE MI 3.0 1332
## 43 ROBERT GLEN VASEY MI 3.0 1283
## 44 JUSTIN D SCHILLING MI 3.0 1199
## 45 DEREK YAN MI 3.0 1242
## 46 JACOB ALEXANDER LAVALLEY MI 3.0 377
## 47 ERIC WRIGHT MI 2.5 1362
## 48 DANIEL KHAIN MI 2.5 1382
## 49 MICHAEL J MARTIN MI 2.5 1291
## 50 SHIVAM JHA MI 2.5 1056
## 51 TEJAS AYYAGARI MI 2.5 1011
## 52 ETHAN GUO MI 2.5 935
## 53 JOSE C YBARRA MI 2.0 1393
## 54 LARRY HODGE MI 2.0 1270
## 55 ALEX KONG MI 2.0 1186
## 56 MARISA RICCI MI 2.0 1153
## 57 MICHAEL LU MI 2.0 1092
## 58 VIRAJ MOHILE MI 2.0 917
## 59 SEAN M MC CORMICK MI 2.0 853
## 60 JULIA SHEN MI 1.5 967
## 61 JEZZEL FARKAS ON 1.5 955
## 62 ASHWIN BALAJI MI 1.0 1530
## 63 THOMAS JOSEPH HOSMER MI 1.0 1175
## 64 BEN LI MI 1.0 1163
## avg_opponent_score
## 1 1605
## 2 1469
## 3 1563
## 4 1573
## 5 1500
## 6 1518
## 7 1372
## 8 1468
## 9 1523
## 10 1554
## 11 1467
## 12 1506
## 13 1497
## 14 1515
## 15 1483
## 16 1385
## 17 1498
## 18 1480
## 19 1426
## 20 1410
## 21 1470
## 22 1300
## 23 1213
## 24 1357
## 25 1363
## 26 1506
## 27 1221
## 28 1522
## 29 1313
## 30 1144
## 31 1259
## 32 1378
## 33 1276
## 34 1375
## 35 1149
## 36 1388
## 37 1384
## 38 1539
## 39 1429
## 40 1390
## 41 1248
## 42 1149
## 43 1106
## 44 1327
## 45 1152
## 46 1357
## 47 1392
## 48 1355
## 49 1285
## 50 1296
## 51 1356
## 52 1494
## 53 1345
## 54 1206
## 55 1406
## 56 1414
## 57 1363
## 58 1391
## 59 1319
## 60 1330
## 61 1327
## 62 1186
## 63 1350
## 64 1263
Export to CSV
write.csv(dfsr1, file = "chess_data.csv")