Given a slightly structured text file with chess tournament results, use R to clean the data and calculate a new field for each competitor. The final file should be available to export into a .CSV file suitable for uploading to a SQL database.
I import the file from my working directory with the read.table function, using the options to specify that there is no header and that the delimiter for the fields is the “|” symbol. The fill option fills in columns with “NA” values when lines of the text file do not have the correct amount of delimiters. I specify that I do not want the strings read into the table as factors as well (this will allow for easier calculations later).
dataChess <- read.table("607pro1info.txt", header = FALSE, sep = '|', fill = TRUE, stringsAsFactors = FALSE)
head(dataChess)## V1
## 1 -----------------------------------------------------------------------------------------
## 2 Pair
## 3 Num
## 4 -----------------------------------------------------------------------------------------
## 5 1
## 6 ON
## V2 V3 V4 V5 V6 V7 V8
## 1
## 2 Player Name Total Round Round Round Round Round
## 3 USCF ID / Rtg (Pre->Post) Pts 1 2 3 4 5
## 4
## 5 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7
## 6 15445895 / R: 1794 ->1817 N:2 W B W B W
## V9 V10 V11
## 1 NA
## 2 Round Round NA
## 3 6 7 NA
## 4 NA
## 5 D 12 D 4 NA
## 6 B W NA
names(dataChess)## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11"
dataChess$V1[dataChess$V1 == "-----------------------------------------------------------------------------------------"] <- NA ## clean-up to make the file easier to read
dataChess## V1 V2 V3 V4 V5 V6 V7
## 1 <NA>
## 2 Pair Player Name Total Round Round Round Round
## 3 Num USCF ID / Rtg (Pre->Post) Pts 1 2 3 4
## 4 <NA>
## 5 1 GARY HUA 6.0 W 39 W 21 W 18 W 14
## 6 ON 15445895 / R: 1794 ->1817 N:2 W B W B
## 7 <NA>
## 8 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17
## 9 MI 14598900 / R: 1553 ->1663 N:2 B W B W
## 10 <NA>
## 11 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21
## 12 MI 14959604 / R: 1384 ->1640 N:2 W B W B
## 13 <NA>
## 14 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26
## 15 MI 12616049 / R: 1716 ->1744 N:2 W B W B
## 16 <NA>
## 17 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13
## 18 MI 14601533 / R: 1655 ->1690 N:2 B W B W
## 19 <NA>
## 20 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35
## 21 OH 15055204 / R: 1686 ->1687 N:3 W B W B
## 22 <NA>
## 23 7 GARY DEE SWATHELL 5.0 W 57 W 46 W 13 W 11
## 24 MI 11146376 / R: 1649 ->1673 N:3 W B W B
## 25 <NA>
## 26 8 EZEKIEL HOUGHTON 5.0 W 3 W 32 L 14 L 9
## 27 MI 15142253 / R: 1641P17->1657P24 N:3 B W B W
## 28 <NA>
## 29 9 STEFANO LEE 5.0 W 25 L 18 W 59 W 8
## 30 ON 14954524 / R: 1411 ->1564 N:2 W B W B
## 31 <NA>
## 32 10 ANVIT RAO 5.0 D 16 L 19 W 55 W 31
## 33 MI 14150362 / R: 1365 ->1544 N:3 W W B B
## 34 <NA>
## 35 11 CAMERON WILLIAM MC LEMAN 4.5 D 38 W 56 W 6 L 7
## 36 MI 12581589 / R: 1712 ->1696 N:3 B W B W
## 37 <NA>
## 38 12 KENNETH J TACK 4.5 W 42 W 33 D 5 W 38
## 39 MI 12681257 / R: 1663 ->1670 N:3 W B W B
## 40 <NA>
## 41 13 TORRANCE HENRY JR 4.5 W 36 W 27 L 7 D 5
## 42 MI 15082995 / R: 1666 ->1662 N:3 B W B B
## 43 <NA>
## 44 14 BRADLEY SHAW 4.5 W 54 W 44 W 8 L 1
## 45 MI 10131499 / R: 1610 ->1618 N:3 W B W W
## 46 <NA>
## 47 15 ZACHARY JAMES HOUGHTON 4.5 D 19 L 16 W 30 L 22
## 48 MI 15619130 / R: 1220P13->1416P20 N:3 B B W W
## 49 <NA>
## 50 16 MIKE NIKITIN 4.0 D 10 W 15 H W 39
## 51 MI 10295068 / R: 1604 ->1613 N:3 B W B
## 52 <NA>
## 53 17 RONALD GRZEGORCZYK 4.0 W 48 W 41 L 26 L 2
## 54 MI 10297702 / R: 1629 ->1610 N:3 W B W B
## 55 <NA>
## 56 18 DAVID SUNDEEN 4.0 W 47 W 9 L 1 W 32
## 57 MI 11342094 / R: 1600 ->1600 N:3 B W B W
## 58 <NA>
## 59 19 DIPANKAR ROY 4.0 D 15 W 10 W 52 D 28
## 60 MI 14862333 / R: 1564 ->1570 N:3 W B W B
## 61 <NA>
## 62 20 JASON ZHENG 4.0 L 40 W 49 W 23 W 41
## 63 MI 14529060 / R: 1595 ->1569 N:4 W B W B
## 64 <NA>
## 65 21 DINH DANG BUI 4.0 W 43 L 1 W 47 L 3
## 66 ON 15495066 / R: 1563P22->1562 N:3 B W B W
## 67 <NA>
## 68 22 EUGENE L MCCLURE 4.0 W 64 D 52 L 28 W 15
## 69 MI 12405534 / R: 1555 ->1529 N:4 W B W B
## 70 <NA>
## 71 23 ALAN BUI 4.0 L 4 W 43 L 20 W 58
## 72 ON 15030142 / R: 1363 ->1371 B W B W
## 73 <NA>
## 74 24 MICHAEL R ALDRICH 4.0 L 28 L 47 W 43 L 25
## 75 MI 13469010 / R: 1229 ->1300 N:4 B W B B
## 76 <NA>
## 77 25 LOREN SCHWIEBERT 3.5 L 9 W 53 L 3 W 24
## 78 MI 12486656 / R: 1745 ->1681 N:4 B W B W
## 79 <NA>
## 80 26 MAX ZHU 3.5 W 49 W 40 W 17 L 4
## 81 ON 15131520 / R: 1579 ->1564 N:4 B W B W
## 82 <NA>
## 83 27 GAURAV GIDWANI 3.5 W 51 L 13 W 46 W 37
## 84 MI 14476567 / R: 1552 ->1539 N:4 W B W B
## 85 <NA>
## 86 28 SOFIA ADINA STANESCU-BELLU 3.5 W 24 D 4 W 22 D 19
## 87 MI 14882954 / R: 1507 ->1513 N:3 W W B W
## 88 <NA>
## 89 29 CHIEDOZIE OKORIE 3.5 W 50 D 6 L 38 L 34
## 90 MI 15323285 / R: 1602P6 ->1508P12 N:4 B W B W
## 91 <NA>
## 92 30 GEORGE AVERY JONES 3.5 L 52 D 64 L 15 W 55
## 93 ON 12577178 / R: 1522 ->1444 W B B W
## 94 <NA>
## 95 31 RISHI SHETTY 3.5 L 58 D 55 W 64 L 10
## 96 MI 15131618 / R: 1494 ->1444 B W B W
## 97 <NA>
## 98 32 JOSHUA PHILIP MATHEWS 3.5 W 61 L 8 W 44 L 18
## 99 ON 14073750 / R: 1441 ->1433 N:4 W B W B
## 100 <NA>
## 101 33 JADE GE 3.5 W 60 L 12 W 50 D 36
## 102 MI 14691842 / R: 1449 ->1421 B W B W
## 103 <NA>
## 104 34 MICHAEL JEFFERY THOMAS 3.5 L 6 W 60 L 37 W 29
## 105 MI 15051807 / R: 1399 ->1400 B W B B
## 106 <NA>
## 107 35 JOSHUA DAVID LEE 3.5 L 46 L 38 W 56 L 6
## 108 MI 14601397 / R: 1438 ->1392 W W B W
## 109 <NA>
## 110 36 SIDDHARTH JHA 3.5 L 13 W 57 W 51 D 33
## 111 MI 14773163 / R: 1355 ->1367 N:4 W B W B
## 112 <NA>
## 113 37 AMIYATOSH PWNANANDAM 3.5 B L 5 W 34 L 27
## 114 MI 15489571 / R: 980P12->1077P17 B W W
## 115 <NA>
## 116 38 BRIAN LIU 3.0 D 11 W 35 W 29 L 12
## 117 MI 15108523 / R: 1423 ->1439 N:4 W B W W
## 118 <NA>
## 119 39 JOEL R HENDON 3.0 L 1 W 54 W 40 L 16
## 120 MI 12923035 / R: 1436P23->1413 N:4 B W B W
## 121 <NA>
## 122 40 FOREST ZHANG 3.0 W 20 L 26 L 39 W 59
## 123 MI 14892710 / R: 1348 ->1346 B B W W
## 124 <NA>
## 125 41 KYLE WILLIAM MURPHY 3.0 W 59 L 17 W 58 L 20
## 126 MI 15761443 / R: 1403P5 ->1341P9 B W B W
## 127 <NA>
## 128 42 JARED GE 3.0 L 12 L 50 L 57 D 60
## 129 MI 14462326 / R: 1332 ->1256 B W B B
## 130 <NA>
## 131 43 ROBERT GLEN VASEY 3.0 L 21 L 23 L 24 W 63
## 132 MI 14101068 / R: 1283 ->1244 W B W W
## 133 <NA>
## 134 44 JUSTIN D SCHILLING 3.0 B L 14 L 32 W 53
## 135 MI 15323504 / R: 1199 ->1199 W B B
## 136 <NA>
## 137 45 DEREK YAN 3.0 L 5 L 51 D 60 L 56
## 138 MI 15372807 / R: 1242 ->1191 W B W B
## 139 <NA>
## 140 46 JACOB ALEXANDER LAVALLEY 3.0 W 35 L 7 L 27 L 50
## 141 MI 15490981 / R: 377P3 ->1076P10 B W B W
## 142 <NA>
## 143 47 ERIC WRIGHT 2.5 L 18 W 24 L 21 W 61
## 144 MI 12533115 / R: 1362 ->1341 W B W B
## 145 <NA>
## 146 48 DANIEL KHAIN 2.5 L 17 W 63 H D 52
## 147 MI 14369165 / R: 1382 ->1335 B W B
## 148 <NA>
## 149 49 MICHAEL J MARTIN 2.5 L 26 L 20 D 63 D 64
## 150 MI 12531685 / R: 1291P12->1259P17 W W B W
## 151 <NA>
## 152 50 SHIVAM JHA 2.5 L 29 W 42 L 33 W 46
## 153 MI 14773178 / R: 1056 ->1111 W B W B
## 154 <NA>
## 155 51 TEJAS AYYAGARI 2.5 L 27 W 45 L 36 W 57
## 156 MI 15205474 / R: 1011 ->1097 B W B W
## 157 <NA>
## 158 52 ETHAN GUO 2.5 W 30 D 22 L 19 D 48
## 159 MI 14918803 / R: 935 ->1092 N:4 B W B W
## 160 <NA>
## 161 53 JOSE C YBARRA 2.0 H L 25 H L 44
## 162 MI 12578849 / R: 1393 ->1359 B W
## 163 <NA>
## 164 54 LARRY HODGE 2.0 L 14 L 39 L 61 B
## 165 MI 12836773 / R: 1270 ->1200 B B W
## 166 <NA>
## 167 55 ALEX KONG 2.0 L 62 D 31 L 10 L 30
## 168 MI 15412571 / R: 1186 ->1163 W B W B
## 169 <NA>
## 170 56 MARISA RICCI 2.0 H L 11 L 35 W 45
## 171 MI 14679887 / R: 1153 ->1140 B W W
## 172 <NA>
## 173 57 MICHAEL LU 2.0 L 7 L 36 W 42 L 51
## 174 MI 15113330 / R: 1092 ->1079 B W W B
## 175 <NA>
## 176 58 VIRAJ MOHILE 2.0 W 31 L 2 L 41 L 23
## 177 MI 14700365 / R: 917 -> 941 W B W B
## 178 <NA>
## 179 59 SEAN M MC CORMICK 2.0 L 41 B L 9 L 40
## 180 MI 12841036 / R: 853 -> 878 W B B
## 181 <NA>
## 182 60 JULIA SHEN 1.5 L 33 L 34 D 45 D 42
## 183 MI 14579262 / R: 967 -> 984 W B B W
## 184 <NA>
## 185 61 JEZZEL FARKAS 1.5 L 32 L 3 W 54 L 47
## 186 ON 15771592 / R: 955P11-> 979P18 B W B W
## 187 <NA>
## 188 62 ASHWIN BALAJI 1.0 W 55 U U U
## 189 MI 15219542 / R: 1530 ->1535 B
## 190 <NA>
## 191 63 THOMAS JOSEPH HOSMER 1.0 L 2 L 48 D 49 L 43
## 192 MI 15057092 / R: 1175 ->1125 W B W B
## 193 <NA>
## 194 64 BEN LI 1.0 L 22 D 30 L 31 D 49
## 195 MI 15006561 / R: 1163 ->1112 B W W B
## 196 <NA>
## V8 V9 V10 V11
## 1 NA
## 2 Round Round Round NA
## 3 5 6 7 NA
## 4 NA
## 5 W 7 D 12 D 4 NA
## 6 W B W NA
## 7 NA
## 8 W 16 W 20 W 7 NA
## 9 B W B NA
## 10 NA
## 11 W 11 W 13 W 12 NA
## 12 W B W NA
## 13 NA
## 14 D 5 W 19 D 1 NA
## 15 W B B NA
## 16 NA
## 17 D 4 W 14 W 17 NA
## 18 B W B NA
## 19 NA
## 20 D 10 W 27 W 21 NA
## 21 B W B NA
## 22 NA
## 23 L 1 W 9 L 2 NA
## 24 B W W NA
## 25 NA
## 26 W 47 W 28 W 19 NA
## 27 B W W NA
## 28 NA
## 29 W 26 L 7 W 20 NA
## 30 W B B NA
## 31 NA
## 32 D 6 W 25 W 18 NA
## 33 W B W NA
## 34 NA
## 35 L 3 W 34 W 26 NA
## 36 B W B NA
## 37 NA
## 38 H D 1 L 3 NA
## 39 W B NA
## 40 NA
## 41 W 33 L 3 W 32 NA
## 42 W W B NA
## 43 NA
## 44 D 27 L 5 W 31 NA
## 45 B B W NA
## 46 NA
## 47 W 54 W 33 W 38 NA
## 48 B B W NA
## 49 NA
## 50 L 2 W 36 U NA
## 51 W B NA
## 52 NA
## 53 W 23 W 22 L 5 NA
## 54 W B W NA
## 55 NA
## 56 L 19 W 38 L 10 NA
## 57 B W B NA
## 58 NA
## 59 W 18 L 4 L 8 NA
## 60 W W B NA
## 61 NA
## 62 W 28 L 2 L 9 NA
## 63 W B W NA
## 64 NA
## 65 W 40 W 39 L 6 NA
## 66 W B W NA
## 67 NA
## 68 H L 17 W 40 NA
## 69 W B NA
## 70 NA
## 71 L 17 W 37 W 46 NA
## 72 B W B NA
## 73 NA
## 74 W 60 W 44 W 39 NA
## 75 W W B NA
## 76 NA
## 77 D 34 L 10 W 47 NA
## 78 B W B NA
## 79 NA
## 80 L 9 D 32 L 11 NA
## 81 B W W NA
## 82 NA
## 83 D 14 L 6 U NA
## 84 W B NA
## 85 NA
## 86 L 20 L 8 D 36 NA
## 87 B B W NA
## 88 NA
## 89 W 52 W 48 U NA
## 90 W B NA
## 91 NA
## 92 L 31 W 61 W 50 NA
## 93 W B B NA
## 94 NA
## 95 W 30 W 50 L 14 NA
## 96 B W B NA
## 97 NA
## 98 W 51 D 26 L 13 NA
## 99 W B W NA
## 100 NA
## 101 L 13 L 15 W 51 NA
## 102 B W B NA
## 103 NA
## 104 D 25 L 11 W 52 NA
## 105 W B W NA
## 106 NA
## 107 W 57 D 52 W 48 NA
## 108 B B W NA
## 109 NA
## 110 H L 16 D 28 NA
## 111 W B NA
## 112 NA
## 113 H L 23 W 61 NA
## 114 B W NA
## 115 NA
## 116 H L 18 L 15 NA
## 117 B B NA
## 118 NA
## 119 W 44 L 21 L 24 NA
## 120 B W W NA
## 121 NA
## 122 L 21 W 56 L 22 NA
## 123 B W W NA
## 124 NA
## 125 X U U NA
## 126 NA
## 127 NA
## 128 D 61 W 64 W 56 NA
## 129 W W B NA
## 130 NA
## 131 W 59 L 46 W 55 NA
## 132 B B W NA
## 133 NA
## 134 L 39 L 24 W 59 NA
## 135 W B W NA
## 136 NA
## 137 W 63 D 55 W 58 NA
## 138 W B W NA
## 139 NA
## 140 W 64 W 43 L 23 NA
## 141 B W W NA
## 142 NA
## 143 L 8 D 51 L 25 NA
## 144 W B W NA
## 145 NA
## 146 H L 29 L 35 NA
## 147 W B NA
## 148 NA
## 149 W 58 H U NA
## 150 B NA
## 151 NA
## 152 H L 31 L 30 NA
## 153 B W NA
## 154 NA
## 155 L 32 D 47 L 33 NA
## 156 B W W NA
## 157 NA
## 158 L 29 D 35 L 34 NA
## 159 B W B NA
## 160 NA
## 161 U W 57 U NA
## 162 W NA
## 163 NA
## 164 L 15 L 59 W 64 NA
## 165 W B W NA
## 166 NA
## 167 B D 45 L 43 NA
## 168 W B NA
## 169 NA
## 170 H L 40 L 42 NA
## 171 B W NA
## 172 NA
## 173 L 35 L 53 B NA
## 174 W B NA
## 175 NA
## 176 L 49 B L 45 NA
## 177 W B NA
## 178 NA
## 179 L 43 W 54 L 44 NA
## 180 W W B NA
## 181 NA
## 182 L 24 H U NA
## 183 B NA
## 184 NA
## 185 D 42 L 30 L 37 NA
## 186 B W B NA
## 187 NA
## 188 U U U NA
## 189 NA
## 190 NA
## 191 L 45 H U NA
## 192 B NA
## 193 NA
## 194 L 46 L 42 L 54 NA
## 195 W B B NA
## 196 NA
I use regular expressions in certain columns of the dataChess data frame to extract the desired values for the final data frame.
Observing the raw data, I note that the player names live in the second column and are formatted in upper case letters. Some entries also contain hyphens. The names all contain at least two parts, but some have up to four parts. The regular expression has two names mandatory and the final two names as optional components.
After extracting the initial vector, I drop the first entry (“USCF ID”), which is not a name but part of the raw data column header. I am left with a vector containing the 64 player names.
library(stringr)
playerName <- unlist(str_extract_all(dataChess$V2, "[A-Z]{1,} [A-Z]{1,}( [A-Z-]{1,})*( [A-Z-]{1,})*"))
playerName## [1] "USCF ID" "GARY HUA"
## [3] "DAKSHESH DARURI" "ADITYA BAJAJ"
## [5] "PATRICK H SCHILLING" "HANSHI ZUO"
## [7] "HANSEN SONG" "GARY DEE SWATHELL"
## [9] "EZEKIEL HOUGHTON" "STEFANO LEE"
## [11] "ANVIT RAO" "CAMERON WILLIAM MC LEMAN"
## [13] "KENNETH J TACK" "TORRANCE HENRY JR"
## [15] "BRADLEY SHAW" "ZACHARY JAMES HOUGHTON"
## [17] "MIKE NIKITIN" "RONALD GRZEGORCZYK"
## [19] "DAVID SUNDEEN" "DIPANKAR ROY"
## [21] "JASON ZHENG" "DINH DANG BUI"
## [23] "EUGENE L MCCLURE" "ALAN BUI"
## [25] "MICHAEL R ALDRICH" "LOREN SCHWIEBERT"
## [27] "MAX ZHU" "GAURAV GIDWANI"
## [29] "SOFIA ADINA STANESCU-BELLU" "CHIEDOZIE OKORIE"
## [31] "GEORGE AVERY JONES" "RISHI SHETTY"
## [33] "JOSHUA PHILIP MATHEWS" "JADE GE"
## [35] "MICHAEL JEFFERY THOMAS" "JOSHUA DAVID LEE"
## [37] "SIDDHARTH JHA" "AMIYATOSH PWNANANDAM"
## [39] "BRIAN LIU" "JOEL R HENDON"
## [41] "FOREST ZHANG" "KYLE WILLIAM MURPHY"
## [43] "JARED GE" "ROBERT GLEN VASEY"
## [45] "JUSTIN D SCHILLING" "DEREK YAN"
## [47] "JACOB ALEXANDER LAVALLEY" "ERIC WRIGHT"
## [49] "DANIEL KHAIN" "MICHAEL J MARTIN"
## [51] "SHIVAM JHA" "TEJAS AYYAGARI"
## [53] "ETHAN GUO" "JOSE C YBARRA"
## [55] "LARRY HODGE" "ALEX KONG"
## [57] "MARISA RICCI" "MICHAEL LU"
## [59] "VIRAJ MOHILE" "SEAN M MC CORMICK"
## [61] "JULIA SHEN" "JEZZEL FARKAS"
## [63] "ASHWIN BALAJI" "THOMAS JOSEPH HOSMER"
## [65] "BEN LI"
playerName <- playerName[-1]
playerName## [1] "GARY HUA" "DAKSHESH DARURI"
## [3] "ADITYA BAJAJ" "PATRICK H SCHILLING"
## [5] "HANSHI ZUO" "HANSEN SONG"
## [7] "GARY DEE SWATHELL" "EZEKIEL HOUGHTON"
## [9] "STEFANO LEE" "ANVIT RAO"
## [11] "CAMERON WILLIAM MC LEMAN" "KENNETH J TACK"
## [13] "TORRANCE HENRY JR" "BRADLEY SHAW"
## [15] "ZACHARY JAMES HOUGHTON" "MIKE NIKITIN"
## [17] "RONALD GRZEGORCZYK" "DAVID SUNDEEN"
## [19] "DIPANKAR ROY" "JASON ZHENG"
## [21] "DINH DANG BUI" "EUGENE L MCCLURE"
## [23] "ALAN BUI" "MICHAEL R ALDRICH"
## [25] "LOREN SCHWIEBERT" "MAX ZHU"
## [27] "GAURAV GIDWANI" "SOFIA ADINA STANESCU-BELLU"
## [29] "CHIEDOZIE OKORIE" "GEORGE AVERY JONES"
## [31] "RISHI SHETTY" "JOSHUA PHILIP MATHEWS"
## [33] "JADE GE" "MICHAEL JEFFERY THOMAS"
## [35] "JOSHUA DAVID LEE" "SIDDHARTH JHA"
## [37] "AMIYATOSH PWNANANDAM" "BRIAN LIU"
## [39] "JOEL R HENDON" "FOREST ZHANG"
## [41] "KYLE WILLIAM MURPHY" "JARED GE"
## [43] "ROBERT GLEN VASEY" "JUSTIN D SCHILLING"
## [45] "DEREK YAN" "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT" "DANIEL KHAIN"
## [49] "MICHAEL J MARTIN" "SHIVAM JHA"
## [51] "TEJAS AYYAGARI" "ETHAN GUO"
## [53] "JOSE C YBARRA" "LARRY HODGE"
## [55] "ALEX KONG" "MARISA RICCI"
## [57] "MICHAEL LU" "VIRAJ MOHILE"
## [59] "SEAN M MC CORMICK" "JULIA SHEN"
## [61] "JEZZEL FARKAS" "ASHWIN BALAJI"
## [63] "THOMAS JOSEPH HOSMER" "BEN LI"
Extracting two-digit upper case values from the first column using regular expressions returns the state values as well as “NA” values. Removing the “NA” values returns the correct vector of state names (length 64).
playerState <- unlist(str_extract_all(dataChess$V1, "[A-Z]{2}"))
playerState## [1] NA NA "ON" NA "MI" NA "MI" NA "MI" NA "MI" NA "OH" NA
## [15] "MI" NA "MI" NA "ON" NA "MI" NA "MI" NA "MI" NA "MI" NA
## [29] "MI" NA "MI" NA "MI" NA "MI" NA "MI" NA "MI" NA "MI" NA
## [43] "ON" NA "MI" NA "ON" NA "MI" NA "MI" NA "ON" NA "MI" NA
## [57] "MI" NA "MI" NA "ON" NA "MI" NA "ON" NA "MI" NA "MI" NA
## [71] "MI" NA "MI" NA "MI" NA "MI" NA "MI" NA "MI" NA "MI" NA
## [85] "MI" NA "MI" NA "MI" NA "MI" NA "MI" NA "MI" NA "MI" NA
## [99] "MI" NA "MI" NA "MI" NA "MI" NA "MI" NA "MI" NA "MI" NA
## [113] "MI" NA "MI" NA "MI" NA "MI" NA "MI" NA "ON" NA "MI" NA
## [127] "MI" NA "MI" NA
playerState <- playerState[!is.na(playerState)]
playerState## [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"
All of the values for points are formatted exactly the same, so one regular expression for a digit followed by a period followed by another digit calculated against the third column returns the desired vector.
totalPoints <- unlist(str_extract_all(dataChess$V3, "\\d\\.\\d"))
totalPoints## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" "4.5"
## [12] "4.5" "4.5" "4.5" "4.5" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0"
## [23] "4.0" "4.0" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5"
## [34] "3.5" "3.5" "3.5" "3.5" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0"
## [45] "3.0" "3.0" "2.5" "2.5" "2.5" "2.5" "2.5" "2.5" "2.0" "2.0" "2.0"
## [56] "2.0" "2.0" "2.0" "2.0" "1.5" "1.5" "1.0" "1.0" "1.0"
The pre-rating for all the players is prefaced with “R:” so the first regular expression finds these values including the “R:” - a second regular expression removes the “R:” and leaves only the numerical ranking.
preRating <- unlist(str_extract_all(dataChess$V2, "R: {1,2}\\d{3,4}"))
preRating## [1] "R: 1794" "R: 1553" "R: 1384" "R: 1716" "R: 1655" "R: 1686" "R: 1649"
## [8] "R: 1641" "R: 1411" "R: 1365" "R: 1712" "R: 1663" "R: 1666" "R: 1610"
## [15] "R: 1220" "R: 1604" "R: 1629" "R: 1600" "R: 1564" "R: 1595" "R: 1563"
## [22] "R: 1555" "R: 1363" "R: 1229" "R: 1745" "R: 1579" "R: 1552" "R: 1507"
## [29] "R: 1602" "R: 1522" "R: 1494" "R: 1441" "R: 1449" "R: 1399" "R: 1438"
## [36] "R: 1355" "R: 980" "R: 1423" "R: 1436" "R: 1348" "R: 1403" "R: 1332"
## [43] "R: 1283" "R: 1199" "R: 1242" "R: 377" "R: 1362" "R: 1382" "R: 1291"
## [50] "R: 1056" "R: 1011" "R: 935" "R: 1393" "R: 1270" "R: 1186" "R: 1153"
## [57] "R: 1092" "R: 917" "R: 853" "R: 967" "R: 955" "R: 1530" "R: 1175"
## [64] "R: 1163"
preRating <- unlist(str_extract_all(preRating, "\\d{3,4}"))
preRating## [1] "1794" "1553" "1384" "1716" "1655" "1686" "1649" "1641" "1411" "1365"
## [11] "1712" "1663" "1666" "1610" "1220" "1604" "1629" "1600" "1564" "1595"
## [21] "1563" "1555" "1363" "1229" "1745" "1579" "1552" "1507" "1602" "1522"
## [31] "1494" "1441" "1449" "1399" "1438" "1355" "980" "1423" "1436" "1348"
## [41] "1403" "1332" "1283" "1199" "1242" "377" "1362" "1382" "1291" "1056"
## [51] "1011" "935" "1393" "1270" "1186" "1153" "1092" "917" "853" "967"
## [61] "955" "1530" "1175" "1163"
To begin assembling the final desired data frame, I use the cbind.data.frame function to compile the vectors calculated above into a preliminary data frame. I also add a column for the playerNumber at the front of the data frame to be referenced later when calculating the average opponent ranking.
newChess <- cbind.data.frame(playerNumber = c(1:length(playerName)), playerName, playerState, totalPoints, preRating, stringsAsFactors = FALSE)However, before I can calculate the final column of average opponent rankings for the output file, I need to modify the columns containing the opponent numbers into a usable format. To start, I add a row index column to the raw data file, and then subset with a modulo function to return only the rows containing the opponent numbers. I remove the first row (which was part of the raw data header) and also rename the columns.
rowIndex <- c(1:length(dataChess$V1))
dataChess <- cbind.data.frame(rowIndex, dataChess, stringsAsFactors = FALSE)
opponents <- dataChess[dataChess$rowIndex %% 3 == 2, ] ## modulo function to return rows with opponent numbers
opponents <- opponents[-1, c(3,5:11)] ## drop first row with header info
names(opponents) <- c("Name", "Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7")
row.names(opponents) <- c(1:length(opponents$Name))
opponents## Name Round1 Round2 Round3 Round4 Round5
## 1 GARY HUA W 39 W 21 W 18 W 14 W 7
## 2 DAKSHESH DARURI W 63 W 58 L 4 W 17 W 16
## 3 ADITYA BAJAJ L 8 W 61 W 25 W 21 W 11
## 4 PATRICK H SCHILLING W 23 D 28 W 2 W 26 D 5
## 5 HANSHI ZUO W 45 W 37 D 12 D 13 D 4
## 6 HANSEN SONG W 34 D 29 L 11 W 35 D 10
## 7 GARY DEE SWATHELL W 57 W 46 W 13 W 11 L 1
## 8 EZEKIEL HOUGHTON W 3 W 32 L 14 L 9 W 47
## 9 STEFANO LEE W 25 L 18 W 59 W 8 W 26
## 10 ANVIT RAO D 16 L 19 W 55 W 31 D 6
## 11 CAMERON WILLIAM MC LEMAN D 38 W 56 W 6 L 7 L 3
## 12 KENNETH J TACK W 42 W 33 D 5 W 38 H
## 13 TORRANCE HENRY JR W 36 W 27 L 7 D 5 W 33
## 14 BRADLEY SHAW W 54 W 44 W 8 L 1 D 27
## 15 ZACHARY JAMES HOUGHTON D 19 L 16 W 30 L 22 W 54
## 16 MIKE NIKITIN D 10 W 15 H W 39 L 2
## 17 RONALD GRZEGORCZYK W 48 W 41 L 26 L 2 W 23
## 18 DAVID SUNDEEN W 47 W 9 L 1 W 32 L 19
## 19 DIPANKAR ROY D 15 W 10 W 52 D 28 W 18
## 20 JASON ZHENG L 40 W 49 W 23 W 41 W 28
## 21 DINH DANG BUI W 43 L 1 W 47 L 3 W 40
## 22 EUGENE L MCCLURE W 64 D 52 L 28 W 15 H
## 23 ALAN BUI L 4 W 43 L 20 W 58 L 17
## 24 MICHAEL R ALDRICH L 28 L 47 W 43 L 25 W 60
## 25 LOREN SCHWIEBERT L 9 W 53 L 3 W 24 D 34
## 26 MAX ZHU W 49 W 40 W 17 L 4 L 9
## 27 GAURAV GIDWANI W 51 L 13 W 46 W 37 D 14
## 28 SOFIA ADINA STANESCU-BELLU W 24 D 4 W 22 D 19 L 20
## 29 CHIEDOZIE OKORIE W 50 D 6 L 38 L 34 W 52
## 30 GEORGE AVERY JONES L 52 D 64 L 15 W 55 L 31
## 31 RISHI SHETTY L 58 D 55 W 64 L 10 W 30
## 32 JOSHUA PHILIP MATHEWS W 61 L 8 W 44 L 18 W 51
## 33 JADE GE W 60 L 12 W 50 D 36 L 13
## 34 MICHAEL JEFFERY THOMAS L 6 W 60 L 37 W 29 D 25
## 35 JOSHUA DAVID LEE L 46 L 38 W 56 L 6 W 57
## 36 SIDDHARTH JHA L 13 W 57 W 51 D 33 H
## 37 AMIYATOSH PWNANANDAM B L 5 W 34 L 27 H
## 38 BRIAN LIU D 11 W 35 W 29 L 12 H
## 39 JOEL R HENDON L 1 W 54 W 40 L 16 W 44
## 40 FOREST ZHANG W 20 L 26 L 39 W 59 L 21
## 41 KYLE WILLIAM MURPHY W 59 L 17 W 58 L 20 X
## 42 JARED GE L 12 L 50 L 57 D 60 D 61
## 43 ROBERT GLEN VASEY L 21 L 23 L 24 W 63 W 59
## 44 JUSTIN D SCHILLING B L 14 L 32 W 53 L 39
## 45 DEREK YAN L 5 L 51 D 60 L 56 W 63
## 46 JACOB ALEXANDER LAVALLEY W 35 L 7 L 27 L 50 W 64
## 47 ERIC WRIGHT L 18 W 24 L 21 W 61 L 8
## 48 DANIEL KHAIN L 17 W 63 H D 52 H
## 49 MICHAEL J MARTIN L 26 L 20 D 63 D 64 W 58
## 50 SHIVAM JHA L 29 W 42 L 33 W 46 H
## 51 TEJAS AYYAGARI L 27 W 45 L 36 W 57 L 32
## 52 ETHAN GUO W 30 D 22 L 19 D 48 L 29
## 53 JOSE C YBARRA H L 25 H L 44 U
## 54 LARRY HODGE L 14 L 39 L 61 B L 15
## 55 ALEX KONG L 62 D 31 L 10 L 30 B
## 56 MARISA RICCI H L 11 L 35 W 45 H
## 57 MICHAEL LU L 7 L 36 W 42 L 51 L 35
## 58 VIRAJ MOHILE W 31 L 2 L 41 L 23 L 49
## 59 SEAN M MC CORMICK L 41 B L 9 L 40 L 43
## 60 JULIA SHEN L 33 L 34 D 45 D 42 L 24
## 61 JEZZEL FARKAS L 32 L 3 W 54 L 47 D 42
## 62 ASHWIN BALAJI W 55 U U U U
## 63 THOMAS JOSEPH HOSMER L 2 L 48 D 49 L 43 L 45
## 64 BEN LI L 22 D 30 L 31 D 49 L 46
## Round6 Round7
## 1 D 12 D 4
## 2 W 20 W 7
## 3 W 13 W 12
## 4 W 19 D 1
## 5 W 14 W 17
## 6 W 27 W 21
## 7 W 9 L 2
## 8 W 28 W 19
## 9 L 7 W 20
## 10 W 25 W 18
## 11 W 34 W 26
## 12 D 1 L 3
## 13 L 3 W 32
## 14 L 5 W 31
## 15 W 33 W 38
## 16 W 36 U
## 17 W 22 L 5
## 18 W 38 L 10
## 19 L 4 L 8
## 20 L 2 L 9
## 21 W 39 L 6
## 22 L 17 W 40
## 23 W 37 W 46
## 24 W 44 W 39
## 25 L 10 W 47
## 26 D 32 L 11
## 27 L 6 U
## 28 L 8 D 36
## 29 W 48 U
## 30 W 61 W 50
## 31 W 50 L 14
## 32 D 26 L 13
## 33 L 15 W 51
## 34 L 11 W 52
## 35 D 52 W 48
## 36 L 16 D 28
## 37 L 23 W 61
## 38 L 18 L 15
## 39 L 21 L 24
## 40 W 56 L 22
## 41 U U
## 42 W 64 W 56
## 43 L 46 W 55
## 44 L 24 W 59
## 45 D 55 W 58
## 46 W 43 L 23
## 47 D 51 L 25
## 48 L 29 L 35
## 49 H U
## 50 L 31 L 30
## 51 D 47 L 33
## 52 D 35 L 34
## 53 W 57 U
## 54 L 59 W 64
## 55 D 45 L 43
## 56 L 40 L 42
## 57 L 53 B
## 58 B L 45
## 59 W 54 L 44
## 60 H U
## 61 L 30 L 37
## 62 U U
## 63 H U
## 64 L 42 L 54
I now have a data frame for each player’s opponents, which I attach to the newChess data frame. The opponent columns also include extraneous letters, so I run a for loop to substring and trim the values in each opponent column to leave only the number.
newChess <- cbind.data.frame(newChess, opponents, stringsAsFactors = FALSE)
head(newChess)## playerNumber playerName playerState totalPoints preRating
## 1 1 GARY HUA ON 6.0 1794
## 2 2 DAKSHESH DARURI MI 6.0 1553
## 3 3 ADITYA BAJAJ MI 6.0 1384
## 4 4 PATRICK H SCHILLING MI 5.5 1716
## 5 5 HANSHI ZUO MI 5.5 1655
## 6 6 HANSEN SONG OH 5.0 1686
## Name Round1 Round2 Round3 Round4 Round5
## 1 GARY HUA W 39 W 21 W 18 W 14 W 7
## 2 DAKSHESH DARURI W 63 W 58 L 4 W 17 W 16
## 3 ADITYA BAJAJ L 8 W 61 W 25 W 21 W 11
## 4 PATRICK H SCHILLING W 23 D 28 W 2 W 26 D 5
## 5 HANSHI ZUO W 45 W 37 D 12 D 13 D 4
## 6 HANSEN SONG W 34 D 29 L 11 W 35 D 10
## Round6 Round7
## 1 D 12 D 4
## 2 W 20 W 7
## 3 W 13 W 12
## 4 W 19 D 1
## 5 W 14 W 17
## 6 W 27 W 21
for(i in 7:13) {
newChess[,i] <- unlist(str_trim(str_sub(newChess[,i], 3)))
}
head(newChess)## playerNumber playerName playerState totalPoints preRating
## 1 1 GARY HUA ON 6.0 1794
## 2 2 DAKSHESH DARURI MI 6.0 1553
## 3 3 ADITYA BAJAJ MI 6.0 1384
## 4 4 PATRICK H SCHILLING MI 5.5 1716
## 5 5 HANSHI ZUO MI 5.5 1655
## 6 6 HANSEN SONG OH 5.0 1686
## Name Round1 Round2 Round3 Round4 Round5
## 1 GARY HUA 39 21 18 14 7
## 2 DAKSHESH DARURI 63 58 4 17 16
## 3 ADITYA BAJAJ 8 61 25 21 11
## 4 PATRICK H SCHILLING 23 28 2 26 5
## 5 HANSHI ZUO 45 37 12 13 4
## 6 HANSEN SONG 34 29 11 35 10
## Round6 Round7
## 1 12 4
## 2 20 7
## 3 13 12
## 4 19 1
## 5 14 17
## 6 27 21
Now that the data in the newChess data frame is clean, I can use the values to calculate the average pre chess rating of opponents for each player. Using a for loop, I identify the numbers of the opponents for each player and then find their associated rankings. Storing these rankings in a vector allows me to take the mean of the vector (removing “NA” values) and return that result into the proper place in a new column vector.
avgOpponentPreRating <- numeric() ## generate empty vector to gather values for final column
for (i in 1:length(newChess[,1])) {
opponentNums <- newChess[i,7:13] ## extract vector containing opponent numbers
opponentNums <- opponentNums[!is.na(opponentNums)]
opponentPreRank <- newChess[as.numeric(opponentNums), 5] ## create vector with opponent rankings
avg <- mean(as.numeric(opponentPreRank), na.rm = TRUE) ## take mean of opponent rankings
avgOpponentPreRating[i] <- round(avg, 0) ## place mean in correct vector location
}
avgOpponentPreRating## [1] 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1506 1498 1515
## [15] 1484 1386 1499 1480 1426 1411 1470 1300 1214 1357 1363 1507 1222 1522
## [29] 1314 1144 1260 1379 1277 1375 1150 1388 1385 1539 1430 1391 1248 1150
## [43] 1107 1327 1152 1358 1392 1356 1286 1296 1356 1495 1345 1206 1406 1414
## [57] 1363 1391 1319 1330 1327 1186 1350 1263
Finally, I use the cbind.data.frame function to attach the desired columns from newChess to the new average column to create the final desired data frame. I can export this file to my working directory using the write.csv function.
finalChessData <- cbind.data.frame(newChess[,1:5], avgOpponentPreRating)
finalChessData## playerNumber playerName playerState totalPoints
## 1 1 GARY HUA ON 6.0
## 2 2 DAKSHESH DARURI MI 6.0
## 3 3 ADITYA BAJAJ MI 6.0
## 4 4 PATRICK H SCHILLING MI 5.5
## 5 5 HANSHI ZUO MI 5.5
## 6 6 HANSEN SONG OH 5.0
## 7 7 GARY DEE SWATHELL MI 5.0
## 8 8 EZEKIEL HOUGHTON MI 5.0
## 9 9 STEFANO LEE ON 5.0
## 10 10 ANVIT RAO MI 5.0
## 11 11 CAMERON WILLIAM MC LEMAN MI 4.5
## 12 12 KENNETH J TACK MI 4.5
## 13 13 TORRANCE HENRY JR MI 4.5
## 14 14 BRADLEY SHAW MI 4.5
## 15 15 ZACHARY JAMES HOUGHTON MI 4.5
## 16 16 MIKE NIKITIN MI 4.0
## 17 17 RONALD GRZEGORCZYK MI 4.0
## 18 18 DAVID SUNDEEN MI 4.0
## 19 19 DIPANKAR ROY MI 4.0
## 20 20 JASON ZHENG MI 4.0
## 21 21 DINH DANG BUI ON 4.0
## 22 22 EUGENE L MCCLURE MI 4.0
## 23 23 ALAN BUI ON 4.0
## 24 24 MICHAEL R ALDRICH MI 4.0
## 25 25 LOREN SCHWIEBERT MI 3.5
## 26 26 MAX ZHU ON 3.5
## 27 27 GAURAV GIDWANI MI 3.5
## 28 28 SOFIA ADINA STANESCU-BELLU MI 3.5
## 29 29 CHIEDOZIE OKORIE MI 3.5
## 30 30 GEORGE AVERY JONES ON 3.5
## 31 31 RISHI SHETTY MI 3.5
## 32 32 JOSHUA PHILIP MATHEWS ON 3.5
## 33 33 JADE GE MI 3.5
## 34 34 MICHAEL JEFFERY THOMAS MI 3.5
## 35 35 JOSHUA DAVID LEE MI 3.5
## 36 36 SIDDHARTH JHA MI 3.5
## 37 37 AMIYATOSH PWNANANDAM MI 3.5
## 38 38 BRIAN LIU MI 3.0
## 39 39 JOEL R HENDON MI 3.0
## 40 40 FOREST ZHANG MI 3.0
## 41 41 KYLE WILLIAM MURPHY MI 3.0
## 42 42 JARED GE MI 3.0
## 43 43 ROBERT GLEN VASEY MI 3.0
## 44 44 JUSTIN D SCHILLING MI 3.0
## 45 45 DEREK YAN MI 3.0
## 46 46 JACOB ALEXANDER LAVALLEY MI 3.0
## 47 47 ERIC WRIGHT MI 2.5
## 48 48 DANIEL KHAIN MI 2.5
## 49 49 MICHAEL J MARTIN MI 2.5
## 50 50 SHIVAM JHA MI 2.5
## 51 51 TEJAS AYYAGARI MI 2.5
## 52 52 ETHAN GUO MI 2.5
## 53 53 JOSE C YBARRA MI 2.0
## 54 54 LARRY HODGE MI 2.0
## 55 55 ALEX KONG MI 2.0
## 56 56 MARISA RICCI MI 2.0
## 57 57 MICHAEL LU MI 2.0
## 58 58 VIRAJ MOHILE MI 2.0
## 59 59 SEAN M MC CORMICK MI 2.0
## 60 60 JULIA SHEN MI 1.5
## 61 61 JEZZEL FARKAS ON 1.5
## 62 62 ASHWIN BALAJI MI 1.0
## 63 63 THOMAS JOSEPH HOSMER MI 1.0
## 64 64 BEN LI MI 1.0
## preRating avgOpponentPreRating
## 1 1794 1605
## 2 1553 1469
## 3 1384 1564
## 4 1716 1574
## 5 1655 1501
## 6 1686 1519
## 7 1649 1372
## 8 1641 1468
## 9 1411 1523
## 10 1365 1554
## 11 1712 1468
## 12 1663 1506
## 13 1666 1498
## 14 1610 1515
## 15 1220 1484
## 16 1604 1386
## 17 1629 1499
## 18 1600 1480
## 19 1564 1426
## 20 1595 1411
## 21 1563 1470
## 22 1555 1300
## 23 1363 1214
## 24 1229 1357
## 25 1745 1363
## 26 1579 1507
## 27 1552 1222
## 28 1507 1522
## 29 1602 1314
## 30 1522 1144
## 31 1494 1260
## 32 1441 1379
## 33 1449 1277
## 34 1399 1375
## 35 1438 1150
## 36 1355 1388
## 37 980 1385
## 38 1423 1539
## 39 1436 1430
## 40 1348 1391
## 41 1403 1248
## 42 1332 1150
## 43 1283 1107
## 44 1199 1327
## 45 1242 1152
## 46 377 1358
## 47 1362 1392
## 48 1382 1356
## 49 1291 1286
## 50 1056 1296
## 51 1011 1356
## 52 935 1495
## 53 1393 1345
## 54 1270 1206
## 55 1186 1406
## 56 1153 1414
## 57 1092 1363
## 58 917 1391
## 59 853 1319
## 60 967 1330
## 61 955 1327
## 62 1530 1186
## 63 1175 1350
## 64 1163 1263
write.csv(finalChessData, "finalChessData.csv", row.names = FALSE)