The following script extracts the tournament data from a github repository and extracts information from the repository using the stringr package within R and regular expressions as described in Chapter 8 of Simon Munzert’s Data Science for Business. Using the stringr package, the player’s pre-scores, post-scores and total points are extracted.

tournamentinfo<-read_file("https://raw.githubusercontent.com/joshuargst/Data607Proj1/master/tournamentinfo.txt")
#extracting PreScore
Pre_Score<-unlist(str_extract_all(tournamentinfo, "R:\\s+[:digit:]{1,4}"))%>% str_extract_all("[:digit:]+")%>%unlist%>%as.numeric()

#extracting PostScore
Post_Score<-unlist(str_extract_all(tournamentinfo, "->\\s*[:digit:]{1,4}"))%>% str_extract_all("[:digit:]+")%>%unlist%>%as.numeric()

#extracting Total points of each player
Total_Points<-unlist(str_extract_all(tournamentinfo, "\\d+\\.\\d+"))%>%as.numeric()



Since the data isnt completely unstructured, we can use the read_delim function within readr to parse information into a dataframe with the character “|” used as a delimiter. We can clean this semi structured table to place all the pertinent information we wish to extract. the code below shows this process.


tournamentinfo <- read_delim("https://raw.githubusercontent.com/joshuargst/Data607Proj1/master/tournamentinfo.txt", "|", escape_double = FALSE, trim_ws = TRUE, skip = 2)


tournamentinfo<-tournamentinfo[-grep("--+",tournamentinfo$Num),]

tournamentinfo1<-tournamentinfo[-grep("[[:alpha:]]",tournamentinfo$Num),]

#exctracting states
tournamentinfo2<-tournamentinfo[-grep("[[:digit:]]",tournamentinfo$Num),][1]
colnames(tournamentinfo2)[1]<-"Player_State"

  
WinsLosses<-separate(tournamentinfo1,'1',c("game_1","Game_1"), sep = "\\s+")%>%
  separate('2',c("game_2","Game_2"), sep = "\\s+")%>%
  separate('3',c("game_3","Game_3"), sep = "\\s+")%>%
  separate('4',c("game_4","Game_4"), sep = "\\s+")%>%
  separate('5',c("game_5","Game_5"), sep = "\\s+")%>%
  separate('6',c("game_6","Game_6"), sep = "\\s+")%>%
  separate('7',c("game_7","Game_7"), sep = "\\s+")

colnames(WinsLosses)[1]<-"PlayerID"
colnames(WinsLosses)[2]<-"PlayerName"
WinsLosses<-WinsLosses[c(-3,-18)]

head(WinsLosses)
## # A tibble: 6 x 16
##   PlayerID PlayerName game_1 Game_1 game_2 Game_2 game_3 Game_3 game_4
##   <chr>    <chr>      <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr> 
## 1 1        GARY HUA   W      39     W      21     W      18     W     
## 2 2        DAKSHESH ~ W      63     W      58     L      4      W     
## 3 3        ADITYA BA~ L      8      W      61     W      25     W     
## 4 4        PATRICK H~ W      23     D      28     W      2      W     
## 5 5        HANSHI ZUO W      45     W      37     D      12     D     
## 6 6        HANSEN SO~ W      34     D      29     L      11     W     
## # ... with 7 more variables: Game_4 <chr>, game_5 <chr>, Game_5 <chr>,
## #   game_6 <chr>, Game_6 <chr>, game_7 <chr>, Game_7 <chr>



As shown in the preview table above, we now have a table that identifies each player with each game win/loss designation as well as that player’s opponent for that game.

We can clean this so that our data is in a more readable format by compressing this wider table into a longer formats as shown in the tables below



PlayerInfo<-cbind.data.frame(WinsLosses[c(1,2)],tournamentinfo2[1],Pre_Score,Post_Score,Total_Points)

WinsLosses<-gather(WinsLosses, Game,W_L_D, -Game_1,-Game_2,-Game_3,-Game_4,-Game_5,-Game_6,-Game_7,-PlayerName,-PlayerID)[c(1,2,10,11)]


WinsLosses2<-separate(tournamentinfo1,'1',c("game_1","Game_1"), sep = "\\s+")%>%
  separate('2',c("game_2","Game_2"), sep = "\\s+")%>%
  separate('3',c("game_3","Game_3"), sep = "\\s+")%>%
  separate('4',c("game_4","Game_4"), sep = "\\s+")%>%
  separate('5',c("game_5","Game_5"), sep = "\\s+")%>%
  separate('6',c("game_6","Game_6"), sep = "\\s+")%>%
  separate('7',c("game_7","Game_7"), sep = "\\s+")

colnames(WinsLosses2)[1]<-"PlayerID"
colnames(WinsLosses2)[2]<-"PlayerName"
WinsLosses2<-WinsLosses2[c(-3,-18)]


WinsLosses2<-gather(WinsLosses2, Game2,OpponentID, -game_1,-game_2,-game_3,-game_4,-game_5,-game_6,-game_7,-PlayerName,-PlayerID)[c(1,2,10,11)]

WinsLosses
## # A tibble: 448 x 4
##    PlayerID PlayerName          Game   W_L_D
##    <chr>    <chr>               <chr>  <chr>
##  1 1        GARY HUA            game_1 W    
##  2 2        DAKSHESH DARURI     game_1 W    
##  3 3        ADITYA BAJAJ        game_1 L    
##  4 4        PATRICK H SCHILLING game_1 W    
##  5 5        HANSHI ZUO          game_1 W    
##  6 6        HANSEN SONG         game_1 W    
##  7 7        GARY DEE SWATHELL   game_1 W    
##  8 8        EZEKIEL HOUGHTON    game_1 W    
##  9 9        STEFANO LEE         game_1 W    
## 10 10       ANVIT RAO           game_1 D    
## # ... with 438 more rows
WinsLosses2
## # A tibble: 448 x 4
##    PlayerID PlayerName          Game2  OpponentID
##    <chr>    <chr>               <chr>  <chr>     
##  1 1        GARY HUA            Game_1 39        
##  2 2        DAKSHESH DARURI     Game_1 63        
##  3 3        ADITYA BAJAJ        Game_1 8         
##  4 4        PATRICK H SCHILLING Game_1 23        
##  5 5        HANSHI ZUO          Game_1 45        
##  6 6        HANSEN SONG         Game_1 34        
##  7 7        GARY DEE SWATHELL   Game_1 57        
##  8 8        EZEKIEL HOUGHTON    Game_1 3         
##  9 9        STEFANO LEE         Game_1 25        
## 10 10       ANVIT RAO           Game_1 16        
## # ... with 438 more rows



Finally we can combine all of the necessary data from all parsing that was done and place this into our summary data frame called “FinalTable”. Within this table, we use dplyr to group by playerIDs and summarize the opponent scores for each game by their mean and end with our final output as shown below.

WinsLosses<-cbind(WinsLosses,WinsLosses2)[c(1,2,3,4,8)]

WinsLosses$Game<-str_replace_all(WinsLosses$Game,"game_","")

WinsLosses$Game<-str_replace_all(WinsLosses$Game,"Game_","")


#summarise opponent scores


FinalTable<-
  left_join(WinsLosses,PlayerInfo, by = c("OpponentID"="PlayerID"))%>%  
  group_by(PlayerID)%>%
  summarise(Avg_Opponent_Score = round(mean(Pre_Score,na.rm = TRUE),0))%>%
  left_join(PlayerInfo,by = c("PlayerID"= "PlayerID"))%>%as.data.frame()

#reorder table for cleanliness
FinalTable<-FinalTable[c(1,3:7,2)]
kable(FinalTable, format = "html")
PlayerID PlayerName Player_State Pre_Score Post_Score Total_Points Avg_Opponent_Score
1 GARY HUA ON 1794 1817 6.0 1605
10 ANVIT RAO MI 1365 1544 5.0 1554
11 CAMERON WILLIAM MC LEMAN MI 1712 1696 4.5 1468
12 KENNETH J TACK MI 1663 1670 4.5 1506
13 TORRANCE HENRY JR MI 1666 1662 4.5 1498
14 BRADLEY SHAW MI 1610 1618 4.5 1515
15 ZACHARY JAMES HOUGHTON MI 1220 1416 4.5 1484
16 MIKE NIKITIN MI 1604 1613 4.0 1386
17 RONALD GRZEGORCZYK MI 1629 1610 4.0 1499
18 DAVID SUNDEEN MI 1600 1600 4.0 1480
19 DIPANKAR ROY MI 1564 1570 4.0 1426
2 DAKSHESH DARURI MI 1553 1663 6.0 1469
20 JASON ZHENG MI 1595 1569 4.0 1411
21 DINH DANG BUI ON 1563 1562 4.0 1470
22 EUGENE L MCCLURE MI 1555 1529 4.0 1300
23 ALAN BUI ON 1363 1371 4.0 1214
24 MICHAEL R ALDRICH MI 1229 1300 4.0 1357
25 LOREN SCHWIEBERT MI 1745 1681 3.5 1363
26 MAX ZHU ON 1579 1564 3.5 1507
27 GAURAV GIDWANI MI 1552 1539 3.5 1222
28 SOFIA ADINA STANESCU-BELLU MI 1507 1513 3.5 1522
29 CHIEDOZIE OKORIE MI 1602 1508 3.5 1314
3 ADITYA BAJAJ MI 1384 1640 6.0 1564
30 GEORGE AVERY JONES ON 1522 1444 3.5 1144
31 RISHI SHETTY MI 1494 1444 3.5 1260
32 JOSHUA PHILIP MATHEWS ON 1441 1433 3.5 1379
33 JADE GE MI 1449 1421 3.5 1277
34 MICHAEL JEFFERY THOMAS MI 1399 1400 3.5 1375
35 JOSHUA DAVID LEE MI 1438 1392 3.5 1150
36 SIDDHARTH JHA MI 1355 1367 3.5 1388
37 AMIYATOSH PWNANANDAM MI 980 1077 3.5 1385
38 BRIAN LIU MI 1423 1439 3.0 1539
39 JOEL R HENDON MI 1436 1413 3.0 1430
4 PATRICK H SCHILLING MI 1716 1744 5.5 1574
40 FOREST ZHANG MI 1348 1346 3.0 1391
41 KYLE WILLIAM MURPHY MI 1403 1341 3.0 1248
42 JARED GE MI 1332 1256 3.0 1150
43 ROBERT GLEN VASEY MI 1283 1244 3.0 1107
44 JUSTIN D SCHILLING MI 1199 1199 3.0 1327
45 DEREK YAN MI 1242 1191 3.0 1152
46 JACOB ALEXANDER LAVALLEY MI 377 1076 3.0 1358
47 ERIC WRIGHT MI 1362 1341 2.5 1392
48 DANIEL KHAIN MI 1382 1335 2.5 1356
49 MICHAEL J MARTIN MI 1291 1259 2.5 1286
5 HANSHI ZUO MI 1655 1690 5.5 1501
50 SHIVAM JHA MI 1056 1111 2.5 1296
51 TEJAS AYYAGARI MI 1011 1097 2.5 1356
52 ETHAN GUO MI 935 1092 2.5 1495
53 JOSE C YBARRA MI 1393 1359 2.0 1345
54 LARRY HODGE MI 1270 1200 2.0 1206
55 ALEX KONG MI 1186 1163 2.0 1406
56 MARISA RICCI MI 1153 1140 2.0 1414
57 MICHAEL LU MI 1092 1079 2.0 1363
58 VIRAJ MOHILE MI 917 941 2.0 1391
59 SEAN M MC CORMICK MI 853 878 2.0 1319
6 HANSEN SONG OH 1686 1687 5.0 1519
60 JULIA SHEN MI 967 984 1.5 1330
61 JEZZEL FARKAS ON 955 979 1.5 1327
62 ASHWIN BALAJI MI 1530 1535 1.0 1186
63 THOMAS JOSEPH HOSMER MI 1175 1125 1.0 1350
64 BEN LI MI 1163 1112 1.0 1263
7 GARY DEE SWATHELL MI 1649 1673 5.0 1372
8 EZEKIEL HOUGHTON MI 1641 1657 5.0 1468
9 STEFANO LEE ON 1411 1564 5.0 1523