607 Project 1

The following script extracts the tournament data from a github repository and extracts information from the repository using the stringr package within R and regular expressions as described in Chapter 8 of Simon Munzert’s Data Science for Business. Using the stringr package, the player’s pre-scores, post-scores and total points are extracted.

tournamentinfo<-read_file("https://raw.githubusercontent.com/joshuargst/Data607Proj1/master/tournamentinfo.txt")
#extracting PreScore
Pre_Score<-unlist(str_extract_all(tournamentinfo, "R:\\s+[:digit:]{1,4}"))%>% str_extract_all("[:digit:]+")%>%unlist%>%as.numeric()

#extracting PostScore
Post_Score<-unlist(str_extract_all(tournamentinfo, "->\\s*[:digit:]{1,4}"))%>% str_extract_all("[:digit:]+")%>%unlist%>%as.numeric()

#extracting Total points of each player
Total_Points<-unlist(str_extract_all(tournamentinfo, "\\d+\\.\\d+"))%>%as.numeric()

Since the data isnt completely unstructured, we can use the read_delim function within readr to parse information into a dataframe with the character “|” used as a delimiter. We can clean this semi structured table to place all the pertinent information we wish to extract. the code below shows this process.

tournamentinfo <- read_delim("https://raw.githubusercontent.com/joshuargst/Data607Proj1/master/tournamentinfo.txt", "|", escape_double = FALSE, trim_ws = TRUE, skip = 2)


tournamentinfo<-tournamentinfo[-grep("--+",tournamentinfo$Num),]

tournamentinfo1<-tournamentinfo[-grep("[[:alpha:]]",tournamentinfo$Num),]

#exctracting states
tournamentinfo2<-tournamentinfo[-grep("[[:digit:]]",tournamentinfo$Num),][1]
colnames(tournamentinfo2)[1]<-"Player_State"

  
WinsLosses<-separate(tournamentinfo1,'1',c("game_1","Game_1"), sep = "\\s+")%>%
  separate('2',c("game_2","Game_2"), sep = "\\s+")%>%
  separate('3',c("game_3","Game_3"), sep = "\\s+")%>%
  separate('4',c("game_4","Game_4"), sep = "\\s+")%>%
  separate('5',c("game_5","Game_5"), sep = "\\s+")%>%
  separate('6',c("game_6","Game_6"), sep = "\\s+")%>%
  separate('7',c("game_7","Game_7"), sep = "\\s+")

colnames(WinsLosses)[1]<-"PlayerID"
colnames(WinsLosses)[2]<-"PlayerName"
WinsLosses<-WinsLosses[c(-3,-18)]

head(WinsLosses)

## # A tibble: 6 x 16
##   PlayerID PlayerName game_1 Game_1 game_2 Game_2 game_3 Game_3 game_4
##   <chr>    <chr>      <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr> 
## 1 1        GARY HUA   W      39     W      21     W      18     W     
## 2 2        DAKSHESH ~ W      63     W      58     L      4      W     
## 3 3        ADITYA BA~ L      8      W      61     W      25     W     
## 4 4        PATRICK H~ W      23     D      28     W      2      W     
## 5 5        HANSHI ZUO W      45     W      37     D      12     D     
## 6 6        HANSEN SO~ W      34     D      29     L      11     W     
## # ... with 7 more variables: Game_4 <chr>, game_5 <chr>, Game_5 <chr>,
## #   game_6 <chr>, Game_6 <chr>, game_7 <chr>, Game_7 <chr>

As shown in the preview table above, we now have a table that identifies each player with each game win/loss designation as well as that player’s opponent for that game.

We can clean this so that our data is in a more readable format by compressing this wider table into a longer formats as shown in the tables below

PlayerInfo<-cbind.data.frame(WinsLosses[c(1,2)],tournamentinfo2[1],Pre_Score,Post_Score,Total_Points)

WinsLosses<-gather(WinsLosses, Game,W_L_D, -Game_1,-Game_2,-Game_3,-Game_4,-Game_5,-Game_6,-Game_7,-PlayerName,-PlayerID)[c(1,2,10,11)]


WinsLosses2<-separate(tournamentinfo1,'1',c("game_1","Game_1"), sep = "\\s+")%>%
  separate('2',c("game_2","Game_2"), sep = "\\s+")%>%
  separate('3',c("game_3","Game_3"), sep = "\\s+")%>%
  separate('4',c("game_4","Game_4"), sep = "\\s+")%>%
  separate('5',c("game_5","Game_5"), sep = "\\s+")%>%
  separate('6',c("game_6","Game_6"), sep = "\\s+")%>%
  separate('7',c("game_7","Game_7"), sep = "\\s+")

colnames(WinsLosses2)[1]<-"PlayerID"
colnames(WinsLosses2)[2]<-"PlayerName"
WinsLosses2<-WinsLosses2[c(-3,-18)]


WinsLosses2<-gather(WinsLosses2, Game2,OpponentID, -game_1,-game_2,-game_3,-game_4,-game_5,-game_6,-game_7,-PlayerName,-PlayerID)[c(1,2,10,11)]

WinsLosses

## # A tibble: 448 x 4
##    PlayerID PlayerName          Game   W_L_D
##    <chr>    <chr>               <chr>  <chr>
##  1 1        GARY HUA            game_1 W    
##  2 2        DAKSHESH DARURI     game_1 W    
##  3 3        ADITYA BAJAJ        game_1 L    
##  4 4        PATRICK H SCHILLING game_1 W    
##  5 5        HANSHI ZUO          game_1 W    
##  6 6        HANSEN SONG         game_1 W    
##  7 7        GARY DEE SWATHELL   game_1 W    
##  8 8        EZEKIEL HOUGHTON    game_1 W    
##  9 9        STEFANO LEE         game_1 W    
## 10 10       ANVIT RAO           game_1 D    
## # ... with 438 more rows

WinsLosses2

## # A tibble: 448 x 4
##    PlayerID PlayerName          Game2  OpponentID
##    <chr>    <chr>               <chr>  <chr>     
##  1 1        GARY HUA            Game_1 39        
##  2 2        DAKSHESH DARURI     Game_1 63        
##  3 3        ADITYA BAJAJ        Game_1 8         
##  4 4        PATRICK H SCHILLING Game_1 23        
##  5 5        HANSHI ZUO          Game_1 45        
##  6 6        HANSEN SONG         Game_1 34        
##  7 7        GARY DEE SWATHELL   Game_1 57        
##  8 8        EZEKIEL HOUGHTON    Game_1 3         
##  9 9        STEFANO LEE         Game_1 25        
## 10 10       ANVIT RAO           Game_1 16        
## # ... with 438 more rows

Finally we can combine all of the necessary data from all parsing that was done and place this into our summary data frame called “FinalTable”. Within this table, we use dplyr to group by playerIDs and summarize the opponent scores for each game by their mean and end with our final output as shown below.

WinsLosses<-cbind(WinsLosses,WinsLosses2)[c(1,2,3,4,8)]

WinsLosses$Game<-str_replace_all(WinsLosses$Game,"game_","")

WinsLosses$Game<-str_replace_all(WinsLosses$Game,"Game_","")


#summarise opponent scores


FinalTable<-
  left_join(WinsLosses,PlayerInfo, by = c("OpponentID"="PlayerID"))%>%  
  group_by(PlayerID)%>%
  summarise(Avg_Opponent_Score = round(mean(Pre_Score,na.rm = TRUE),0))%>%
  left_join(PlayerInfo,by = c("PlayerID"= "PlayerID"))%>%as.data.frame()

#reorder table for cleanliness
FinalTable<-FinalTable[c(1,3:7,2)]
kable(FinalTable, format = "html")

PlayerID	PlayerName	Player_State	Pre_Score	Post_Score	Total_Points	Avg_Opponent_Score
1	GARY HUA	ON	1794	1817	6.0	1605
10	ANVIT RAO	MI	1365	1544	5.0	1554
11	CAMERON WILLIAM MC LEMAN	MI	1712	1696	4.5	1468
12	KENNETH J TACK	MI	1663	1670	4.5	1506
13	TORRANCE HENRY JR	MI	1666	1662	4.5	1498
14	BRADLEY SHAW	MI	1610	1618	4.5	1515
15	ZACHARY JAMES HOUGHTON	MI	1220	1416	4.5	1484
16	MIKE NIKITIN	MI	1604	1613	4.0	1386
17	RONALD GRZEGORCZYK	MI	1629	1610	4.0	1499
18	DAVID SUNDEEN	MI	1600	1600	4.0	1480
19	DIPANKAR ROY	MI	1564	1570	4.0	1426
2	DAKSHESH DARURI	MI	1553	1663	6.0	1469
20	JASON ZHENG	MI	1595	1569	4.0	1411
21	DINH DANG BUI	ON	1563	1562	4.0	1470
22	EUGENE L MCCLURE	MI	1555	1529	4.0	1300
23	ALAN BUI	ON	1363	1371	4.0	1214
24	MICHAEL R ALDRICH	MI	1229	1300	4.0	1357
25	LOREN SCHWIEBERT	MI	1745	1681	3.5	1363
26	MAX ZHU	ON	1579	1564	3.5	1507
27	GAURAV GIDWANI	MI	1552	1539	3.5	1222
28	SOFIA ADINA STANESCU-BELLU	MI	1507	1513	3.5	1522
29	CHIEDOZIE OKORIE	MI	1602	1508	3.5	1314
3	ADITYA BAJAJ	MI	1384	1640	6.0	1564
30	GEORGE AVERY JONES	ON	1522	1444	3.5	1144
31	RISHI SHETTY	MI	1494	1444	3.5	1260
32	JOSHUA PHILIP MATHEWS	ON	1441	1433	3.5	1379
33	JADE GE	MI	1449	1421	3.5	1277
34	MICHAEL JEFFERY THOMAS	MI	1399	1400	3.5	1375
35	JOSHUA DAVID LEE	MI	1438	1392	3.5	1150
36	SIDDHARTH JHA	MI	1355	1367	3.5	1388
37	AMIYATOSH PWNANANDAM	MI	980	1077	3.5	1385
38	BRIAN LIU	MI	1423	1439	3.0	1539
39	JOEL R HENDON	MI	1436	1413	3.0	1430
4	PATRICK H SCHILLING	MI	1716	1744	5.5	1574
40	FOREST ZHANG	MI	1348	1346	3.0	1391
41	KYLE WILLIAM MURPHY	MI	1403	1341	3.0	1248
42	JARED GE	MI	1332	1256	3.0	1150
43	ROBERT GLEN VASEY	MI	1283	1244	3.0	1107
44	JUSTIN D SCHILLING	MI	1199	1199	3.0	1327
45	DEREK YAN	MI	1242	1191	3.0	1152
46	JACOB ALEXANDER LAVALLEY	MI	377	1076	3.0	1358
47	ERIC WRIGHT	MI	1362	1341	2.5	1392
48	DANIEL KHAIN	MI	1382	1335	2.5	1356
49	MICHAEL J MARTIN	MI	1291	1259	2.5	1286
5	HANSHI ZUO	MI	1655	1690	5.5	1501
50	SHIVAM JHA	MI	1056	1111	2.5	1296
51	TEJAS AYYAGARI	MI	1011	1097	2.5	1356
52	ETHAN GUO	MI	935	1092	2.5	1495
53	JOSE C YBARRA	MI	1393	1359	2.0	1345
54	LARRY HODGE	MI	1270	1200	2.0	1206
55	ALEX KONG	MI	1186	1163	2.0	1406
56	MARISA RICCI	MI	1153	1140	2.0	1414
57	MICHAEL LU	MI	1092	1079	2.0	1363
58	VIRAJ MOHILE	MI	917	941	2.0	1391
59	SEAN M MC CORMICK	MI	853	878	2.0	1319
6	HANSEN SONG	OH	1686	1687	5.0	1519
60	JULIA SHEN	MI	967	984	1.5	1330
61	JEZZEL FARKAS	ON	955	979	1.5	1327
62	ASHWIN BALAJI	MI	1530	1535	1.0	1186
63	THOMAS JOSEPH HOSMER	MI	1175	1125	1.0	1350
64	BEN LI	MI	1163	1112	1.0	1263
7	GARY DEE SWATHELL	MI	1649	1673	5.0	1372
8	EZEKIEL HOUGHTON	MI	1641	1657	5.0	1468
9	STEFANO LEE	ON	1411	1564	5.0	1523

607 Project 1

Joshua Registe

9/17/2019