Tutorial 3


library(dplyr)
library(XML) #for htmlParse & readHTMLTable

E.G.1: passing data

Set up table to run loop then clean table.

Create lookup table:

#Create lookup table
  links  <- expand.grid(LinkRoot = "http://espn.go.com/nfl/statistics/player/_/stat/",
                        LinkMid = "passing",
                        PHolder = "/year/",
                        Year =  c(seq(2005,2015,1)),
                        LinkEnd = "/qualified/false/count/", # removed --> "sort/passingYards/"
                        Pages = c(1 + seq(0,80,40)),                                  #pgs 1, 21, 41 
                        KEEP.OUT.ATTRS = F, stringsAsFactors = F) %>%
            mutate(URL = paste0(LinkRoot, LinkMid, PHolder, Year, LinkEnd, Pages))
head(links)
##                                           LinkRoot LinkMid PHolder Year
## 1 http://espn.go.com/nfl/statistics/player/_/stat/ passing  /year/ 2005
## 2 http://espn.go.com/nfl/statistics/player/_/stat/ passing  /year/ 2006
## 3 http://espn.go.com/nfl/statistics/player/_/stat/ passing  /year/ 2007
## 4 http://espn.go.com/nfl/statistics/player/_/stat/ passing  /year/ 2008
## 5 http://espn.go.com/nfl/statistics/player/_/stat/ passing  /year/ 2009
## 6 http://espn.go.com/nfl/statistics/player/_/stat/ passing  /year/ 2010
##                   LinkEnd Pages
## 1 /qualified/false/count/     1
## 2 /qualified/false/count/     1
## 3 /qualified/false/count/     1
## 4 /qualified/false/count/     1
## 5 /qualified/false/count/     1
## 6 /qualified/false/count/     1
##                                                                                         URL
## 1 http://espn.go.com/nfl/statistics/player/_/stat/passing/year/2005/qualified/false/count/1
## 2 http://espn.go.com/nfl/statistics/player/_/stat/passing/year/2006/qualified/false/count/1
## 3 http://espn.go.com/nfl/statistics/player/_/stat/passing/year/2007/qualified/false/count/1
## 4 http://espn.go.com/nfl/statistics/player/_/stat/passing/year/2008/qualified/false/count/1
## 5 http://espn.go.com/nfl/statistics/player/_/stat/passing/year/2009/qualified/false/count/1
## 6 http://espn.go.com/nfl/statistics/player/_/stat/passing/year/2010/qualified/false/count/1

Get the table from the webpage

#get the table from the webpage
  tables <- getNodeSet(htmlParse(links$URL[1]), "//table") #parses html and finds all rows (nodes) in the table

  #make a base table from the frist page  
   dataBuild <- readHTMLTable(tables[[1]], trim = TRUE, stringsAsFactors = FALSE) 

#loop through other pages
 { for(i in 1:nrow(links)){
      tables <- getNodeSet(htmlParse(links$URL[i]), "//table")
      
      nfldata <- readHTMLTable(tables[[1]], trim = TRUE, 
                               stringsAsFactors = FALSE)
    #add nfldata to the "passing" table      
      dataBuild <- rbind(dataBuild, nfldata)
    }}
   
# Clean the df
   
passingFinal <- dataBuild %>%
                  filter(PLAYER != "PLAYER") %>%
                  select(-RK) %>%
                  mutate(Stat = "Passing",
                         Year = "2014",
                         YDS = gsub(",", "", YDS)) %>%
                  rename(YDSA = `YDS/A`,  #use back ticks `` for columns with invalid characters
                         YDSG = `YDS/G`) %>%
                  mutate_each(funs(as.integer), c(COMP, ATT, YDS, LONG, TD, INT, SACK, YDSG))%>%
                  mutate_each(funs(as.numeric), c(PCT, YDSA, RATE, YDSG))%>%  #converts all columns in c()
                  distinct()

So using this we can create a function:

get_ESPN <- function(LinkMid, LinkEnd, npages, by)
  {
  dataBuild <<- dataBuild[1,] # This is the base table that the rest of the loop is built on. 
#create lookup table (double arrow "<<-" means to make it in the environment not just as a temporary table)
  links  <<- expand.grid(LinkRoot = "http://espn.go.com/nfl/statistics/player/_/stat/",
                         LinkMid = LinkMid,
                         PHolder = "/year/",
                         Year = c(seq(2005,2015,1)),
                         LinkEnd = LinkEnd,
                        Pages = c(1 + seq(0, (npages-1)*by, by)), 
                        KEEP.OUT.ATTRS = F, stringsAsFactors = F)  %>%
            mutate(URL = paste0(LinkRoot, LinkMid, PHolder, Year, LinkEnd, Pages))
  
#URL.Only <<- links %>% select(URL)
  
#a<<- data.frame (Yr.Tag = gsub(pattern = "(.*year/)(.*)(/qualified.*)", replacement = "\\2", x = URL.Only[, 1], perl=T))

#get the table from the webpage
    tables <- getNodeSet(htmlParse(links$URL[1]), "//table") #parses html and finds all rows (nodes) in the table
  
#make a base table from the frist page  
     dataBuild <<- readHTMLTable(tables[[1]], trim = TRUE, stringsAsFactors = FALSE) %>%
                  mutate(Year = links$Year[1])
  
#loop through other pages
    for(i in 2:nrow(links)){
        tables <- getNodeSet(htmlParse(links$URL[i]), "//table")
        
        nfldata <- readHTMLTable(tables[[1]], trim = TRUE, 
                                 stringsAsFactors = FALSE) %>%
                   mutate(Year = links$Year[i])
        
#add nfldata to the "passing" table      
        dataBuild <<- rbind(dataBuild, nfldata) 

#spl <<-strsplit(as.character(dataBuild$PLAYER), ",") 
  
#dataBuildA <<-  data.frame(PLAYER= sapply(spl, "[", 1), POS= sapply(spl, "[", 2), dataBuild, stringsAsFactors = F)%>%
                  #select(-PLAYER.1, -RK) %>% filter(PLAYER != "PLAYER")
        
#print(i) < This prints out a running count of the loop as it processes. 

    }
}
# Test it out by pulling scoring data.
get_ESPN(LinkMid = "scoring", LinkEnd = "/qualified/false/count/", npages = 7, by = 40) 

Clean up table for scoring.DF

# Rename the output to give it a unique identity.
scoring.DF <- dataBuild 

# Take a look at the result.
head(scoring.DF, n = 15L)
##    V1                      V2      V3   V4   V5   V6   V7   V8   V9  V10
## 1  RK                  PLAYER    TEAM RUSH  REC  RET   TD   FG   XP  2PT
## 2   1     Shaun Alexander, RB     SEA   27    1    0   28    0    0    0
## 3   2           Jay Feely, PK     NYG    0    0    0    0   35   43    0
## 4   3        Neil Rackers, PK     ARI    0    0    0    0   40   20    0
## 5   4       Shayne Graham, PK     CIN    0    0    0    0   28   47    0
## 6   5       Larry Johnson, RB      KC   20    1    0   21    0    0    0
## 7   6      Lawrence Tynes, PK      KC    0    0    0    0   27   44    0
## 8   7          John Kasay, PK     CAR    0    0    0    0   26   43    0
## 9   Â     Mike Vanderjagt, PK     IND    0    0    0    0   23   52    0
## 10  9 LaDainian Tomlinson, RB      SD   18    2    0   20    0    0    0
## 11 10        Jeff Wilkins, PK      LA    0    0    0    0   27   36    0
## 12  Â              TOUCHDOWNS SCORING <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 13 RK                  PLAYER    TEAM RUSH  REC  RET   TD   FG   XP  2PT
## 14  Â           Jeff Reed, PK     PIT    0    0    0    0   24   45    0
## 15 12          Jason Elam, PK     DEN    0    0    0    0   24   43    0
##     V11   V12 Year
## 1   PTS PTS/G 2005
## 2   168  10.5 2005
## 3   148   9.3 2005
## 4   140   9.3 2005
## 5   131   8.2 2005
## 6   126   7.9 2005
## 7   125   7.8 2005
## 8   121   7.6 2005
## 9   121   7.6 2005
## 10  120   7.5 2005
## 11  117   7.3 2005
## 12 <NA>  <NA> 2005
## 13  PTS PTS/G 2005
## 14  117   7.3 2005
## 15  115   7.2 2005
# First off - I have a couple of issues in this df as is. First - I have row entry that reads something like, "PLAYER    TEAM RUSH  REC  RET   TD   FG   XP  2PT", etc. and columns named like "V1 V2 V3 V4 V5 V6 V7", etc. Furthermore, I have an entry that reads, "TOUCHDOWNS SCORING <NA> <NA> <NA> <NA> <NA> <NA> <NA>" and ". This - as you'd expect - is not desirable. In addition to these issues - I have a column entry under player that has both the player name and his position (e.g. "Shaun Alexander, RB"). All of this will be addressed below. 

# Create a list of Column names.
listforCols <- scoring.DF %>%
                  filter(V2 == "PLAYER") %>%
                  select(-c(Year, V1)) %>%
                  distinct()
print(listforCols)
##       V2   V3   V4  V5  V6 V7 V8 V9 V10 V11   V12
## 1 PLAYER TEAM RUSH REC RET TD FG XP 2PT PTS PTS/G
# Split a data entry at a "," (This particular entry is the player name and position)
spl <<-strsplit(as.character(scoring.DF$V2), ",")

scoring.DF1 <- data.frame(PLAYER= sapply(spl, "[", 1), POS= sapply(spl, "[", 2),  
                          scoring.DF, stringsAsFactors = F) 
head(scoring.DF1, n = 15L)
##                 PLAYER  POS V1                      V2      V3   V4   V5
## 1               PLAYER <NA> RK                  PLAYER    TEAM RUSH  REC
## 2      Shaun Alexander   RB  1     Shaun Alexander, RB     SEA   27    1
## 3            Jay Feely   PK  2           Jay Feely, PK     NYG    0    0
## 4         Neil Rackers   PK  3        Neil Rackers, PK     ARI    0    0
## 5        Shayne Graham   PK  4       Shayne Graham, PK     CIN    0    0
## 6        Larry Johnson   RB  5       Larry Johnson, RB      KC   20    1
## 7       Lawrence Tynes   PK  6      Lawrence Tynes, PK      KC    0    0
## 8           John Kasay   PK  7          John Kasay, PK     CAR    0    0
## 9      Mike Vanderjagt   PK  Â     Mike Vanderjagt, PK     IND    0    0
## 10 LaDainian Tomlinson   RB  9 LaDainian Tomlinson, RB      SD   18    2
## 11        Jeff Wilkins   PK 10        Jeff Wilkins, PK      LA    0    0
## 12          TOUCHDOWNS <NA>  Â              TOUCHDOWNS SCORING <NA> <NA>
## 13              PLAYER <NA> RK                  PLAYER    TEAM RUSH  REC
## 14           Jeff Reed   PK  Â           Jeff Reed, PK     PIT    0    0
## 15          Jason Elam   PK 12          Jason Elam, PK     DEN    0    0
##      V6   V7   V8   V9  V10  V11   V12 Year
## 1   RET   TD   FG   XP  2PT  PTS PTS/G 2005
## 2     0   28    0    0    0  168  10.5 2005
## 3     0    0   35   43    0  148   9.3 2005
## 4     0    0   40   20    0  140   9.3 2005
## 5     0    0   28   47    0  131   8.2 2005
## 6     0   21    0    0    0  126   7.9 2005
## 7     0    0   27   44    0  125   7.8 2005
## 8     0    0   26   43    0  121   7.6 2005
## 9     0    0   23   52    0  121   7.6 2005
## 10    0   20    0    0    0  120   7.5 2005
## 11    0    0   27   36    0  117   7.3 2005
## 12 <NA> <NA> <NA> <NA> <NA> <NA>  <NA> 2005
## 13  RET   TD   FG   XP  2PT  PTS PTS/G 2005
## 14    0    0   24   45    0  117   7.3 2005
## 15    0    0   24   43    0  115   7.2 2005
# Remove column names and category tags.
scoring.DFA <<-  data.frame(PLAYER= sapply(spl, "[", 1), POS= sapply(spl, "[", 2)
                           , scoring.DF, stringsAsFactors = F)  %>%
                             select(-c(V2, V1)) %>% 
                             filter(PLAYER != "PLAYER") %>%
                             filter (PLAYER != "TOUCHDOWNS")

head(scoring.DFA, n = 15L)
##                 PLAYER POS  V3 V4 V5 V6 V7 V8 V9 V10 V11  V12 Year
## 1      Shaun Alexander  RB SEA 27  1  0 28  0  0   0 168 10.5 2005
## 2            Jay Feely  PK NYG  0  0  0  0 35 43   0 148  9.3 2005
## 3         Neil Rackers  PK ARI  0  0  0  0 40 20   0 140  9.3 2005
## 4        Shayne Graham  PK CIN  0  0  0  0 28 47   0 131  8.2 2005
## 5        Larry Johnson  RB  KC 20  1  0 21  0  0   0 126  7.9 2005
## 6       Lawrence Tynes  PK  KC  0  0  0  0 27 44   0 125  7.8 2005
## 7           John Kasay  PK CAR  0  0  0  0 26 43   0 121  7.6 2005
## 8      Mike Vanderjagt  PK IND  0  0  0  0 23 52   0 121  7.6 2005
## 9  LaDainian Tomlinson  RB  SD 18  2  0 20  0  0   0 120  7.5 2005
## 10        Jeff Wilkins  PK  LA  0  0  0  0 27 36   0 117  7.3 2005
## 11           Jeff Reed  PK PIT  0  0  0  0 24 45   0 117  7.3 2005
## 12          Jason Elam  PK DEN  0  0  0  0 24 43   0 115  7.2 2005
## 13         Matt Stover  PK BAL  0  0  0  0 30 23   0 113  7.1 2005
## 14        Rian Lindell  PK BUF  0  0  0  0 29 26   0 113  7.1 2005
## 15        Nate Kaeding  PK  SD  0  0  0  0 21 49   0 112  7.0 2005
# I use this to give me the names I want to change my current column names to. 
print(listforCols)  # Use this again to see the list of column names that I want to use. 
##       V2   V3   V4  V5  V6 V7 V8 V9 V10 V11   V12
## 1 PLAYER TEAM RUSH REC RET TD FG XP 2PT PTS PTS/G
colnames(scoring.DFA)[which(colnames(scoring.DFA) %in% c(
  "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11"
   , "V12") )] <- c("TEAM"
   , "RUSH", "REC", "RET", "TD", "FG", "XP", "2PT", "PTS"
   , "PTS-G") 

head(scoring.DFA, n = 15L) 
##                 PLAYER POS TEAM RUSH REC RET TD FG XP 2PT PTS PTS-G Year
## 1      Shaun Alexander  RB  SEA   27   1   0 28  0  0   0 168  10.5 2005
## 2            Jay Feely  PK  NYG    0   0   0  0 35 43   0 148   9.3 2005
## 3         Neil Rackers  PK  ARI    0   0   0  0 40 20   0 140   9.3 2005
## 4        Shayne Graham  PK  CIN    0   0   0  0 28 47   0 131   8.2 2005
## 5        Larry Johnson  RB   KC   20   1   0 21  0  0   0 126   7.9 2005
## 6       Lawrence Tynes  PK   KC    0   0   0  0 27 44   0 125   7.8 2005
## 7           John Kasay  PK  CAR    0   0   0  0 26 43   0 121   7.6 2005
## 8      Mike Vanderjagt  PK  IND    0   0   0  0 23 52   0 121   7.6 2005
## 9  LaDainian Tomlinson  RB   SD   18   2   0 20  0  0   0 120   7.5 2005
## 10        Jeff Wilkins  PK   LA    0   0   0  0 27 36   0 117   7.3 2005
## 11           Jeff Reed  PK  PIT    0   0   0  0 24 45   0 117   7.3 2005
## 12          Jason Elam  PK  DEN    0   0   0  0 24 43   0 115   7.2 2005
## 13         Matt Stover  PK  BAL    0   0   0  0 30 23   0 113   7.1 2005
## 14        Rian Lindell  PK  BUF    0   0   0  0 29 26   0 113   7.1 2005
## 15        Nate Kaeding  PK   SD    0   0   0  0 21 49   0 112   7.0 2005