Tutorial 3
library(dplyr)
library(XML) #for htmlParse & readHTMLTable
E.G.1: passing data
Set up table to run loop then clean table.
Create lookup table:
#Create lookup table
links <- expand.grid(LinkRoot = "http://espn.go.com/nfl/statistics/player/_/stat/",
LinkMid = "passing",
PHolder = "/year/",
Year = c(seq(2005,2015,1)),
LinkEnd = "/qualified/false/count/", # removed --> "sort/passingYards/"
Pages = c(1 + seq(0,80,40)), #pgs 1, 21, 41
KEEP.OUT.ATTRS = F, stringsAsFactors = F) %>%
mutate(URL = paste0(LinkRoot, LinkMid, PHolder, Year, LinkEnd, Pages))
head(links)
## LinkRoot LinkMid PHolder Year
## 1 http://espn.go.com/nfl/statistics/player/_/stat/ passing /year/ 2005
## 2 http://espn.go.com/nfl/statistics/player/_/stat/ passing /year/ 2006
## 3 http://espn.go.com/nfl/statistics/player/_/stat/ passing /year/ 2007
## 4 http://espn.go.com/nfl/statistics/player/_/stat/ passing /year/ 2008
## 5 http://espn.go.com/nfl/statistics/player/_/stat/ passing /year/ 2009
## 6 http://espn.go.com/nfl/statistics/player/_/stat/ passing /year/ 2010
## LinkEnd Pages
## 1 /qualified/false/count/ 1
## 2 /qualified/false/count/ 1
## 3 /qualified/false/count/ 1
## 4 /qualified/false/count/ 1
## 5 /qualified/false/count/ 1
## 6 /qualified/false/count/ 1
## URL
## 1 http://espn.go.com/nfl/statistics/player/_/stat/passing/year/2005/qualified/false/count/1
## 2 http://espn.go.com/nfl/statistics/player/_/stat/passing/year/2006/qualified/false/count/1
## 3 http://espn.go.com/nfl/statistics/player/_/stat/passing/year/2007/qualified/false/count/1
## 4 http://espn.go.com/nfl/statistics/player/_/stat/passing/year/2008/qualified/false/count/1
## 5 http://espn.go.com/nfl/statistics/player/_/stat/passing/year/2009/qualified/false/count/1
## 6 http://espn.go.com/nfl/statistics/player/_/stat/passing/year/2010/qualified/false/count/1
Get the table from the webpage
#get the table from the webpage
tables <- getNodeSet(htmlParse(links$URL[1]), "//table") #parses html and finds all rows (nodes) in the table
#make a base table from the frist page
dataBuild <- readHTMLTable(tables[[1]], trim = TRUE, stringsAsFactors = FALSE)
#loop through other pages
{ for(i in 1:nrow(links)){
tables <- getNodeSet(htmlParse(links$URL[i]), "//table")
nfldata <- readHTMLTable(tables[[1]], trim = TRUE,
stringsAsFactors = FALSE)
#add nfldata to the "passing" table
dataBuild <- rbind(dataBuild, nfldata)
}}
# Clean the df
passingFinal <- dataBuild %>%
filter(PLAYER != "PLAYER") %>%
select(-RK) %>%
mutate(Stat = "Passing",
Year = "2014",
YDS = gsub(",", "", YDS)) %>%
rename(YDSA = `YDS/A`, #use back ticks `` for columns with invalid characters
YDSG = `YDS/G`) %>%
mutate_each(funs(as.integer), c(COMP, ATT, YDS, LONG, TD, INT, SACK, YDSG))%>%
mutate_each(funs(as.numeric), c(PCT, YDSA, RATE, YDSG))%>% #converts all columns in c()
distinct()
So using this we can create a function:
get_ESPN <- function(LinkMid, LinkEnd, npages, by)
{
dataBuild <<- dataBuild[1,] # This is the base table that the rest of the loop is built on.
#create lookup table (double arrow "<<-" means to make it in the environment not just as a temporary table)
links <<- expand.grid(LinkRoot = "http://espn.go.com/nfl/statistics/player/_/stat/",
LinkMid = LinkMid,
PHolder = "/year/",
Year = c(seq(2005,2015,1)),
LinkEnd = LinkEnd,
Pages = c(1 + seq(0, (npages-1)*by, by)),
KEEP.OUT.ATTRS = F, stringsAsFactors = F) %>%
mutate(URL = paste0(LinkRoot, LinkMid, PHolder, Year, LinkEnd, Pages))
#URL.Only <<- links %>% select(URL)
#a<<- data.frame (Yr.Tag = gsub(pattern = "(.*year/)(.*)(/qualified.*)", replacement = "\\2", x = URL.Only[, 1], perl=T))
#get the table from the webpage
tables <- getNodeSet(htmlParse(links$URL[1]), "//table") #parses html and finds all rows (nodes) in the table
#make a base table from the frist page
dataBuild <<- readHTMLTable(tables[[1]], trim = TRUE, stringsAsFactors = FALSE) %>%
mutate(Year = links$Year[1])
#loop through other pages
for(i in 2:nrow(links)){
tables <- getNodeSet(htmlParse(links$URL[i]), "//table")
nfldata <- readHTMLTable(tables[[1]], trim = TRUE,
stringsAsFactors = FALSE) %>%
mutate(Year = links$Year[i])
#add nfldata to the "passing" table
dataBuild <<- rbind(dataBuild, nfldata)
#spl <<-strsplit(as.character(dataBuild$PLAYER), ",")
#dataBuildA <<- data.frame(PLAYER= sapply(spl, "[", 1), POS= sapply(spl, "[", 2), dataBuild, stringsAsFactors = F)%>%
#select(-PLAYER.1, -RK) %>% filter(PLAYER != "PLAYER")
#print(i) < This prints out a running count of the loop as it processes.
}
}
# Test it out by pulling scoring data.
get_ESPN(LinkMid = "scoring", LinkEnd = "/qualified/false/count/", npages = 7, by = 40)
Clean up table for scoring.DF
# Rename the output to give it a unique identity.
scoring.DF <- dataBuild
# Take a look at the result.
head(scoring.DF, n = 15L)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 RK PLAYER TEAM RUSH REC RET TD FG XP 2PT
## 2 1 Shaun Alexander, RB SEA 27 1 0 28 0 0 0
## 3 2 Jay Feely, PK NYG 0 0 0 0 35 43 0
## 4 3 Neil Rackers, PK ARI 0 0 0 0 40 20 0
## 5 4 Shayne Graham, PK CIN 0 0 0 0 28 47 0
## 6 5 Larry Johnson, RB KC 20 1 0 21 0 0 0
## 7 6 Lawrence Tynes, PK KC 0 0 0 0 27 44 0
## 8 7 John Kasay, PK CAR 0 0 0 0 26 43 0
## 9 Â Mike Vanderjagt, PK IND 0 0 0 0 23 52 0
## 10 9 LaDainian Tomlinson, RB SD 18 2 0 20 0 0 0
## 11 10 Jeff Wilkins, PK LA 0 0 0 0 27 36 0
## 12 Â TOUCHDOWNS SCORING <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 13 RK PLAYER TEAM RUSH REC RET TD FG XP 2PT
## 14 Â Jeff Reed, PK PIT 0 0 0 0 24 45 0
## 15 12 Jason Elam, PK DEN 0 0 0 0 24 43 0
## V11 V12 Year
## 1 PTS PTS/G 2005
## 2 168 10.5 2005
## 3 148 9.3 2005
## 4 140 9.3 2005
## 5 131 8.2 2005
## 6 126 7.9 2005
## 7 125 7.8 2005
## 8 121 7.6 2005
## 9 121 7.6 2005
## 10 120 7.5 2005
## 11 117 7.3 2005
## 12 <NA> <NA> 2005
## 13 PTS PTS/G 2005
## 14 117 7.3 2005
## 15 115 7.2 2005
# First off - I have a couple of issues in this df as is. First - I have row entry that reads something like, "PLAYER TEAM RUSH REC RET TD FG XP 2PT", etc. and columns named like "V1 V2 V3 V4 V5 V6 V7", etc. Furthermore, I have an entry that reads, "TOUCHDOWNS SCORING <NA> <NA> <NA> <NA> <NA> <NA> <NA>" and ". This - as you'd expect - is not desirable. In addition to these issues - I have a column entry under player that has both the player name and his position (e.g. "Shaun Alexander, RB"). All of this will be addressed below.
# Create a list of Column names.
listforCols <- scoring.DF %>%
filter(V2 == "PLAYER") %>%
select(-c(Year, V1)) %>%
distinct()
print(listforCols)
## V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 1 PLAYER TEAM RUSH REC RET TD FG XP 2PT PTS PTS/G
# Split a data entry at a "," (This particular entry is the player name and position)
spl <<-strsplit(as.character(scoring.DF$V2), ",")
scoring.DF1 <- data.frame(PLAYER= sapply(spl, "[", 1), POS= sapply(spl, "[", 2),
scoring.DF, stringsAsFactors = F)
head(scoring.DF1, n = 15L)
## PLAYER POS V1 V2 V3 V4 V5
## 1 PLAYER <NA> RK PLAYER TEAM RUSH REC
## 2 Shaun Alexander RB 1 Shaun Alexander, RB SEA 27 1
## 3 Jay Feely PK 2 Jay Feely, PK NYG 0 0
## 4 Neil Rackers PK 3 Neil Rackers, PK ARI 0 0
## 5 Shayne Graham PK 4 Shayne Graham, PK CIN 0 0
## 6 Larry Johnson RB 5 Larry Johnson, RB KC 20 1
## 7 Lawrence Tynes PK 6 Lawrence Tynes, PK KC 0 0
## 8 John Kasay PK 7 John Kasay, PK CAR 0 0
## 9 Mike Vanderjagt PK Â Mike Vanderjagt, PK IND 0 0
## 10 LaDainian Tomlinson RB 9 LaDainian Tomlinson, RB SD 18 2
## 11 Jeff Wilkins PK 10 Jeff Wilkins, PK LA 0 0
## 12 TOUCHDOWNS <NA> Â TOUCHDOWNS SCORING <NA> <NA>
## 13 PLAYER <NA> RK PLAYER TEAM RUSH REC
## 14 Jeff Reed PK Â Jeff Reed, PK PIT 0 0
## 15 Jason Elam PK 12 Jason Elam, PK DEN 0 0
## V6 V7 V8 V9 V10 V11 V12 Year
## 1 RET TD FG XP 2PT PTS PTS/G 2005
## 2 0 28 0 0 0 168 10.5 2005
## 3 0 0 35 43 0 148 9.3 2005
## 4 0 0 40 20 0 140 9.3 2005
## 5 0 0 28 47 0 131 8.2 2005
## 6 0 21 0 0 0 126 7.9 2005
## 7 0 0 27 44 0 125 7.8 2005
## 8 0 0 26 43 0 121 7.6 2005
## 9 0 0 23 52 0 121 7.6 2005
## 10 0 20 0 0 0 120 7.5 2005
## 11 0 0 27 36 0 117 7.3 2005
## 12 <NA> <NA> <NA> <NA> <NA> <NA> <NA> 2005
## 13 RET TD FG XP 2PT PTS PTS/G 2005
## 14 0 0 24 45 0 117 7.3 2005
## 15 0 0 24 43 0 115 7.2 2005
# Remove column names and category tags.
scoring.DFA <<- data.frame(PLAYER= sapply(spl, "[", 1), POS= sapply(spl, "[", 2)
, scoring.DF, stringsAsFactors = F) %>%
select(-c(V2, V1)) %>%
filter(PLAYER != "PLAYER") %>%
filter (PLAYER != "TOUCHDOWNS")
head(scoring.DFA, n = 15L)
## PLAYER POS V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 Year
## 1 Shaun Alexander RB SEA 27 1 0 28 0 0 0 168 10.5 2005
## 2 Jay Feely PK NYG 0 0 0 0 35 43 0 148 9.3 2005
## 3 Neil Rackers PK ARI 0 0 0 0 40 20 0 140 9.3 2005
## 4 Shayne Graham PK CIN 0 0 0 0 28 47 0 131 8.2 2005
## 5 Larry Johnson RB KC 20 1 0 21 0 0 0 126 7.9 2005
## 6 Lawrence Tynes PK KC 0 0 0 0 27 44 0 125 7.8 2005
## 7 John Kasay PK CAR 0 0 0 0 26 43 0 121 7.6 2005
## 8 Mike Vanderjagt PK IND 0 0 0 0 23 52 0 121 7.6 2005
## 9 LaDainian Tomlinson RB SD 18 2 0 20 0 0 0 120 7.5 2005
## 10 Jeff Wilkins PK LA 0 0 0 0 27 36 0 117 7.3 2005
## 11 Jeff Reed PK PIT 0 0 0 0 24 45 0 117 7.3 2005
## 12 Jason Elam PK DEN 0 0 0 0 24 43 0 115 7.2 2005
## 13 Matt Stover PK BAL 0 0 0 0 30 23 0 113 7.1 2005
## 14 Rian Lindell PK BUF 0 0 0 0 29 26 0 113 7.1 2005
## 15 Nate Kaeding PK SD 0 0 0 0 21 49 0 112 7.0 2005
# I use this to give me the names I want to change my current column names to.
print(listforCols) # Use this again to see the list of column names that I want to use.
## V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 1 PLAYER TEAM RUSH REC RET TD FG XP 2PT PTS PTS/G
colnames(scoring.DFA)[which(colnames(scoring.DFA) %in% c(
"V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11"
, "V12") )] <- c("TEAM"
, "RUSH", "REC", "RET", "TD", "FG", "XP", "2PT", "PTS"
, "PTS-G")
head(scoring.DFA, n = 15L)
## PLAYER POS TEAM RUSH REC RET TD FG XP 2PT PTS PTS-G Year
## 1 Shaun Alexander RB SEA 27 1 0 28 0 0 0 168 10.5 2005
## 2 Jay Feely PK NYG 0 0 0 0 35 43 0 148 9.3 2005
## 3 Neil Rackers PK ARI 0 0 0 0 40 20 0 140 9.3 2005
## 4 Shayne Graham PK CIN 0 0 0 0 28 47 0 131 8.2 2005
## 5 Larry Johnson RB KC 20 1 0 21 0 0 0 126 7.9 2005
## 6 Lawrence Tynes PK KC 0 0 0 0 27 44 0 125 7.8 2005
## 7 John Kasay PK CAR 0 0 0 0 26 43 0 121 7.6 2005
## 8 Mike Vanderjagt PK IND 0 0 0 0 23 52 0 121 7.6 2005
## 9 LaDainian Tomlinson RB SD 18 2 0 20 0 0 0 120 7.5 2005
## 10 Jeff Wilkins PK LA 0 0 0 0 27 36 0 117 7.3 2005
## 11 Jeff Reed PK PIT 0 0 0 0 24 45 0 117 7.3 2005
## 12 Jason Elam PK DEN 0 0 0 0 24 43 0 115 7.2 2005
## 13 Matt Stover PK BAL 0 0 0 0 30 23 0 113 7.1 2005
## 14 Rian Lindell PK BUF 0 0 0 0 29 26 0 113 7.1 2005
## 15 Nate Kaeding PK SD 0 0 0 0 21 49 0 112 7.0 2005