Task:
If you want some practice scraping web pages, consider the challenge of getting the National Football League Super Bowl results published here http://www.espn.com/nfl/superbowl/history/winners into a data frame.
Libraries
library(rvest)
library(tidyverse)
library(knitr)
Read the HTML page into R
# Address of the ESPN page that lists every Super Bowl result.
url <- "http://www.espn.com/nfl/superbowl/history/winners"
# Download and parse the page, then select every <table> element on it.
football <- read_html(url)
tables <- html_nodes(football, css = "table")
# html_table() yields a list holding one table per matched node.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
football1 <- html_table(tables, header = TRUE)
# Flattening the list into a single data frame makes the repeated title text
# unique: Super.Bowl.Winners.and.Results, then .1, .2 and .3.
football2 <- as.data.frame(football1)
# The table's real column labels (NO., DATE, SITE, RESULT) arrive as data row 1.
head(football2, 15)
## Super.Bowl.Winners.and.Results Super.Bowl.Winners.and.Results.1
## 1 NO. DATE
## 2 I Jan. 15, 1967
## 3 II Jan. 14, 1968
## 4 III Jan. 12, 1969
## 5 IV Jan. 11, 1970
## 6 V Jan. 17, 1971
## 7 VI Jan. 16, 1972
## 8 VII Jan. 14, 1973
## 9 VIII Jan. 13, 1974
## 10 IX Jan. 12, 1975
## 11 X Jan. 18, 1976
## 12 XI Jan. 9, 1977
## 13 XII Jan. 15, 1978
## 14 XIII Jan. 21, 1979
## 15 XIV Jan. 20, 1980
## Super.Bowl.Winners.and.Results.2 Super.Bowl.Winners.and.Results.3
## 1 SITE RESULT
## 2 Los Angeles Memorial Coliseum Green Bay 35, Kansas City 10
## 3 Orange Bowl (Miami) Green Bay 33, Oakland 14
## 4 Orange Bowl (Miami) New York Jets 16, Baltimore 7
## 5 Tulane Stadium (New Orleans) Kansas City 23, Minnesota 7
## 6 Orange Bowl (Miami) Baltimore 16, Dallas 13
## 7 Tulane Stadium (New Orleans) Dallas 24, Miami 3
## 8 Los Angeles Memorial Coliseum Miami 14, Washington 7
## 9 Rice Stadium (Houston) Miami 24, Minnesota 7
## 10 Tulane Stadium (New Orleans) Pittsburgh 16, Minnesota 6
## 11 Orange Bowl (Miami) Pittsburgh 21, Dallas 17
## 12 Rose Bowl (Pasadena, Calif.) Oakland 32, Minnesota 14
## 13 Superdome (New Orleans) Dallas 27, Denver 10
## 14 Orange Bowl (Miami) Pittsburgh 35, Dallas 31
## 15 Rose Bowl (Pasadena, Calif.) Pittsburgh 31, Los Angeles 19
Rename football columns
# Give the four scraped columns short, descriptive names.
football3 <- rename(
  football2,
  Num = Super.Bowl.Winners.and.Results,
  Date = Super.Bowl.Winners.and.Results.1,
  Site = Super.Bowl.Winners.and.Results.2,
  Result = Super.Bowl.Winners.and.Results.3
)
Convert the Date column from character to Date
# Parse the scraped date text (e.g. "Jan. 15, 1967") into Date values.
# Text that does not match the format, such as a label row, becomes NA.
football3[["Date"]] <- as.Date(football3[["Date"]], format = "%b.%d,%Y")
Keep only recent Super Bowls (February 2010 onward)
# Keep only the games played on or after 1 February 2010.
# The column is referenced by its bare name so filter() evaluates it inside
# the piped data frame, and the cutoff is an explicit Date rather than a
# string that relies on implicit coercion.
# filter() keeps only rows where the condition is TRUE, so rows whose Date
# is NA are dropped here as well.
football3 <- football3 %>%
  filter(Date >= as.Date("2010-02-01"))
Visualization
# Plot one point per game, pairing each Result with its Super Bowl number.
# Num is turned into a factor ordered by Date for the plot only, so the games
# appear chronologically instead of alphabetically by Roman numeral.
# Columns are mapped by bare name: writing football3$... inside aes() bypasses
# the data argument and is discouraged by ggplot2.
football3 %>%
  mutate(Num = factor(Num, levels = unique(Num[order(Date)]))) %>%
  ggplot(aes(x = Result, y = Num)) +
  geom_point(aes(color = Result)) +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none")
# Render the filtered data as a table; the pipeline above does not modify
# football3, so Num is still plain text here.
kable(football3)
Num | Date | Site | Result |
---|---|---|---|
XLIV | 2010-02-07 | Sun Life Stadium (Miami) | New Orleans Saints 31, Indianapolis Colts 17 |
XLV | 2011-02-06 | Cowboys Stadium (Arlington, Texas) | Green Bay Packers 31, Pittsburgh Steelers 25 |
XLVI | 2012-02-05 | Lucas Oil Stadium (Indianapolis) | New York Giants 21, New England Patriots 17 |
XLVII | 2013-02-03 | Mercedes-Benz Superdome (New Orleans) | Baltimore Ravens 34, San Francisco 49ers 31 |
XLVIII | 2014-02-02 | MetLife Stadium (East Rutherford, N.J.) | Seattle Seahawks 43, Denver Broncos 8 |
XLIX | 2015-02-01 | University of Phoenix Stadium (Glendale, Ariz.) | New England Patriots 28, Seattle Seahawks 24 |
50 | 2016-02-07 | Levi’s Stadium (Santa Clara, Calif.) | Denver Broncos 24, Carolina Panthers 10 |
LI | 2017-02-05 | NRG Stadium (Houston) | New England Patriots 34, Atlanta Falcons 28 |
LII | 2018-02-04 | U.S. Bank Stadium (Minneapolis) | Philadelphia Eagles 41, New England Patriots 33 |