Based on information from Mark Rijckenberg post in SABR101x
See https://github.com/BurntSushi/nfldb
https://github.com/BurntSushi/nflgame
These scrape NFL data using Python. Alternatively just download the latest version of the database from http://burntsushi.net/stuff/nfldb/nfldb.sql.zip - 15MB, 200MB unzipped.
This file demonstrates use of the downloaded database with the RPostgreSQL package. Note that it is not quite turnkey because we need to create a PostgreSQL database from the downloaded database dump.
# If necessary download the file (can be improved to download if > n days old)
dbUrl <- "http://burntsushi.net/stuff/nfldb/nfldb.sql.zip"
dbDir = "./data"
dbZipPath <- "./data/nfldb.zip.sql"
dbPath <- "./data/nfldb.sql"
dateDownloadedFile <- "./data/dateDownloaded.txt"
if (!file.exists(dbDir)) {
dir.create(dbDir)
}
if (!file.exists(dbPath)) {
download.file(dbUrl, destfile = dbZipPath)
unzip(dbZipPath, exdir=dbDir)
dateDownloaded <- date()
dateDownloaded
cat(dateDownloaded, file=dateDownloadedFile)
print(paste("Downloaded database", dbPath, "at"))
cat(readChar(dateDownloadedFile, 1e5))
} else if (file.exists(dateDownloadedFile)) {
print(paste("Database", dbPath, "downloaded on"))
cat(readChar(dateDownloadedFile, 1e5))
}
## [1] "Database ./data/nfldb.sql downloaded on"
## Mon Jul 14 07:10:00 2014
Examine the database. See R and PostgreSQL using RPostgreSQL and sqldf.
PostgreSQL will need to be downloaded and configured separately: PostGreSQL
I created my PostgreSQL database using the downloaded SQL file:
psql -h localhost -d nfldb -U postgres -f nfldb.sql
For more information about databases in general see Introduction to Databases - Stanford Online also available as Introduction to Databases.
Database Master is useful as an interface to multiple database platforms (e.g. PostgreSQL, MySQL, SQLite).
if (!require(RPostgreSQL)) {
install.packages("RPostgreSQL")
library(RPostgreSQL)
}
## Loading required package: RPostgreSQL
## Loading required package: DBI
# # Use sqldf instead
# # Establish connection to PoststgreSQL using RPostgreSQL
# drv <- dbDriver("PostgreSQL")
# # con <- dbConnect(drv) # Simple version (localhost as default)
# # Full version of connection setting
# con <- dbConnect(drv, dbname="nfldb",host="localhost",port=5432,user="postgre",password="password", )
# # At the end of your script, do not forget to close the connection (see below)!
# With sqldf you can easily execute any SQL command. For example, you can drop
# table, create table, insert new records. Or just execute queries.
library("sqldf")
## Warning: package 'sqldf' was built under R version 3.0.3
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.0.3
## Loading required package: proto
## Loading required namespace: tcltk
## Loading required package: RSQLite
## Loading required package: RSQLite.extfuns
## sqldf will default to using PostgreSQL
# Connection to PostgreSQL database with sqldf package has to be set in a
# different way, by setting RPostgreSQL options:
options(sqldf.RPostgreSQL.user ="postgres",
#sqldf.RPostgreSQL.password ="password", # Set above and not echoed
sqldf.RPostgreSQL.dbname ="nfldb",
sqldf.RPostgreSQL.host ="localhost",
sqldf.RPostgreSQL.port =5432)
sqldf("
/* sql comments can be used */
/* Not as nice as mySQL: show tables; */
SELECT * FROM pg_catalog.pg_tables WHERE schemaname = 'public';
")
## Loading required package: tcltk
## schemaname tablename tableowner tablespace hasindexes hasrules
## 1 public meta postgres <NA> FALSE FALSE
## 2 public player postgres <NA> TRUE FALSE
## 3 public play postgres <NA> TRUE FALSE
## 4 public team postgres <NA> TRUE FALSE
## 5 public drive postgres <NA> TRUE FALSE
## 6 public game postgres <NA> TRUE FALSE
## 7 public play_player postgres <NA> TRUE FALSE
## hastriggers
## 1 FALSE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
## 7 TRUE
sqldf("select * from team;")
## team_id city name
## 1 ARI Arizona Cardinals
## 2 ATL Atlanta Falcons
## 3 BAL Baltimore Ravens
## 4 BUF Buffalo Bills
## 5 CAR Carolina Panthers
## 6 CHI Chicago Bears
## 7 CIN Cincinnati Bengals
## 8 CLE Cleveland Browns
## 9 DAL Dallas Cowboys
## 10 DEN Denver Broncos
## 11 DET Detroit Lions
## 12 GB Green Bay Packers
## 13 HOU Houston Texans
## 14 IND Indianapolis Colts
## 15 JAC Jacksonville Jaguars
## 16 KC Kansas City Chiefs
## 17 MIA Miami Dolphins
## 18 MIN Minnesota Vikings
## 19 NE New England Patriots
## 20 NO New Orleans Saints
## 21 OAK Oakland Raiders
## 22 PHI Philadelphia Eagles
## 23 PIT Pittsburgh Steelers
## 24 SD San Diego Chargers
## 25 SEA Seattle Seahawks
## 26 SF San Francisco 49ers
## 27 STL St. Louis Rams
## 28 TB Tampa Bay Buccaneers
## 29 TEN Tennessee Titans
## 30 WAS Washington Redskins
## 31 UNK UNK UNK
## 32 NYG New York Giants
## 33 NYJ New York Jets
# For fun take a look at the 10 oldest players
players <- sqldf("select * from player;")
bd <- as.Date(players$birthdate, "%m/%d/%Y")
head(players[order(bd),])
## player_id gsis_name full_name first_name last_name team position
## 728 00-0002655 J.Carney John Carney John Carney UNK UNK
## 452 00-0005113 J.Feagles Jeff Feagles Jeff Feagles UNK UNK
## 2977 00-0015784 M.Stover Matt Stover Matt Stover UNK UNK
## 5178 00-0016689 M.Turk Matt Turk Matt Turk UNK UNK
## 4059 00-0014619 J.Seau Junior Seau Junior Seau UNK UNK
## 754 00-0005106 B.Favre Brett Favre Brett Favre UNK UNK
## profile_id profile_url
## 728 2499991 http://www.nfl.com/player/johncarney/2499991/profile
## 452 2500608 http://www.nfl.com/player/jefffeagles/2500608/profile
## 2977 2503167 http://www.nfl.com/player/mattstover/2503167/profile
## 5178 2503416 http://www.nfl.com/player/mattturk/2503416/profile
## 4059 2502886 http://www.nfl.com/player/juniorseau/2502886/profile
## 754 2500606 http://www.nfl.com/player/brettfavre/2500606/profile
## uniform_number birthdate college height weight
## 728 NA 4/20/1964 Notre Dame 71 185
## 452 NA 3/7/1966 Miami (Fla.) 73 215
## 2977 NA 1/27/1968 Louisiana Tech 71 180
## 5178 NA 6/16/1968 Wis.-Whitewater 77 251
## 4059 NA 1/19/1969 USC 75 250
## 754 NA 10/10/1969 Southern Mississippi 74 222
## years_pro status
## 728 23 Unknown
## 452 22 Unknown
## 2977 19 Unknown
## 5178 17 Unknown
## 4059 20 Unknown
## 754 20 Unknown
sqldf("select * from drive limit 10;")
## Warning: RS-DBI driver warning: (unrecognized PostgreSQL field type field_pos (id:40965) in column 2)
## Warning: RS-DBI driver warning: (unrecognized PostgreSQL field type game_time (id:41005) in column 3)
## Warning: RS-DBI driver warning: (unrecognized PostgreSQL field type field_pos (id:40965) in column 4)
## Warning: RS-DBI driver warning: (unrecognized PostgreSQL field type game_time (id:41005) in column 5)
## Warning: RS-DBI driver warning: (unrecognized PostgreSQL field type pos_period (id:41087) in column 7)
## gsis_id drive_id start_field start_time end_field end_time pos_team
## 1 2009081350 1 (-30) (Q1,0) (-11) (Q1,239) PIT
## 2 2009081350 2 (-35) (Q1,239) (7) (Q1,526) ARI
## 3 2009081350 3 (-42) (Q1,526) (-14) (Q1,870) PIT
## 4 2009081350 4 (-29) (Q1,870) (11) (Q2,244) ARI
## 5 2009081350 5 (-39) (Q2,244) (18) (Q2,437) PIT
## 6 2009081350 6 (-42) (Q2,437) (43) (Q2,838) ARI
## 7 2009081350 7 (-43) (Q2,838) (-41) (Q2,862) PIT
## 8 2009081350 8 (-8) (Q2,862) (39) (Q2,900) ARI
## 9 2009081350 9 (-19) (Q3,0) (-18) (Q3,98) ARI
## 10 2009081350 10 (-26) (Q3,98) (32) (Q3,447) PIT
## pos_time first_downs result penalty_yards yards_gained play_count
## 1 (239) 1 Punt 4 15 6
## 2 (287) 3 Punt 0 42 10
## 3 (344) 2 Punt -5 33 8
## 4 (274) 3 Punt 0 40 10
## 5 (193) 1 Field Goal -5 62 6
## 6 (401) 5 Downs 0 85 14
## 7 (24) 0 Punt 0 2 3
## 8 (38) 1 Field Goal 40 7 3
## 9 (98) 0 Punt 0 1 3
## 10 (349) 3 Missed FG 0 58 13
## time_inserted time_updated
## 1 2013-09-03 22:36:23 2013-09-03 22:36:23
## 2 2013-09-03 22:36:23 2013-09-03 22:36:23
## 3 2013-09-03 22:36:23 2013-09-03 22:36:23
## 4 2013-09-03 22:36:23 2013-09-03 22:36:23
## 5 2013-09-03 22:36:23 2013-09-03 22:36:23
## 6 2013-09-03 22:36:23 2013-09-03 22:36:23
## 7 2013-09-03 22:36:23 2013-09-03 22:36:23
## 8 2013-09-03 22:36:23 2013-09-03 22:36:23
## 9 2013-09-03 22:36:23 2013-09-03 22:36:23
## 10 2013-09-03 22:36:23 2013-09-03 22:36:23
sqldf("select * from game limit 10;")
## gsis_id gamekey start_time week day_of_week season_year
## 1 2013110306 55964 2013-11-03 13:05:00 9 Sunday 2013
## 2 2013110301 55959 2013-11-03 10:00:00 9 Sunday 2013
## 3 2013110300 55958 2013-11-03 10:00:00 9 Sunday 2013
## 4 2013110303 55961 2013-11-03 10:00:00 9 Sunday 2013
## 5 2011111304 55301 2011-11-13 10:00:00 10 Sunday 2011
## 6 2011111305 55302 2011-11-13 10:00:00 10 Sunday 2011
## 7 2011111306 55304 2011-11-13 10:00:00 10 Sunday 2011
## 8 2011111307 55297 2011-11-13 13:15:00 10 Sunday 2011
## 9 2011081852 55439 2011-08-18 17:00:00 2 Thursday 2011
## 10 2011111301 55296 2011-11-13 10:00:00 10 Sunday 2011
## season_type finished home_team home_score home_score_q1 home_score_q2
## 1 Regular TRUE OAK 20 3 10
## 2 Regular TRUE CAR 34 7 7
## 3 Regular TRUE BUF 13 7 3
## 4 Regular TRUE NYJ 26 3 17
## 5 Regular TRUE IND 3 0 3
## 6 Regular TRUE KC 10 0 0
## 7 Regular TRUE PHI 17 0 14
## 8 Regular TRUE CHI 37 10 10
## 9 Preseason TRUE PIT 24 7 14
## 10 Regular TRUE CAR 3 0 0
## home_score_q3 home_score_q4 home_score_q5 home_turnovers away_team
## 1 0 7 0 1 PHI
## 2 3 17 0 3 ATL
## 3 3 0 0 1 KC
## 4 3 3 0 1 NO
## 5 0 0 0 0 JAC
## 6 7 3 0 0 DEN
## 7 0 3 0 0 ARI
## 8 17 0 0 3 DET
## 9 0 3 0 2 PHI
## 10 3 0 0 3 TEN
## away_score away_score_q1 away_score_q2 away_score_q3 away_score_q4
## 1 49 7 21 21 0
## 2 10 0 10 0 0
## 3 23 0 3 10 10
## 4 20 7 7 3 3
## 5 17 3 0 7 7
## 6 17 7 3 0 7
## 7 21 0 7 0 14
## 8 13 0 6 0 7
## 9 14 0 0 0 14
## 10 30 14 3 3 10
## away_score_q5 away_turnovers time_inserted time_updated
## 1 0 3 2013-09-03 22:36:23 2014-01-06 11:07:40
## 2 0 3 2013-09-03 22:36:23 2014-01-06 11:07:40
## 3 0 3 2013-09-03 22:36:23 2014-01-06 11:07:40
## 4 0 0 2013-09-03 22:36:23 2014-01-06 11:07:40
## 5 0 2 2013-09-03 22:36:23 2014-01-06 11:07:40
## 6 0 1 2013-09-03 22:36:23 2014-01-06 11:07:40
## 7 0 3 2013-09-03 22:36:23 2014-01-06 11:07:40
## 8 0 3 2013-09-03 22:36:23 2014-01-06 11:07:40
## 9 0 0 2013-09-03 22:36:23 2014-01-06 11:07:40
## 10 0 3 2013-09-03 22:36:23 2014-01-06 11:07:40
sqldf("select * from play limit 10;") # 70MB total
## Warning: RS-DBI driver warning: (unrecognized PostgreSQL field type game_time (id:41005) in column 3)
## Warning: RS-DBI driver warning: (unrecognized PostgreSQL field type field_pos (id:40965) in column 5)
## gsis_id drive_id play_id time pos_team yardline down yards_to_go
## 1 2009081350 1 36 (Q1,0) ARI (-20) NA 0
## 2 2009081350 1 59 (Q1,0) PIT (-30) 1 10
## 3 2009081350 1 80 (Q1,39) PIT (-31) 2 11
## 4 2009081350 1 104 (Q1,82) PIT (-25) 3 5
## 5 2013120804 1 35 (Q1,0) NO (-15) NA 0
## 6 2009081350 1 144 (Q1,96) PIT (-21) 3 1
## 7 2009081350 1 169 (Q1,139) PIT (-18) 1 10
## 8 2009081350 1 193 (Q1,181) PIT (-15) 2 7
## 9 2009081350 1 214 (Q1,225) PIT (-11) 3 3
## 10 2009081350 1 258 (Q1,232) PIT (-11) 4 3
## description
## 1 N.Rackers kicks 75 yards from ARI 30 to PIT -5. M.Wallace, Touchback.
## 2 (15:00) R.Mendenhall left guard to PIT 19 for -1 yards (C.Campbell).
## 3 (14:21) B.Roethlisberger pass short right to H.Ward to PIT 25 for 6 yards (A.Rolle).
## 4 (13:38) (Shotgun) PENALTY on ARI-A.Wilson, Defensive Offside, 4 yards, enforced at PIT 25 - No Play. UNABATED TO THE QB
## 5 T.Morstead kicks 65 yards from NO 35 to end zone, Touchback.
## 6 (13:24) R.Mendenhall up the middle to PIT 32 for 3 yards (K.Dansby, A.Wilson).
## 7 (12:41) B.Roethlisberger pass short right to H.Miller to PIT 35 for 3 yards (K.Dansby, G.Hayes).
## 8 (11:59) R.Mendenhall right guard to PIT 39 for 4 yards (G.Hayes, C.Campbell).
## 9 (11:15) (Shotgun) B.Roethlisberger pass incomplete short right to S.Holmes. Penalty on PIT-W.Colon, Illegal Formation, declined.
## 10 (11:08) (Punt formation) D.Sepulveda punts 46 yards to ARI 15, Center-G.Warren, fair catch by A.Rolle.
## note time_inserted time_updated first_down
## 1 KICKOFF 2013-09-03 22:36:23 2013-09-03 22:36:23 0
## 2 <NA> 2013-09-03 22:36:23 2013-09-03 22:36:23 0
## 3 <NA> 2013-09-03 22:36:23 2013-09-03 22:36:23 0
## 4 PENALTY 2013-09-03 22:36:23 2013-09-03 22:36:23 0
## 5 KICKOFF 2013-12-09 14:52:50 2013-12-09 14:52:50 0
## 6 <NA> 2013-09-03 22:36:23 2013-09-03 22:36:23 1
## 7 <NA> 2013-09-03 22:36:23 2013-09-03 22:36:23 0
## 8 <NA> 2013-09-03 22:36:23 2013-09-03 22:36:23 0
## 9 <NA> 2013-09-03 22:36:23 2013-09-03 22:36:23 0
## 10 PUNT 2013-09-03 22:36:23 2013-09-03 22:36:23 0
## fourth_down_att fourth_down_conv fourth_down_failed passing_first_down
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## 7 0 0 0 0
## 8 0 0 0 0
## 9 0 0 0 0
## 10 0 0 0 0
## penalty penalty_first_down penalty_yds rushing_first_down
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 1 0 4 0
## 5 0 0 0 0
## 6 0 0 0 1
## 7 0 0 0 0
## 8 0 0 0 0
## 9 0 0 0 0
## 10 0 0 0 0
## third_down_att third_down_conv third_down_failed timeout xp_aborted
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 1 1 0 0 0
## 7 0 0 0 0 0
## 8 0 0 0 0 0
## 9 1 0 1 0 0
## 10 0 0 0 0 0
# play_player has many fields
play_player <- sqldf("select * from play_player limit 10;") # 160MB total
play_player[,1:5]
## gsis_id drive_id play_id player_id team
## 1 2009081350 1 36 00-0019676 ARI
## 2 2009081350 1 59 00-0026163 PIT
## 3 2009081350 1 59 00-0026190 ARI
## 4 2009081350 1 80 00-0017162 PIT
## 5 2009081350 1 80 00-0023443 ARI
## 6 2009081350 1 80 00-0022924 PIT
## 7 2009081350 1 104 00-0020505 ARI
## 8 2009081350 1 144 00-0020505 ARI
## 9 2009081350 1 144 00-0022927 ARI
## 10 2009081350 1 144 00-0026163 PIT
# Close PostgreSQL connection
# dbDisconnect(con)