setwd('/home/ryan/sports/nfl-life-death/')
library('tidyverse')
library('zoo')
Bring in all the player data from pro-football-reference.com
careers <- read_csv('data/nfl-players.csv')
## Parsed with column specification:
## cols(
## name = col_character(),
## srid = col_character(),
## active = col_character(),
## hof = col_character(),
## positions = col_character(),
## start = col_integer(),
## end = col_integer(),
## letter = col_character()
## )
head(careers)
Unfurl the data into separate rows for each position listed
allpositions <- careers %>%
select(name, srid, active, positions, start, end) %>%
replace_na(list(positions = '')) %>%
mutate(positions = str_replace(positions, ',|/', '-')) %>%
separate_rows(positions, sep = '-')
head(allpositions)
What positions exist over time?
ggplot(data = allpositions) +
geom_bar(mapping = aes(x = start))
allpositions %>%
group_by(start) %>%
count() %>%
arrange(desc(n)) %>%
head()
Why the bump in 1987? Strike season!
What positions exist at what times?
ggplot(data = allpositions) +
geom_bar(mapping = aes(x = start)) +
facet_grid(positions ~ .) +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank())
There are a lot of “backs” (wing backs, blocking backs, etc.) in earlier years that are probably not worth including now.
Spit out the positions so I can classify them
posfreq <- allpositions %>%
count(positions)
write_csv(posfreq, 'data/positions.csv')
Bring in the “simple” positions
simplepos <- read_csv('data/positions-categorized.csv')
## Parsed with column specification:
## cols(
## positions = col_character(),
## n = col_integer(),
## category = col_character()
## )
knitr::kable(simplepos %>% arrange(category, desc(n)))
positions | n | category |
---|---|---|
DB | 3558 | defback |
CB | 314 | defback |
S | 85 | defback |
FS | 69 | defback |
SS | 62 | defback |
DE | 2098 | defline |
DT | 1668 | defline |
DL | 50 | defline |
DG | 21 | defline |
LB | 3096 | linebacker |
OLB | 146 | linebacker |
ILB | 75 | linebacker |
MG | 25 | linebacker |
G | 2845 | offline |
T | 2712 | offline |
C | 1392 | offline |
NT | 359 | offline |
OT | 116 | offline |
OG | 71 | offline |
OL | 61 | offline |
QB | 1019 | quarterback |
RB | 2077 | runback |
HB | 939 | runback |
FB | 816 | runback |
TB | 278 | runback |
TE | 1258 | tightend |
WR | 2399 | widerec |
E | 1175 | NA |
B | 551 | NA |
K | 380 | NA |
P | 331 | NA |
WB | 281 | NA |
BB | 267 | NA |
FL | 91 | NA |
SE | 68 | NA |
LS | 26 | NA |
KR | 1 | NA |
PR | 1 | NA |
NA | 1 | NA |
Join the simple positions back to the data
allpositions <- allpositions %>%
left_join(simplepos %>% filter(!is.na(category)) %>% select(positions, category)) %>%
replace_na(list(category = 'other'))
## Joining, by = "positions"
Now chart positions over time
ggplot(data = allpositions) +
geom_bar(mapping = aes(x = start)) +
facet_grid(category ~ .) +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank())
Very little of “other” in recent years
Let’s filter down to “retired” players (where active is False, according to PFR)
retired <- allpositions %>%
filter(active == "False" & category != 'other') %>%
mutate(career = (end - start))
Now calculate raw average for each position in each year and plot it
# lining it up to match WSJ plot
retired$category <- factor(
x = retired$category,
levels = c('offline','defback','quarterback','linebacker','defline','tightend','runback','widerec')
)
careers.by.pos <- retired %>%
group_by(category, end) %>%
summarise(avgcareer = mean(career))
ggplot(data = careers.by.pos) +
geom_bar(mapping = aes(x = end, y = avgcareer), stat = 'identity') +
facet_grid(. ~ category) +
scale_x_continuous(limits = c(2000, 2014))
## Warning: Removed 541 rows containing missing values (position_stack).
There is some noise. Will a moving average smooth it out?
window <- 5 # years
run.careers.by.pos <- retired %>%
group_by(category, end) %>%
summarise(careers = sum(career),
players = n()) %>%
arrange(category, end) %>%
group_by(category) %>%
mutate(sumcareers = rollsumr(careers, k = window, na.pad = TRUE),
sumplayers = rollsumr(players, k = window, na.pad = TRUE),
runavgcareer = sumcareers / sumplayers)
ggplot(data = run.careers.by.pos) +
geom_bar(mapping = aes(x = end, y = runavgcareer), stat = 'identity') +
facet_grid(. ~ category) +
scale_x_continuous(limits = c(2000, 2014))
## Warning: Removed 541 rows containing missing values (position_stack).