setwd('/home/ryan/sports/nfl-life-death/')
library('tidyverse')
library('zoo')

Data import and cleaning

Bring in all the player data from pro-football-reference.com

careers <- read_csv('data/nfl-players.csv')
## Parsed with column specification:
## cols(
##   name = col_character(),
##   srid = col_character(),
##   active = col_character(),
##   hof = col_character(),
##   positions = col_character(),
##   start = col_integer(),
##   end = col_integer(),
##   letter = col_character()
## )
head(careers)

Unfurl the data into separate rows for each position listed

allpositions <- careers %>% 
  select(name, srid, active, positions, start, end) %>% 
  replace_na(list(positions = '')) %>% 
  mutate(positions = str_replace(positions, ',|/', '-')) %>% 
  separate_rows(positions, sep = '-')
head(allpositions)

What positions exist over time?

ggplot(data = allpositions) +
  geom_bar(mapping = aes(x = start))

allpositions %>% 
  group_by(start) %>% 
  count() %>% 
  arrange(desc(n)) %>% 
  head()

Why the bump in 1987? Strike season!

What positions exist at what times?

ggplot(data = allpositions) +
  geom_bar(mapping = aes(x = start)) +
  facet_grid(positions ~ .) +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank())

There are a lot of “backs” (wing backs, blocking backs, etc.) in earlier years that are probably not worth including now.

PFR links: positions glossary

Spit out the positions so I can classify them

posfreq <- allpositions %>% 
  count(positions)
write_csv(posfreq, 'data/positions.csv')

Bring in the “simple” positions

simplepos <- read_csv('data/positions-categorized.csv')
## Parsed with column specification:
## cols(
##   positions = col_character(),
##   n = col_integer(),
##   category = col_character()
## )
knitr::kable(simplepos %>% arrange(category, desc(n)))
positions n category
DB 3558 defback
CB 314 defback
S 85 defback
FS 69 defback
SS 62 defback
DE 2098 defline
DT 1668 defline
DL 50 defline
DG 21 defline
LB 3096 linebacker
OLB 146 linebacker
ILB 75 linebacker
MG 25 linebacker
G 2845 offline
T 2712 offline
C 1392 offline
NT 359 offline
OT 116 offline
OG 71 offline
OL 61 offline
QB 1019 quarterback
RB 2077 runback
HB 939 runback
FB 816 runback
TB 278 runback
TE 1258 tightend
WR 2399 widerec
E 1175 NA
B 551 NA
K 380 NA
P 331 NA
WB 281 NA
BB 267 NA
FL 91 NA
SE 68 NA
LS 26 NA
KR 1 NA
PR 1 NA
NA 1 NA

Join the simple positions back to the data

allpositions <- allpositions %>%
  left_join(simplepos %>% filter(!is.na(category)) %>% select(positions, category)) %>% 
  replace_na(list(category = 'other'))
## Joining, by = "positions"

Now chart positions over time

ggplot(data = allpositions) +
  geom_bar(mapping = aes(x = start)) +
  facet_grid(category ~ .) +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank())

Very little of “other” in recent years

Let’s filter down to “retired” players (where active is False, according to PFR)

retired <- allpositions %>% 
  filter(active == "False" & category != 'other') %>% 
  mutate(career = (end - start))

Now calculate raw average for each position in each year and plot it

# lining it up to match WSJ plot
retired$category <- factor(
  x = retired$category,
  levels = c('offline','defback','quarterback','linebacker','defline','tightend','runback','widerec')
)

careers.by.pos <- retired %>% 
  group_by(category, end) %>% 
  summarise(avgcareer = mean(career))

ggplot(data = careers.by.pos) +
  geom_bar(mapping = aes(x = end, y = avgcareer), stat = 'identity') +
  facet_grid(. ~ category) +
  scale_x_continuous(limits = c(2000, 2014))
## Warning: Removed 541 rows containing missing values (position_stack).

There is some noise. Will a moving average smooth it out?

window <- 5 # years

run.careers.by.pos <- retired %>%
  group_by(category, end) %>% 
  summarise(careers = sum(career),
            players = n()) %>% 
  arrange(category, end) %>% 
  group_by(category) %>% 
  mutate(sumcareers = rollsumr(careers, k = window, na.pad = TRUE),
         sumplayers = rollsumr(players, k = window, na.pad = TRUE),
         runavgcareer = sumcareers / sumplayers)

ggplot(data = run.careers.by.pos) +
  geom_bar(mapping = aes(x = end, y = runavgcareer), stat = 'identity') +
  facet_grid(. ~ category) +
  scale_x_continuous(limits = c(2000, 2014))
## Warning: Removed 541 rows containing missing values (position_stack).