Similar to an example I put together for the Bill Barnwell A Tale of Two Cities column, here's a quick post showing how to reproduce the Pythagenpat wins tables in this article from the great Football Perpective site, with a couple of plots for fun. Let's get to it:
# read 2013 data
df <- read.csv("http://www.repole.com/sun4cast/stats/nfl2013stats.csv", stringsAsFactors = FALSE)
# create wins column
i <- 1
for (i in i:nrow(df)) {
if (df$ScoreOff[i] > df$ScoreDef[i]) {
df$Wins[i] <- 1
} else if (df$ScoreOff[i] == df$ScoreDef[i]) {
df$Wins[i] <- 0.5
} else {
df$Wins[i] <- 0
}
}
# create points for (PF) and points allowed (PA) table
library(dplyr)
df <- group_by(df, TeamName)
df <- summarise(df, PF = sum(ScoreOff), PA = sum(ScoreDef), Wins = sum(Wins))
# reorder data frame and remove old row name values
df <- df[order(df$TeamName), ]
row.names(df) <- NULL
# find best fit exponent for each team
df$bfe <- ((df$PF + df$PA)/16)^0.251
head(df[df$TeamName == "Seattle Seahawks", ])
## Source: local data frame [1 x 5]
##
## TeamName PF PA Wins bfe
## 28 Seattle Seahawks 417 231 13 2.532
# calculate pythagenpat winning % and wins
df$pypat_pct <- (df$PF^df$bfe)/((df$PF^df$bfe) + (df$PA^df$bfe))
df$pypat_wins <- round(df$pypat_pct * 16, 1)
head(df[df$TeamName == "Seattle Seahawks", ])
## Source: local data frame [1 x 7]
##
## TeamName PF PA Wins bfe pypat_pct pypat_wins
## 28 Seattle Seahawks 417 231 13 2.532 0.8169 13.1
# calculate difference between actual wins and pythagenpat wins
df$diff <- df$Wins - df$pypat_wins
# sort data frame descending by pythagenpat wins
df <- df[order(-df$pypat_wins), ]
row.names(df) <- NULL
# save data frame for prediction later
df2013 <- df
# show data frame
df[, c("TeamName", "PF", "PA", "Wins", "pypat_wins", "diff")]
## Source: local data frame [32 x 6]
##
## TeamName PF PA Wins pypat_wins diff
## 1 Seattle Seahawks 417 231 13.0 13.1 -0.1
## 2 Denver Broncos 606 399 13.0 12.2 0.8
## 3 Carolina Panthers 366 241 12.0 11.8 0.2
## 4 San Francisco 49ers 406 272 12.0 11.8 0.2
## 5 Cincinnati Bengals 430 305 11.0 11.4 -0.4
## 6 Kansas City Chiefs 430 305 11.0 11.4 -0.4
## 7 New Orleans Saints 414 304 11.0 11.0 0.0
## 8 New England Patriots 444 338 12.0 10.8 1.2
## 9 Arizona Cardinals 379 324 10.0 9.6 0.4
## 10 Indianapolis Colts 391 336 11.0 9.6 1.4
## 11 Philadelphia Eagles 442 382 10.0 9.5 0.5
## 12 San Diego Chargers 396 348 9.0 9.3 -0.3
## 13 Detroit Lions 395 376 7.0 8.5 -1.5
## 14 Pittsburgh Steelers 379 370 8.0 8.3 -0.3
## 15 Dallas Cowboys 439 432 8.0 8.2 -0.2
## 16 Green Bay Packers 417 428 8.5 7.7 0.8
## 17 St Louis Rams 348 364 7.0 7.5 -0.5
## 18 Tennessee Titans 362 381 7.0 7.5 -0.5
## 19 Miami Dolphins 317 335 8.0 7.4 0.6
## 20 Chicago Bears 445 478 8.0 7.2 0.8
## 21 Baltimore Ravens 320 352 8.0 7.0 1.0
## 22 Buffalo Bills 339 388 6.0 6.6 -0.6
## 23 Minnesota Vikings 391 480 5.5 5.8 -0.3
## 24 Atlanta Falcons 353 443 4.0 5.6 -1.6
## 25 New York Giants 294 383 7.0 5.4 1.6
## 26 Cleveland Browns 308 406 4.0 5.2 -1.2
## 27 New York Jets 290 387 8.0 5.2 2.8
## 28 Tampa Bay Buccaneers 288 389 4.0 5.1 -1.1
## 29 Oakland Raiders 322 453 4.0 4.6 -0.6
## 30 Washington Redskins 334 478 3.0 4.4 -1.4
## 31 Houston Texans 276 428 2.0 3.9 -1.9
## 32 Jacksonville Jaguars 247 449 4.0 2.8 1.2
Plot of each team with the difference between their Actual and Pythagenpat wins in 2013.
# plot
library(ggplot2)
ggplot(df, aes(x = factor(TeamName), y = diff)) + geom_point(size = 3) + coord_flip() +
geom_hline(x = 0, linetype = "dashed") + theme(axis.title.y = element_blank(),
plot.title = element_text(size = 13)) + ylab("(Actual Wins) - (Pythagenpat Wins)") +
ggtitle("NFL Actual Wins minus Pythagenpat Wins by Team, 2013")
Next piece is using 2013 Pythagenpat wins to predict 2014 Actual wins using a regression formula incorporating data from 1990 - 2013 (excluding some expansion years). First step here is grabbing this zip file and extracting the contents into a folder called nfllines in your working directory. This will give you a nice group of .xml and .csv files for scores and lines for each game back to 1978 and through 2012. Note we'll grab the 2013 data straight from a URL since 2013 isn't included in the .zip file.
## import files make sure you've saved the 1978 - 2012 files in folder called
## nfllines under your current working directory
# add 2013 file to nfllines folder (not included in .zip file) for inclusion
# in for loop
write.table(read.csv("http://www.repole.com/sun4cast/stats/nfl2013lines.csv"),
file = paste0(getwd(), "/nfllines/nfl2013lines.csv"), row.names = FALSE,
quote = FALSE, sep = ",")
# create vector of csv file names and years for each file
scores_list <- grep(".csv", list.files((paste0(getwd(), "/nfllines"))), value = TRUE)
# set years to use, 1990 - 2012, excluding 1994, 1998 and 2001
years <- c(1990:1993, 1995:1997, 1999, 2000, 2002:2013)
# loop through each file name and read csv file
i <- 1
for (i in i:length(years)) {
df <- read.csv(paste0(getwd(), "/nfllines/", grep(years[i], scores_list,
value = TRUE)), stringsAsFactors = FALSE)
# each file has one row for each game, split into home/visitor data frames
# then combine into one for easier manipulation
df1 <- select(df, Visitor, Visitor.Score, Home.Score)
df2 <- select(df, Home.Team, Home.Score, Visitor.Score)
# change column names for rbind
names(df1) <- c("Team", "PF", "PA")
names(df2) <- c("Team", "PF", "PA")
# combine data frames
df <- rbind_list(df1, df2)
# count number of games for season, generally 16 games but 1982 strike
# shortened season and older 14 game schedules (pre-1978) need to be
# accounted for
num_games <- df %.% group_by(Team) %.% tally()
# use game count for first team since all should be equal
num_games <- num_games[1, 2]
# create wins column
ii <- 1
for (ii in ii:nrow(df)) {
if (df$PF[ii] > df$PA[ii]) {
df$Wins[ii] <- 1
} else if (df$PF[ii] == df$PA[ii]) {
df$Wins[ii] <- 0.5
} else {
df$Wins[ii] <- 0
}
}
# sum total points for (PF), points against (PA) and Wins by Team
df$Team <- as.factor(df$Team)
df <- summarise(group_by(df, Team), PF = sum(PF), PA = sum(PA), Wins = sum(Wins))
# find best fit exponent for each team
df$bfe <- ((df$PF + df$PA)/num_games)^0.251
# calculate pythagenpat winning % and wins
df$pypat_pct <- (df$PF^df$bfe)/((df$PF^df$bfe) + (df$PA^df$bfe))
df$pypat_wins <- round(df$pypat_pct * num_games, 1)
# set year
df$Year <- years[i]
# change Team back to character
df$Team <- as.character(df$Team)
# remove unneeded coumns
df <- subset(df, select = -c(pypat_pct, bfe, PF, PA))
# create or append to data frames
if (i == 1) {
df_f <- df
} else {
df_f <- rbind_list(df_f, df)
}
}
# rename and clean up
df <- df_f
rm(df_f, df1, df2)
# reorder data frame
df <- df[order(df$Year, df$Team), ]
row.names(df) <- NULL
# add columns so each row has values for Year N (Year.x) and Year N+1
# (Year.y)
i <- 1
for (i in i:length(years)) {
n <- df[df$Year == years[i], ]
n1 <- df[df$Year == years[i + 1], ]
dft <- left_join(n, n1, by = c("Team"))
if (i == 1) {
df_f <- dft
} else {
df_f <- rbind_list(df_f, dft)
}
}
# rename and clean up
df <- df_f
rm(df_f, dft, n, n1)
# remove rows with two year gap due to years removed, also 2013 since no
# 2014
df <- df[!(df$Year.x %in% c(1993, 1997, 2000, 2013)), ]
row.names(df) <- NULL
head(df)
## Team Wins.x pypat_wins.x Year.x Wins.y Year.y pypat_wins.y
## 1 Atlanta Falcons 5 7.5 1990 10 1991 8.7
## 2 Buffalo Bills 13 12.4 1990 13 1991 11.6
## 3 Chicago Bears 11 10.1 1990 11 1991 9.0
## 4 Cincinnati Bengals 9 8.2 1990 3 1991 3.4
## 5 Cleveland Browns 3 2.2 1990 6 1991 7.8
## 6 Dallas Cowboys 7 5.8 1990 11 1991 9.0
Since the analysis excluded the years 1994, 1998 and 2001 we've removed years N-1 (1993, 1997 and 2000) as Year N rows from the regression model as well since there's no N+1 value for those years (the data for those years will still be used as N+1 for 1992, 1996 and 1999). Two other rows will be removed from the data set:
# show cases with NA values
df[!complete.cases(df), ]
## Team Wins.x pypat_wins.x Year.x Wins.y Year.y pypat_wins.y
## 91 Cleveland Browns 5 5.9 1995 NA NA NA
## 126 Houston Oilers 8 8.8 1996 NA NA NA
The N+1 values for these could be the 1996 Baltimore Ravens and 1997 Tenessee Oilers (?), but given the city change I treated them as new teams and excluded them. Note this could be part of the slight differences in results from this example and the column.
# remove rows with NAs
df <- na.omit(df)
Now calculate the regression coefficients:
# calculate best-fit formula using actual wins
act_wins_lm <- lm(Wins.y ~ Wins.x, data = df)
act_wins_lm
##
## Call:
## lm(formula = Wins.y ~ Wins.x, data = df)
##
## Coefficients:
## (Intercept) Wins.x
## 5.472 0.317
# calculate best-fit formula using pypat wins
pypat_wins_lm <- lm(Wins.y ~ pypat_wins.x, data = df)
pypat_wins_lm
##
## Call:
## lm(formula = Wins.y ~ pypat_wins.x, data = df)
##
## Coefficients:
## (Intercept) pypat_wins.x
## 4.81 0.40
Now predict 2014 wins using both the Actual wins best-fit formula and Pythagenpat wins best-fit formula:
# predict 2014 wins using each model
newdata <- data.frame(Wins.x = df2013$Wins)
df2013$Proj_2014_Act <- round(predict(act_wins_lm, newdata), 1)
newdata <- data.frame(pypat_wins.x = df2013$pypat_wins)
df2013$Proj_2014_Pypat <- round(predict(pypat_wins_lm, newdata), 1)
# show results
df2013[, c("TeamName", "Wins", "pypat_wins", "Proj_2014_Act", "Proj_2014_Pypat")]
## Source: local data frame [32 x 5]
##
## TeamName Wins pypat_wins Proj_2014_Act Proj_2014_Pypat
## 1 Seattle Seahawks 13.0 13.1 9.6 10.1
## 2 Denver Broncos 13.0 12.2 9.6 9.7
## 3 Carolina Panthers 12.0 11.8 9.3 9.5
## 4 San Francisco 49ers 12.0 11.8 9.3 9.5
## 5 Cincinnati Bengals 11.0 11.4 9.0 9.4
## 6 Kansas City Chiefs 11.0 11.4 9.0 9.4
## 7 New Orleans Saints 11.0 11.0 9.0 9.2
## 8 New England Patriots 12.0 10.8 9.3 9.1
## 9 Arizona Cardinals 10.0 9.6 8.6 8.7
## 10 Indianapolis Colts 11.0 9.6 9.0 8.7
## 11 Philadelphia Eagles 10.0 9.5 8.6 8.6
## 12 San Diego Chargers 9.0 9.3 8.3 8.5
## 13 Detroit Lions 7.0 8.5 7.7 8.2
## 14 Pittsburgh Steelers 8.0 8.3 8.0 8.1
## 15 Dallas Cowboys 8.0 8.2 8.0 8.1
## 16 Green Bay Packers 8.5 7.7 8.2 7.9
## 17 St Louis Rams 7.0 7.5 7.7 7.8
## 18 Tennessee Titans 7.0 7.5 7.7 7.8
## 19 Miami Dolphins 8.0 7.4 8.0 7.8
## 20 Chicago Bears 8.0 7.2 8.0 7.7
## 21 Baltimore Ravens 8.0 7.0 8.0 7.6
## 22 Buffalo Bills 6.0 6.6 7.4 7.5
## 23 Minnesota Vikings 5.5 5.8 7.2 7.1
## 24 Atlanta Falcons 4.0 5.6 6.7 7.1
## 25 New York Giants 7.0 5.4 7.7 7.0
## 26 Cleveland Browns 4.0 5.2 6.7 6.9
## 27 New York Jets 8.0 5.2 8.0 6.9
## 28 Tampa Bay Buccaneers 4.0 5.1 6.7 6.9
## 29 Oakland Raiders 4.0 4.6 6.7 6.7
## 30 Washington Redskins 3.0 4.4 6.4 6.6
## 31 Houston Texans 2.0 3.9 6.1 6.4
## 32 Jacksonville Jaguars 4.0 2.8 6.7 5.9
Very similar results to the Football Perspective column, off by a tenth of a win for a few teams, I imagine it's due to slightly different data set and maybe some rounding discrepancies.
Last let's plot actual wins for Year N (Wins.x) and Year N+1 (Wins.y) and show the lines specified by the slope and intercept of the best-fit formulas. The blue line represents the formula based on actual wins, and the red line is based on the Pythagenpat wins:
# plot regression lines
ggplot(df, aes(x = Wins.x, y = Wins.y)) + geom_jitter(alpha = 7/10, size = 3) +
geom_abline(intercept = coef(act_wins_lm)[1], slope = coef(act_wins_lm)[2],
colour = "#6666FF") + geom_abline(intercept = coef(pypat_wins_lm)[1],
slope = coef(pypat_wins_lm)[2], colour = "#F8766D") + xlab("Year N Wins (Wins.x)") +
ylab("Year N + 1 Wins (Wins.y)") + ggtitle("Wins in Year N and N+1, 1990 - 2013, excl 1994, 1998 and 2001")