First, let's install and load the libraries we'll be using.

library(ggplot2)
library(devtools)
## Warning: package 'devtools' was built under R version 4.3.2
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.3.2
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.3.2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Lahman)
## Warning: package 'Lahman' was built under R version 4.3.2
tail(Teams, 3)
my_teams <- Teams %>%
  filter(yearID>2000) %>%
  select(teamID, yearID, lgID, G, W, L, R, RA)

my_teams %>%
  tail()
my_teams <- my_teams %>%
  mutate(RD = R - RA, Wpct = W / (W + L))

my_teams %>%
  tail()

A scatterplot of run differential against winning percentage gives an indication of the association between the two variables.

run_diff <- ggplot(my_teams, aes(x=RD, y=Wpct)) +
  geom_point() + 
  scale_x_continuous("Run Differential") +
  scale_y_continuous("Win Percentage")

print(run_diff)

linfit <- lm(Wpct ~ RD, data = my_teams)
linfit
## 
## Call:
## lm(formula = Wpct ~ RD, data = my_teams)
## 
## Coefficients:
## (Intercept)           RD  
##   0.4999855    0.0006235
summary(linfit)
## 
## Call:
## lm(formula = Wpct ~ RD, data = my_teams)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.134059 -0.018834 -0.000616  0.016946  0.131880 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.000e-01  1.092e-03  458.04   <2e-16 ***
## RD          6.235e-04  9.685e-06   64.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02804 on 658 degrees of freedom
## Multiple R-squared:  0.863,  Adjusted R-squared:  0.8628 
## F-statistic:  4145 on 1 and 658 DF,  p-value: < 2.2e-16
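As a quick sanity check (this isn't in the book), a team that outscores its opponents by 100 runs is predicted to win a little over 56% of its games, or roughly 91 wins over a 162-game season:

# Predicted winning percentage for a run differential of +100
predict(linfit, newdata = data.frame(RD = 100))
# The same prediction expressed as wins over a 162-game season
162 * predict(linfit, newdata = data.frame(RD = 100))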
# Let's add the fitted values and residuals to the data frame
my_teams <- my_teams %>%
  mutate(fitted_value = linfit$fitted.values, residual = linfit$residuals)
my_teams$is_outlier <- abs(my_teams$residual) > (3 * sd(my_teams$residual))
my_teams
my_teams <- my_teams %>%
  mutate(label = paste(teamID, yearID, "Fitted:", round(fitted_value, 2), sep = " "))

x_min <- min(my_teams$RD)

my_teams$is_positive_outlier <- my_teams$residual > 0 & my_teams$is_outlier & !(my_teams$teamID == "SEA" & my_teams$yearID == 2021)
my_teams$is_negative_outlier <- my_teams$residual <= 0 & my_teams$is_outlier
my_teams$specific_point <- my_teams$teamID == "SEA" & my_teams$yearID == 2021 & my_teams$is_outlier


run_diff +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  geom_segment(data = subset(my_teams, is_outlier),
               aes(x = RD, y = Wpct, xend = RD, yend = fitted_value),
               linetype = "dotted", color = "blue") +
  geom_segment(data = subset(my_teams, is_outlier),
               aes(x = RD, y = fitted_value, xend = x_min, yend = fitted_value),
               linetype = "dotted", color = "blue") +
  geom_text(data = subset(my_teams, is_positive_outlier),
            aes(x = RD, y = Wpct, label = paste(teamID, yearID, "Actual Wpct:", round(Wpct, 3), sep = " ")),
            nudge_x = -0.1, nudge_y = 0.015, color = "darkgreen") +
    geom_text(data = subset(my_teams, specific_point),
            aes(x = RD, y = Wpct, label = paste(teamID, yearID, "Actual Wpct:", round(Wpct, 3), sep = " ")),
            nudge_x = -0.1, nudge_y = 0.08, color = "darkgreen") +
  geom_segment(data = subset(my_teams, specific_point),
               aes(x = RD, y = Wpct, xend = RD - 0.1, yend = Wpct + 0.08),
               color = "darkgreen") +
  geom_text(data = subset(my_teams, is_negative_outlier),
            aes(x = RD, y = Wpct, label = paste(teamID, yearID, "Actual Wpct:", round(Wpct,3), sep = " ")),
            nudge_x = 0.03, nudge_y = -0.03, color = "red") +
  geom_text(data = subset(my_teams, is_outlier),
            aes(x = x_min, y = fitted_value, label = paste(round(fitted_value, 3))),
            hjust = 1, color = "blue", check_overlap = TRUE) +
  annotate("text", x=200, y = 0.3, label = "Blue = Predicted Win Percentage",  color = "blue", size = 3, hjust = 0.1)
## `geom_smooth()` using formula = 'y ~ x'

library(broom)

my_teams_aug <- augment(linfit, data=my_teams)

base_plot<- ggplot(my_teams_aug, aes(x=RD, y = .resid)) +
  geom_point(alpha=0.3) +
  geom_hline(yintercept = 0, linetype = 3) +
  xlab("Run Differential") + ylab("Residual")

highlight_teams <- my_teams_aug %>%
  arrange(desc(abs(.resid))) %>%
  head(4)

base_plot +
  geom_point(data = highlight_teams, color = "blue") +
  geom_text_repel(data = highlight_teams, color = "blue",
                  aes(label = paste(teamID, yearID)))

base_plot

Next, let’s calculate the Root Mean Square Error:

resid_summary <- my_teams_aug %>%
  summarise(N = n(), avg = mean(.resid),
            RMSE = sqrt(mean(.resid^2)))

resid_summary
rmse <- resid_summary %>%
  pull(RMSE)
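To put the RMSE on a more interpretable scale (another aside that isn't in the book), multiplying by 162 converts it to games: the typical error of the linear fit is roughly four to five wins over a full season.

# Typical prediction error expressed in wins over a 162-game season
rmse * 162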

Let's check whether the residuals are approximately normally distributed (a standard linear regression diagnostic). For a normal distribution, about 68% of the residuals should fall within one RMSE of zero and about 95% within two.

my_teams_aug %>%
  summarise(N = n(),
            within_one = sum(abs(.resid)<rmse),
            within_two = sum(abs(.resid)<2*rmse)) %>%
  mutate(within_one_pct = within_one / N,
         within_two_pct = within_two / N)
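For reference, the corresponding benchmarks for a normal distribution are about 68% within one standard deviation and 95% within two, which we can compute directly:

# Normal-distribution benchmarks for comparison: P(|Z| < 1) and P(|Z| < 2)
c(within_one = pnorm(1) - pnorm(-1), within_two = pnorm(2) - pnorm(-2))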

Pythagorean Wins Formula

Bill James' Pythagorean formula estimates a team's winning percentage directly from runs scored and runs allowed: Wpct ≈ R^2 / (R^2 + RA^2).

my_teams <- my_teams %>%
  mutate(Wpct_pyt = R ^ 2 / (R ^ 2 + RA ^ 2),
         residuals_pyt = Wpct - Wpct_pyt)

my_teams
my_teams %>%
  summarise (rmse = sqrt(mean(residuals_pyt^2)))

The exponent in the formula need not be exactly 2. If Wpct = R^k / (R^k + RA^k), then W/L = (R/RA)^k, so log(W/L) = k * log(R/RA). Fitting a no-intercept regression of logWratio on logRratio therefore estimates the exponent k:

my_teams <- my_teams %>%
  mutate(logWratio = log(W/L),
         logRratio = log(R/RA))

pytFit <- lm(logWratio ~ 0 + logRratio, data = my_teams)
pytFit
## 
## Call:
## lm(formula = logWratio ~ 0 + logRratio, data = my_teams)
## 
## Coefficients:
## logRratio  
##     1.833
ggplot(my_teams, aes(logRratio, logWratio)) +
  geom_point() + 
  geom_abline(intercept = 0, slope=pytFit$coefficients[1], color = "red")

Let's recompute the Pythagorean winning percentage using the estimated exponent:

my_teams <- my_teams %>%
  mutate(Wpct_pyt_exp = R ^ 1.833 / (R ^ 1.833 + RA ^ 1.833),
         residuals_pyt_exp = Wpct - Wpct_pyt_exp)

my_teams
my_teams %>%
  summarise (rmse = sqrt(mean(residuals_pyt_exp^2)))

4.4.2 Good and bad predictions by the Pythagorean model

library(readr)
setwd("C:\\Users\\james\\R_Working_Directory\\Analyzing_Baseball_Data_With_R\\baseball_R\\data")
glheaders <- read_csv("game_log_header.csv")
## Rows: 0 Columns: 161
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (161): Date, DoubleHeader, DayOfWeek, VisitingTeam, VisitingTeamLeague, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gl2011 <- read_csv("gl2011.txt", 
                   col_names = names(glheaders),
                   na = character())
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 2429 Columns: 161
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (73): DayOfWeek, VisitingTeam, VisitingTeamLeague, HomeTeam, HomeTeamLea...
## dbl (83): Date, DoubleHeader, VisitingTeamGameNumber, HomeTeamGameNumber, Vi...
## num  (1): CompletionInfo
## lgl  (4): ForfeitInfo, ProtestInfo, UmpireLFID, UmpireRFID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Let's start by creating the filtered data frames

BOS2011 <- gl2011 %>%
  filter(HomeTeam == "BOS" | VisitingTeam == "BOS") %>%
  select(VisitingTeam, HomeTeam, VisitorRunsScored, HomeRunsScore)

head(BOS2011)

I'm going to look at the 2020 Pirates, based on the residual chart above. This is not in the book; the Lahman database doesn't include game logs, so the data needs to be grabbed from Retrosheet: https://www.retrosheet.org/gamelogs/index.html

gl2020 <- read.csv("C:\\Users\\james\\R_Working_Directory\\Analyzing_Baseball_Data_With_R\\baseball_R\\data\\gl2020.txt", 
                   col.names = names(glheaders),
                   na = character(), 
                   fileEncoding = "UTF-8")


PIT2020 <- gl2020 %>%
  filter(HomeTeam == "PIT" | VisitingTeam == "PIT") %>%
  select(VisitingTeam, HomeTeam, VisitorRunsScored, HomeRunsScore)

PIT2020$VisitorRunsScored <- as.double(PIT2020$VisitorRunsScored)
PIT2020$HomeRunsScore <- as.double(PIT2020$HomeRunsScore)

Let’s calculate Run Differentials

BOS2011 <- BOS2011 %>%
  mutate(ScoreDiff = ifelse(HomeTeam == "BOS", 
                            HomeRunsScore - VisitorRunsScored, 
                            VisitorRunsScored - HomeRunsScore), 
         W = ScoreDiff > 0)

PIT2020 <- PIT2020 %>%
  mutate(ScoreDiff = ifelse(HomeTeam == "PIT", 
                            HomeRunsScore - VisitorRunsScored,
                            VisitorRunsScored - HomeRunsScore), 
         W = ScoreDiff > 0)


head(PIT2020)

Next, let's compute summary statistics of the game-level run differentials, grouped by wins and losses. Later we'll verify them with skim() from the skimr package.

Note: if a parsing or encoding error occurs here, it may be necessary to clean the ScoreDiff column in both data frames, either by coercing it to numeric or by using iconv() to convert the text to a different encoding and replace problematic characters. The commented-out lines below show both options.

#BOS2011$ScoreDiff <- as.numeric(as.character(BOS2011$ScoreDiff))
#PIT2020$ScoreDiff <- as.numeric(as.character(PIT2020$ScoreDiff))
str(BOS2011$ScoreDiff)
##  num [1:162] -4 -7 -4 -2 -4 -1 3 -5 4 -11 ...
str(PIT2020$ScoreDiff)
##  num [1:60] -1 -8 4 -1 2 -3 -3 -1 -1 -1 ...
#BOS2011$ScoreDiff <- iconv(BOS2011$ScoreDiff, from = "UTF-8", to = "ASCII", sub = ",")
#PIT2020$ScoreDiff <- iconv(PIT2020$ScoreDiff, from = "UTF-8", to = "ASCII", sub = ",")

PIT_sum <- PIT2020 %>%
  group_by(W) %>%
  summarise(
    mean_run_dif = mean(ScoreDiff, na.rm = TRUE),
    median_run_diff = median(ScoreDiff, na.rm = TRUE),
    sd_run_dif = sd(ScoreDiff, na.rm = TRUE),
    perc_0 = quantile(ScoreDiff, probs = 0, na.rm = TRUE),
    perc_25 = quantile(ScoreDiff, probs = 0.25, na.rm = TRUE),
    perc_50 = quantile(ScoreDiff, probs = 0.50, na.rm = TRUE),
    perc_75 = quantile(ScoreDiff, probs = 0.75, na.rm = TRUE),
    perc_100 = quantile(ScoreDiff, probs = 1, na.rm = TRUE)
  )

BOS_sum <- BOS2011 %>%
  group_by(W) %>%
  summarise(
    mean_run_dif = mean(ScoreDiff, na.rm = TRUE),
    median_run_diff = median(ScoreDiff, na.rm = TRUE),
    sd_run_dif = sd(ScoreDiff, na.rm = TRUE),
    perc_0 = quantile(ScoreDiff, probs = 0, na.rm = TRUE),
    perc_25 = quantile(ScoreDiff, probs = 0.25, na.rm = TRUE),
    perc_50 = quantile(ScoreDiff, probs = 0.50, na.rm = TRUE),
    perc_75 = quantile(ScoreDiff, probs = 0.75, na.rm = TRUE),
    perc_100 = quantile(ScoreDiff, probs = 1, na.rm = TRUE)
  )
BOS_sum
PIT_sum

Boston in 2011 underperformed: their wins came by a significantly larger run differential than their losses did. For Pittsburgh, the win margins show a positive skew, with the mean well above the median, indicating that a few larger wins pulled the mean up; the right skew also appears in the percentiles, with both the 0th and 25th percentiles equal to 1. Pittsburgh lost games by more runs than they won them by, which led to an underperformance. It's also worth noting that the 2020 season had only 60 games; while this sample is large enough to draw some inference, it's much shorter than a typical season, and Pittsburgh didn't win enough games for their win-margin distribution to normalize.

library(skimr)
## Warning: package 'skimr' was built under R version 4.3.2
BOS_skim <- BOS2011 %>%
  group_by(W) %>%
  skim(ScoreDiff)

PIT_skim <- PIT2020 %>%
  group_by(W) %>%
  skim(ScoreDiff)


print(BOS_skim)
## ── Data Summary ────────────────────────
##                            Values    
## Name                       Piped data
## Number of rows             162       
## Number of columns          6         
## _______________________              
## Column type frequency:               
##   numeric                  1         
## ________________________             
## Group variables            W         
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##   skim_variable W     n_missing complete_rate  mean   sd  p0 p25 p50 p75 p100
## 1 ScoreDiff     FALSE         0             1 -3.46 2.56 -11  -4  -3  -1   -1
## 2 ScoreDiff     TRUE          0             1  4.3  3.28   1   2   4   6   14
##   hist 
## 1 ▁▂▂▆▇
## 2 ▇▆▁▁▁
print(PIT_skim, include_summary = FALSE)
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##   skim_variable W     n_missing complete_rate  mean   sd  p0 p25 p50 p75 p100
## 1 ScoreDiff     FALSE         0             1 -3.34 2.55 -11  -5  -3  -1   -1
## 2 ScoreDiff     TRUE          0             1  3.05 2.34   1   1   2   4    8
##   hist 
## 1 ▁▂▂▅▇
## 2 ▇▁▃▁▂

Let's create two new columns recording which team won each game and by how many runs, for both 2011 and the COVID-shortened 2020 season

results_2011 <- gl2011 %>%
  select(VisitingTeam, HomeTeam,
         VisitorRunsScored, HomeRunsScore) %>%
  mutate(winner = ifelse(HomeRunsScore > VisitorRunsScored, HomeTeam, VisitingTeam),
         diff = abs(VisitorRunsScored - HomeRunsScore))

results_2020 <- gl2020 %>%
  select(VisitingTeam, HomeTeam,
         VisitorRunsScored, HomeRunsScore) %>%
  mutate(winner = ifelse(HomeRunsScore > VisitorRunsScored, HomeTeam, VisitingTeam),
         diff = abs(VisitorRunsScored - HomeRunsScore))

Now, let’s look at one run wins

one_run_wins_2011 <- results_2011 %>%
  filter(diff == 1) %>%
  group_by(winner) %>%
  summarise(one_run_w = n())

one_run_wins_2020 <- results_2020 %>%
  filter(diff == 1) %>%
  group_by(winner) %>%
  summarise(one_run_w = n())

Now let’s combine the data with the Lahman database to create a plot

teams2011 <- my_teams %>%
  filter(yearID == 2011) %>%
  mutate(teamID = ifelse(teamID == "LAA", "ANA", as.character(teamID))) %>%
  inner_join(one_run_wins_2011, by = c("teamID" = "winner"))

teams2020 <- my_teams %>%
  filter(yearID == 2020) %>%
  mutate(teamID = ifelse(teamID == "LAA", "ANA", as.character(teamID))) %>%
  inner_join(one_run_wins_2020, by = c("teamID" = "winner"))

Now let's plot the one-run wins against the Pythagorean residuals for each season

ggplot(teams2011, aes(x=one_run_w, y= residuals_pyt)) +
  geom_point() +
  geom_text_repel(aes(label = teamID)) +
  xlab("One run wins") + ylab("Pythagorean residuals")

ggplot(teams2020, aes(x=one_run_w, y= residuals_pyt)) +
  geom_point() +
  geom_text_repel(aes(label = teamID)) +
  xlab("One run wins") + ylab("Pythagorean residuals")

Let's see which teams had top relief pitchers, who may be more likely to preserve one-run leads

top_closers <- Pitching %>%
  filter(GF > 50 & ERA < 2.5) %>%
  select(playerID, yearID, teamID)
head(top_closers)
top_closers_2011 <- top_closers %>%
  filter(yearID == 2011)
# Join the teams with their top closers and summarize the Pythagorean residuals
my_teams %>%
  inner_join(top_closers) %>%
  pull(residuals_pyt) %>%
  summary()
## Joining with `by = join_by(teamID, yearID)`
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.048693 -0.010947  0.002509  0.005416  0.021532  0.081164
wins_over_pyt_expected_with_good_closer <- 0.002509 * 162  # median residual over a 162-game season, about 0.4 extra wins

4.5 How many runs for a win?

Ralph Caola took the partial derivative with respect to runs scored of the formula W = G * R^2 / (R^2 + RA^2). Let's start by doing the same.

IR_per_W <- D(expression(G * R^2 / (R^2 + RA^2)), "R")
IR_per_W
## G * (2 * R)/(R^2 + RA^2) - G * R^2 * (2 * R)/(R^2 + RA^2)^2

Using this derivative, one can compute the incremental runs needed per additional win for various runs scored/runs allowed scenarios. As a first step, let's create a function to calculate incremental runs according to Caola's formula.

IR <- function(RS=5, RA = 5) {
  (RS ^ 2 + RA ^ 2)^2 / (2 * RS * RA ^ 2)
}
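As a quick check of my own (not in the book), the function agrees with the derivative above: for a team scoring and allowing 5 runs per game over a 162-game season, both approaches say about ten additional runs are needed per additional win.

# Numeric check: runs per win from the derivative vs. Caola's closed-form IR()
G <- 162; R <- 5; RA <- 5
dW_dR <- eval(IR_per_W)  # change in season wins per unit increase in runs scored per game
c(from_derivative = G / dW_dR, from_IR = IR(5, 5))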

We’ll use this function to create a table for various runs scored/runs allowed combinations.

ir_table <- expand.grid(RS = seq(3, 6, 0.5),
                        RA = seq(3, 6, 0.5))
head(ir_table)

Finally, we calculate the incremental runs for the various scenarios. The spread() function in the third line of the following code is used to show the results in tabular form.

ir_table_tabular <- ir_table %>%
  mutate(IRW = IR(RS, RA)) %>%
  spread(key = RA, value = IRW, sep = "=") %>%
  round(1)

ir_table_tabular

4.7 Exercises

Exercise 1 (Relationship Between Winning Percentage and Run Differential Across Decades)

Teams %>% filter(yearID >= 1961) %>% 
     mutate(Era = ifelse(yearID <= 1970, "1961-1970",
     ifelse(yearID <= 1980, "1971-1980",
     ifelse(yearID <= 1990, "1981-1990",
     ifelse(yearID <= 2000, "1991-2000",
     ifelse(yearID <= 2010, "2001-2010", 
     ifelse(yearID <= 2020, "2011-2020", "2021-2030")))))),
     WinPct = W / (W + L)) ->
     Eras
one_fit <- function(years){
  lm(WinPct ~ I(R - RA), 
     data = filter(Eras, Era == years))
}

the_eras <- c("1961-1970", "1971-1980", 
              "1981-1990", "1991-2000",
              "2001-2010", "2011-2020",
              "2021-2030")
seven_fits <- lapply(the_eras, one_fit)

names(seven_fits) <- the_eras

sapply(seven_fits, coef)
##                1961-1970    1971-1980    1981-1990   1991-2000    2001-2010
## (Intercept) 0.4999330793 0.4999883663 0.4999447717 0.499999401 0.4999909185
## I(R - RA)   0.0007039746 0.0006375447 0.0007013848 0.000627574 0.0006216137
##                2011-2020    2021-2030
## (Intercept) 0.4999769811 0.5000006390
## I(R - RA)   0.0006345252 0.0005968228

Now let's compare the predicted winning percentage for a team with a run differential of 10 runs (R = 30, RA = 20 in the prediction below) across the decades:

p10 <- function(fit){
  predict(fit, data.frame(R = 30, RA = 20))
}
sapply(seven_fits, p10)
## 1961-1970.1 1971-1980.1 1981-1990.1 1991-2000.1 2001-2010.1 2011-2020.1 
##   0.5069728   0.5063638   0.5069586   0.5062751   0.5062071   0.5063222 
## 2021-2030.1 
##   0.5059689

Exercise 2 (Pythagorean Residuals for Poor and Great Teams in the 19th Century)

As baseball was evolving into its modern form, nineteenth century leagues often featured abysmal teams that did not even succeed in finishing their season, as well as some dominant clubs.

  1. Fit a Pythagorean formula model to the run-differential, win-loss data for teams who played in the 19th century.
library(Lahman)
library(tidyverse)
Teams %>% filter(yearID <= 1900) %>% 
     mutate(WinPct = W / (W + L)) ->
     D_19th
  2. By inspecting the residual plot of your fitted model from (a), did the great and poor teams in the 19th century do better or worse than one would expect on the basis of their run differentials?

Below I construct a graph of the fitted values (horizontal; these are a linear function of R - RA) against the residuals (vertical), coloring each point by winning percentage (bad is WinPct < .3, great is WinPct > .7). We see some great teams with large positive residuals and bad teams with large negative residuals. By exploring further, one can identify the teams with the large residuals.

fit <- lm(WinPct ~ I(R - RA), data = D_19th)
fit
## 
## Call:
## lm(formula = WinPct ~ I(R - RA), data = D_19th)
## 
## Coefficients:
## (Intercept)    I(R - RA)  
##   0.4876483    0.0007367
library(broom)
out <- augment(fit)
# Add a 'type' column based on WinPct values
out <- out %>% 
  mutate(type = ifelse(WinPct > .7, "great",
          ifelse(WinPct < .3, "bad", "other")))

# Plotting with ggplot2
ggplot(out, aes(x = `.fitted`, y = `.resid`, color = type)) + 
  geom_point() +
  geom_hline(yintercept = 0)

Exercise 3 (Exploring the Manager Effect in Baseball)

Retrosheet game logs report, for every game played, the managers of both teams.

  1. Select a period of your choice (encompassing at least ten years) and fit the Pythagorean formula model to the run-differential, win-loss data.

Here I decided to look at the 2010-2024 period (adding four extra years beyond 2020, since we aren't through the current decade yet). I fit the Pythagorean formula and store the output in the variable fit.

library(Lahman)
library(tidyverse)
library(broom)
Teams %>% filter(yearID >= 2010, yearID <= 2024) %>% 
  mutate(WinPct = W / (W + L)) -> d
fit <- lm(WinPct ~ I(R - RA), data = d)
summary(fit)
## 
## Call:
## lm(formula = WinPct ~ I(R - RA), data = d)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.134233 -0.017958  0.000799  0.016304  0.132188 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.000e-01  1.492e-03  335.06   <2e-16 ***
## I(R - RA)   6.213e-04  1.296e-05   47.93   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02947 on 388 degrees of freedom
## Multiple R-squared:  0.8555, Adjusted R-squared:  0.8551 
## F-statistic:  2297 on 1 and 388 DF,  p-value: < 2.2e-16
  2. On the basis of your fit in part (a) and the list of managers, compile a list of the managers who most overperformed their Pythagorean winning percentage and the managers who most underperformed it.

I find the residuals for each team/season. For each manager, I find the number of seasons managed (within this period) and the average standardized residual. I then arrange the resulting data frame by Mean_Residual; the head and tail of this table give the most overperforming and most underperforming managers.

# Augment the team-season data with fitted values, residuals, and standardized residuals
out <- augment(fit, data = select(d, yearID, teamID, R, RA))


# Joining with Manager data
out <- out %>%
  inner_join(Managers, by = c("yearID", "teamID"))
# For each manager, count seasons and compute the mean standardized residual
output <- out %>%
  group_by(playerID) %>%
  summarize(N = n(), Mean_Residual = mean(.std.resid), .groups = 'drop') %>%
  arrange(desc(Mean_Residual))

# Displaying the results
print("Top Overperforming Managers:")
## [1] "Top Overperforming Managers:"
print(head(output))
print("Top Underperforming Managers:")
## [1] "Top Underperforming Managers:"
print(tail(output))
Finally, let's see whether a manager's mean residual is related to how many seasons he managed in this period:

fit2 <- lm(Mean_Residual ~ N, data = output)
summary(fit2)
## 
## Call:
## lm(formula = Mean_Residual ~ N, data = output)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.80851 -0.34557 -0.01774  0.43486  1.65354 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -0.18387    0.07832  -2.348   0.0205 *
## N            0.02612    0.01600   1.633   0.1051  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5574 on 123 degrees of freedom
## Multiple R-squared:  0.02121,    Adjusted R-squared:  0.01325 
## F-statistic: 2.666 on 1 and 123 DF,  p-value: 0.1051
ggplot(output, aes(x=N, y=Mean_Residual)) +
  geom_point() +
  geom_abline(intercept = 0, slope=0, color = "red")

The slope on N is positive but not statistically significant (p ≈ 0.11), so there is at best weak evidence that managers who lasted longer in this period systematically outperformed their Pythagorean expectation.