First, let's install and load the libraries we'll be using.
library(ggplot2)
library(devtools)
## Warning: package 'devtools' was built under R version 4.3.2
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.3.2
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.3.2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Lahman)
## Warning: package 'Lahman' was built under R version 4.3.2
tail(Teams, 3)
my_teams <- Teams %>%
filter(yearID>2000) %>%
select(teamID, yearID, lgID, G, W, L, R, RA)
my_teams %>%
tail()
my_teams <- my_teams %>%
mutate( RD =R - RA, Wpct = W/(W+L))
my_teams %>%
tail()
A scatterplot of run differential against winning percentage gives an indication of the association between the two variables.
run_diff <- ggplot(my_teams, aes(x=RD, y=Wpct)) +
geom_point() +
scale_x_continuous("Run Differential") +
scale_y_continuous("Win Percentage")
print(run_diff)
linfit <- lm(Wpct ~ RD, data = my_teams)
linfit
##
## Call:
## lm(formula = Wpct ~ RD, data = my_teams)
##
## Coefficients:
## (Intercept) RD
## 0.4999855 0.0006235
summary(linfit)
##
## Call:
## lm(formula = Wpct ~ RD, data = my_teams)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.134059 -0.018834 -0.000616 0.016946 0.131880
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.000e-01 1.092e-03 458.04 <2e-16 ***
## RD 6.235e-04 9.685e-06 64.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02804 on 658 degrees of freedom
## Multiple R-squared: 0.863, Adjusted R-squared: 0.8628
## F-statistic: 4145 on 1 and 658 DF, p-value: < 2.2e-16
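As a quick sanity check on what the slope means (my own addition, not in the book), here's a minimal sketch that converts the fitted slope into wins. It reuses the linfit object from above; the +100 run differential is just an illustrative value, and it treats W + L as a full 162-game season.
slope <- coef(linfit)[["RD"]]   # about 0.00062 Wpct per run of differential
slope * 100 * 162               # a +100 run differential is worth roughly 10 extra wins
1 / (slope * 162)               # equivalently, about 10 runs buy one extra win (see section 4.5)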
# Let's add the fitted values and residuals to the data frame
my_teams <- my_teams %>%
mutate(fitted_value = linfit$fitted.values, residual = linfit$residuals)
my_teams$is_outlier <- abs(my_teams$residual) > (3 * sd(my_teams$residual))
my_teams
my_teams <- my_teams %>%
mutate(label = paste(teamID, yearID, "Fitted:", round(fitted_value, 2), sep = " "))
x_min <- min(my_teams$RD)
my_teams$is_positive_outlier <- my_teams$residual > 0 & my_teams$is_outlier & !(my_teams$teamID == "SEA" & my_teams$yearID == 2021)
my_teams$is_negative_outlier <- my_teams$residual <= 0 & my_teams$is_outlier
my_teams$specific_point <- my_teams$teamID == "SEA" & my_teams$yearID == 2021 & my_teams$is_outlier
run_diff +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
geom_segment(data = subset(my_teams, is_outlier),
aes(x = RD, y = Wpct, xend = RD, yend = fitted_value),
linetype = "dotted", color = "blue") +
geom_segment(data = subset(my_teams, is_outlier),
aes(x = RD, y = fitted_value, xend = x_min, yend = fitted_value),
linetype = "dotted", color = "blue") +
geom_text(data = subset(my_teams, is_positive_outlier),
aes(x = RD, y = Wpct, label = paste(teamID, yearID, "Actual Wpct:", round(Wpct, 3), sep = " ")),
nudge_x = -0.1, nudge_y = 0.015, color = "darkgreen") +
geom_text(data = subset(my_teams, specific_point),
aes(x = RD, y = Wpct, label = paste(teamID, yearID, "Actual Wpct:", round(Wpct, 3), sep = " ")),
nudge_x = -0.1, nudge_y = 0.08, color = "darkgreen") +
geom_segment(data = subset(my_teams, specific_point),
aes(x = RD, y = Wpct, xend = RD - 0.1, yend = Wpct + 0.08),
color = "darkgreen") +
geom_text(data = subset(my_teams, is_negative_outlier),
aes(x = RD, y = Wpct, label = paste(teamID, yearID, "Actual WPct:", round(Wpct,3), sep = " ")),
nudge_x = 0.03, nudge_y = -0.03, color = "red") +
geom_text(data = subset(my_teams, is_outlier),
aes(x = x_min, y = fitted_value, label = paste(round(fitted_value, 3))),
hjust = 1, color = "blue", check_overlap = TRUE) +
annotate("text", x=200, y = 0.3, label = "Blue = Predicted Win Percentage", color = "blue", size = 3, hjust = 0.1)
## `geom_smooth()` using formula = 'y ~ x'
library(broom)
my_teams_aug <- augment(linfit, data=my_teams)
base_plot<- ggplot(my_teams_aug, aes(x=RD, y = .resid)) +
geom_point(alpha=0.3) +
geom_hline(yintercept = 0, linetype = 3) +
xlab("Run Differential") + ylab("Residual")
highlight_teams <- my_teams_aug %>%
arrange(desc(abs(.resid))) %>%
head(4)
base_plot +
geom_point(data = highlight_teams, color = "blue") +
geom_text_repel(data = highlight_teams, color = "blue",
aes(label = paste(teamID, yearID)))
base_plot
Next, let’s calculate the Root Mean Square Error:
resid_summary <- my_teams_aug %>%
summarise(N = n(), avg = mean(.resid),
RMSE = sqrt(mean(.resid^2)))
resid_summary
rmse <- resid_summary %>%
pull(RMSE)
Let’s see if the errors are normally distributed (Linear Regression Diagnostics)
my_teams_aug %>%
summarise(N = n(),
within_one = sum(abs(.resid)<rmse),
within_two = sum(abs(.resid)<2*rmse)) %>%
mutate(within_one_pct = within_one / N,
within_two_pct = within_two / N)
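The empirical-rule check above looks consistent with roughly normal errors. For a visual confirmation (not in the book), here's a minimal sketch of a normal quantile-quantile plot of the residuals, reusing the my_teams_aug data frame from above; points falling near the line suggest approximate normality.
ggplot(my_teams_aug, aes(sample = .resid)) +
  stat_qq() +
  stat_qq_line() +
  xlab("Theoretical quantiles") + ylab("Residuals")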
Pythagorean Wins Formula
my_teams <- my_teams %>%
mutate(Wpct_pyt = R ^ 2 / (R ^ 2 + RA ^ 2),
residuals_pyt = Wpct - Wpct_pyt)
my_teams
my_teams %>%
summarise (rmse = sqrt(mean(residuals_pyt^2)))
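To see how the formula behaves, here's a small worked example with made-up numbers: a hypothetical team that scores 750 runs and allows 700 over a 162-game season.
R_ex <- 750    # hypothetical runs scored
RA_ex <- 700   # hypothetical runs allowed
R_ex^2 / (R_ex^2 + RA_ex^2)               # Pythagorean Wpct of about 0.534
round(162 * R_ex^2 / (R_ex^2 + RA_ex^2))  # about 87 expected wins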
To estimate the exponent in the formula, note that W/L = (R/RA)^k implies log(W/L) = k * log(R/RA), so we can fit a zero-intercept regression of the log win ratio on the log run ratio:
my_teams <- my_teams %>%
mutate(logWratio = log(W/L),
logRratio = log(R/RA))
pytFit <- lm(logWratio ~ 0 + logRratio, data = my_teams)
pytFit
##
## Call:
## lm(formula = logWratio ~ 0 + logRratio, data = my_teams)
##
## Coefficients:
## logRratio
## 1.833
ggplot(my_teams, aes(logRratio, logWratio)) +
geom_point() +
geom_abline(intercept = 0, slope=pytFit$coefficients[1], color = "red")
Let's use the newly estimated exponent of 1.833:
my_teams <- my_teams %>%
mutate(Wpct_pyt_exp = R ^ 1.833 / (R ^ 1.833 + RA ^ 1.833),
residuals_pyt_exp = Wpct - Wpct_pyt_exp)
my_teams
my_teams %>%
summarise (rmse = sqrt(mean(residuals_pyt_exp^2)))
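As a recap (my own addition, not in the book), we can gather the three RMSE values computed so far side by side, reusing the residual columns already in my_teams; the refitted exponent typically comes out a little smaller than the exponent-2 version.
my_teams %>%
  summarise(rmse_linear = sqrt(mean(residual^2)),
            rmse_pyt_exp2 = sqrt(mean(residuals_pyt^2)),
            rmse_pyt_refit = sqrt(mean(residuals_pyt_exp^2)))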
4.4.2 Good and bad predictions by the Pythagorean model
library(readr)
setwd("C:\\Users\\james\\R_Working_Directory\\Analyzing_Baseball_Data_With_R\\baseball_R\\data")
glheaders <- read_csv("game_log_header.csv")
## Rows: 0 Columns: 161
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (161): Date, DoubleHeader, DayOfWeek, VisitingTeam, VisitingTeamLeague, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gl2011 <- read_csv("gl2011.txt",
col_names = names(glheaders),
na = character())
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 2429 Columns: 161
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (73): DayOfWeek, VisitingTeam, VisitingTeamLeague, HomeTeam, HomeTeamLea...
## dbl (83): Date, DoubleHeader, VisitingTeamGameNumber, HomeTeamGameNumber, Vi...
## num (1): CompletionInfo
## lgl (4): ForfeitInfo, ProtestInfo, UmpireLFID, UmpireRFID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Let’s start creating the filtered dataframes
BOS2011 <- gl2011 %>%
filter(HomeTeam == "BOS" | VisitingTeam == "BOS") %>%
select(VisitingTeam, HomeTeam, VisitorRunsScored, HomeRunsScore)
head(BOS2011)
I'm going to look at the 2020 Pirates, based on the residual chart above. This is not in the book, so I'll need to grab the game logs from https://www.retrosheet.org/gamelogs/index.html; the Lahman data doesn't include game logs, so they have to come from Retrosheet.
gl2020 <- read.csv("C:\\Users\\james\\R_Working_Directory\\Analyzing_Baseball_Data_With_R\\baseball_R\\data\\gl2020.txt",
col.names = names(glheaders),
na = character(),
fileEncoding = "UTF-8")
PIT2020 <- gl2020 %>%
filter(HomeTeam == "PIT" | VisitingTeam == "PIT") %>%
select(VisitingTeam, HomeTeam, VisitorRunsScored, HomeRunsScore)
PIT2020$VisitorRunsScored <- as.double(PIT2020$VisitorRunsScored)
PIT2020$HomeRunsScore <- as.double(PIT2020$HomeRunsScore)
Let’s calculate Run Differentials
BOS2011 <- BOS2011 %>%
mutate(ScoreDiff = ifelse(HomeTeam == "BOS",
HomeRunsScore - VisitorRunsScored,
VisitorRunsScored - HomeRunsScore),
W = ScoreDiff > 0)
PIT2020 <- PIT2020 %>%
mutate(ScoreDiff = ifelse(HomeTeam == "PIT",
HomeRunsScore - VisitorRunsScored,
VisitorRunsScored - HomeRunsScore),
W = ScoreDiff > 0)
head(PIT2020)
Next, we'll compute summary statistics of the run differentials, first by hand with summarise() and then with skim() from the skimr package. If a parsing or encoding error turns up along the way, it may be necessary to clean the ScoreDiff column in both data frames, either by coercing it to numeric or by using iconv() to replace problematic characters; both fixes are kept below, commented out, in case they're needed.
#BOS2011$ScoreDiff <- as.numeric(as.character(BOS2011$ScoreDiff))
#PIT2020$ScoreDiff <- as.numeric(as.character(PIT2020$ScoreDiff))
str(BOS2011$ScoreDiff)
## num [1:162] -4 -7 -4 -2 -4 -1 3 -5 4 -11 ...
str(PIT2020$ScoreDiff)
## num [1:60] -1 -8 4 -1 2 -3 -3 -1 -1 -1 ...
#BOS2011$ScoreDiff <- iconv(BOS2011$ScoreDiff, from = "UTF-8", to = "ASCII", sub = ",")
#PIT2020$ScoreDiff <- iconv(PIT2020$ScoreDiff, from = "UTF-8", to = "ASCII", sub = ",")
PIT_sum <- PIT2020 %>%
group_by(W) %>%
summarise(
mean_run_dif = mean(ScoreDiff, na.rm = TRUE),
median_run_diff = median(ScoreDiff, na.rm = TRUE),
sd_run_dif = sd(ScoreDiff, na.rm = TRUE),
perc_0 = quantile(ScoreDiff, probs = 0, na.rm = TRUE),
perc_25 = quantile(ScoreDiff, probs = 0.25, na.rm = TRUE),
perc_50 = quantile(ScoreDiff, probs = 0.50, na.rm = TRUE),
perc_75 = quantile(ScoreDiff, probs = 0.75, na.rm = TRUE),
perc_100 = quantile(ScoreDiff, probs = 1, na.rm = TRUE)
)
BOS_sum <- BOS2011 %>%
group_by(W) %>%
summarise(
mean_run_dif = mean(ScoreDiff, na.rm = TRUE),
median_run_diff = median(ScoreDiff, na.rm = TRUE),
sd_run_dif = sd(ScoreDiff, na.rm = TRUE),
perc_0 = quantile(ScoreDiff, probs = 0, na.rm = TRUE),
perc_25 = quantile(ScoreDiff, probs = 0.25, na.rm = TRUE),
perc_50 = quantile(ScoreDiff, probs = 0.50, na.rm = TRUE),
perc_75 = quantile(ScoreDiff, probs = 0.75, na.rm = TRUE),
perc_100 = quantile(ScoreDiff, probs = 1, na.rm = TRUE)
)
BOS_sum
PIT_sum
Boston in 2011 underperformed because their wins had a significantly bigger run differential than their losses. For Pittsburgh, you see a positive skew to the data, where the mean is significantly higher than the median; this indicates that a few larger wins pulled the mean up. You can also see the right skew in the percentiles, with both the 0th and 25th percentiles being 1. Pittsburgh lost games by a lot more runs than they won games by, which led to an underperformance. It's also worth noting that the 2020 season only had 60 games; while this sample is large enough to draw inference, it's not as long as a typical season, and Pittsburgh didn't win enough games for their winning-margin distribution to normalize.
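To put a number on the skew claim (my own addition), here's a minimal sketch that computes a simple moment-based skewness of the winning margins using base R only; the small helper function is ad hoc, and positive values indicate right skew.
skewness <- function(x) mean((x - mean(x))^3) / sd(x)^3   # rough moment skewness, ad hoc helper
PIT2020 %>% filter(W) %>% summarise(skew_win_margin = skewness(ScoreDiff))
BOS2011 %>% filter(W) %>% summarise(skew_win_margin = skewness(ScoreDiff))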
library(skimr)
## Warning: package 'skimr' was built under R version 4.3.2
BOS_skim <- BOS2011 %>%
group_by(W) %>%
skim(ScoreDiff)
PIT_skim <- PIT2020 %>%
group_by(W) %>%
skim(ScoreDiff)
print(BOS_skim)
## ── Data Summary ────────────────────────
## Values
## Name Piped data
## Number of rows 162
## Number of columns 6
## _______________________
## Column type frequency:
## numeric 1
## ________________________
## Group variables W
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable W n_missing complete_rate mean sd p0 p25 p50 p75 p100
## 1 ScoreDiff FALSE 0 1 -3.46 2.56 -11 -4 -3 -1 -1
## 2 ScoreDiff TRUE 0 1 4.3 3.28 1 2 4 6 14
## hist
## 1 ▁▂▂▆▇
## 2 ▇▆▁▁▁
print(PIT_skim, include_summary = FALSE)
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable W n_missing complete_rate mean sd p0 p25 p50 p75 p100
## 1 ScoreDiff FALSE 0 1 -3.34 2.55 -11 -5 -3 -1 -1
## 2 ScoreDiff TRUE 0 1 3.05 2.34 1 1 2 4 8
## hist
## 1 ▁▂▂▅▇
## 2 ▇▁▃▁▂
Let's create two new columns to look at which team won and by how much, for both 2011 and the COVID-shortened 2020 season.
results_2011 <- gl2011 %>%
select(VisitingTeam, HomeTeam,
VisitorRunsScored, HomeRunsScore) %>%
mutate(winner = ifelse(HomeRunsScore > VisitorRunsScored, HomeTeam, VisitingTeam),
diff = abs(VisitorRunsScored - HomeRunsScore))
results_2020 <- gl2020 %>%
select(VisitingTeam, HomeTeam,
VisitorRunsScored, HomeRunsScore) %>%
mutate(winner = ifelse(HomeRunsScore > VisitorRunsScored, HomeTeam, VisitingTeam),
diff = abs(VisitorRunsScored - HomeRunsScore))
Now, let's look at one-run wins.
one_run_wins_2011 <- results_2011 %>%
filter(diff == 1) %>%
group_by(winner) %>%
summarise(one_run_w = n())
one_run_wins_2020 <- results_2020 %>%
filter(diff == 1) %>%
group_by(winner) %>%
summarise(one_run_w = n())
Now let’s combine the data with the Lahman database to create a plot
teams2011 <- my_teams %>%
  filter(yearID == 2011) %>%
  mutate(teamID = ifelse(teamID == "LAA", "ANA", as.character(teamID))) %>%
  inner_join(one_run_wins_2011, by = c("teamID" = "winner"))
teams2020 <- my_teams %>%
  filter(yearID == 2020) %>%
  mutate(teamID = ifelse(teamID == "LAA", "ANA", as.character(teamID))) %>%
  inner_join(one_run_wins_2020, by = c("teamID" = "winner"))
Now that the data are combined, let's plot one-run wins against the Pythagorean residuals for each season.
ggplot(teams2011, aes(x=one_run_w, y= residuals_pyt)) +
geom_point() +
geom_text_repel(aes(label = teamID)) +
xlab("One run wins") + ylab("Pythagorean residuals")
ggplot(teams2020, aes(x=one_run_w, y= residuals_pyt)) +
geom_point() +
geom_text_repel(aes(label = teamID)) +
xlab("One run wins") + ylab("Pythagorean residuals")
Let's see which teams have top relief pitchers, who may be more likely to preserve one-run leads.
top_closers <- Pitching %>%
filter(GF > 50 & ERA < 2.5) %>%
select(playerID, yearID, teamID)
head(top_closers)
top_closers_2011 <- top_closers %>%
filter(yearID == 2011)
#Create a dataframe of only top closers
my_teams %>%
inner_join(top_closers) %>%
pull(residuals_pyt) %>%
summary()
## Joining with `by = join_by(teamID, yearID)`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.048693 -0.010947 0.002509 0.005416 0.021532 0.081164
wins_over_pyt_expected_with_good_closer <- 0.002509 * 162  # median residual times a 162-game season: roughly 0.4 extra wins
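To push on the closer idea a little further (not in the book), here's a minimal sketch contrasting the Pythagorean residuals of teams that had a qualifying closer with those that didn't, using semi_join()/anti_join() against the top_closers table built above.
with_closer <- my_teams %>% semi_join(top_closers, by = c("teamID", "yearID"))
without_closer <- my_teams %>% anti_join(top_closers, by = c("teamID", "yearID"))
c(with_closer = median(with_closer$residuals_pyt),
  without_closer = median(without_closer$residuals_pyt))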
Chapter 4.5 How many runs for a win? Ralph Caola took the partial derivative with respect to runs of the following formula: W = G * R^2 / (R^2 + RA^2). Let's start by doing the same.
IR_per_W <- D(expression(G * R^2 / (R^2 + RA^2)), "R")
IR_per_W
## G * (2 * R)/(R^2 + RA^2) - G * R^2 * (2 * R)/(R^2 + RA^2)^2
The incremental runs needed for one additional win are the reciprocal of this derivative; simplifying, and expressing runs scored and runs allowed on a per-game basis, gives Caola's formula. Using it, one can compute the incremental runs needed per win for various runs scored/runs allowed scenarios. As a first step, let's create a function to calculate incremental runs according to Caola's formula.
IR <- function(RS=5, RA = 5) {
(RS ^ 2 + RA ^ 2)^2 / (2 * RS * RA ^ 2)
}
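A quick sanity check of the function: with the default five runs scored and five allowed per game, the formula reduces to (25 + 25)^2 / (2 * 5 * 25) = 2500 / 250, the familiar ten runs per win.
IR()   # defaults RS = 5, RA = 5 give 10 runs per incremental win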
We’ll use this function to create a table for various runs scored/runs allowed combinations.
ir_table <- expand.grid(RS = seq(3, 6, 0.5),
RA = seq(3, 6, 0.5))
head(ir_table)
Finally, we calculate the incremental runs for the various scenarios. The spread() function in the third line of the following code is used to show the results in tabular form.
ir_table_tabular <- ir_table %>%
mutate(IRW = IR(RS, RA)) %>%
spread(key = RA, value = IRW, sep = "=") %>%
round(1)
ir_table_tabular
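spread() still works but has been superseded in tidyr; an equivalent version of the table using pivot_wider() would look roughly like this (a minimal sketch, rounding inside the pipeline instead of afterwards):
ir_table %>%
  mutate(IRW = round(IR(RS, RA), 1)) %>%
  pivot_wider(names_from = RA, values_from = IRW, names_prefix = "RA=")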
4.7 Exercises: Relationship Between Winning Percentage and Run Differential Across Decades
Teams %>% filter(yearID >= 1961) %>%
mutate(Era = ifelse(yearID <= 1970, "1961-1970",
ifelse(yearID <= 1980, "1971-1980",
ifelse(yearID <= 1990, "1981-1990",
ifelse(yearID <= 2000, "1991-2000",
ifelse(yearID <= 2010, "2001-2010",
ifelse(yearID <= 2020, "2011-2020", "2021-2030")))))),
WinPct = W / (W + L)) ->
Eras
one_fit <- function(years){
lm(WinPct ~ I(R - RA),
data = filter(Eras, Era == years))
}
the_eras <- c("1961-1970", "1971-1980",
"1981-1990", "1991-2000",
"2001-2010", "2011-2020",
"2021-2030")
seven_fits <- lapply(the_eras, one_fit)
names(seven_fits) <- the_eras
sapply(seven_fits, coef)
## 1961-1970 1971-1980 1981-1990 1991-2000 2001-2010
## (Intercept) 0.4999330793 0.4999883663 0.4999447717 0.499999401 0.4999909185
## I(R - RA) 0.0007039746 0.0006375447 0.0007013848 0.000627574 0.0006216137
## 2011-2020 2021-2030
## (Intercept) 0.4999769811 0.5000006390
## I(R - RA) 0.0006345252 0.0005968228
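The slopes can also be read as an approximate runs-per-win figure for each era; here's a minimal sketch that assumes a 162-game schedule throughout (so strike- and COVID-shortened seasons within an era are ignored).
slopes <- sapply(seven_fits, function(fit) coef(fit)[2])
round(1 / (162 * slopes), 1)   # roughly 9 to 10 runs per additional win in every era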
Now let's compare the predicted winning percentage for a team with an RD of 10 runs across the decades:
p10 <- function(fit){
  # R = 30 and RA = 20 give a run differential I(R - RA) of +10
  predict(fit, data.frame(R = 30, RA = 20))
}
sapply(seven_fits, p10)
## 1961-1970.1 1971-1980.1 1981-1990.1 1991-2000.1 2001-2010.1 2011-2020.1
## 0.5069728 0.5063638 0.5069586 0.5062751 0.5062071 0.5063222
## 2021-2030.1
## 0.5059689
As baseball was evolving into its modern form, nineteenth century leagues often featured abysmal teams that did not even succeed in finishing their season, as well as some dominant clubs.
library(Lahman)
library(tidyverse)
Teams %>% filter(yearID <= 1900) %>%
mutate(WinPct = W / (W + L)) ->
D_19th
Below I construct a graph of the fitted values (a linear function of R - RA) on the horizontal axis against the residuals on the vertical axis. I color the points by the winning proportion (bad is WinPct < .3 and great is WinPct > .7). We see some great teams with large positive residuals and bad teams with large negative residuals. By exploring further (see the sketch after the plot below), one can find the identity of the teams with the large residuals.
fit <- lm(WinPct ~ I(R - RA), data = D_19th)
fit
##
## Call:
## lm(formula = WinPct ~ I(R - RA), data = D_19th)
##
## Coefficients:
## (Intercept) I(R - RA)
## 0.4876483 0.0007367
library(broom)
out <- augment(fit)
# Add a 'type' column based on WinPct values
out <- out %>%
mutate(type = ifelse(WinPct > .7, "great",
ifelse(WinPct < .3, "bad", "other")))
# Plotting with ggplot2
ggplot(out, aes(x = `.fitted`, y = `.resid`, color = type)) +
geom_point() +
geom_hline(yintercept = 0)
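As promised above, here's a minimal sketch that re-augments the fit with the team identifiers to name the teams behind the largest residuals; it assumes the rows of D_19th are the same ones used in the fit (no missing run data).
augment(fit, data = D_19th) %>%
  arrange(desc(abs(.resid))) %>%
  select(yearID, teamID, W, L, WinPct, .fitted, .resid) %>%
  head()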
Retrosheet game logs report, for every game played, the managers of both teams.
Here I decided to look at the 2010-2024 period (adding the completed seasons of the current decade, since we aren't through it yet). I fit the linear model of winning percentage on run differential and store the output in the variable fit.
library(Lahman)
library(tidyverse)
library(broom)
Teams %>% filter(yearID >= 2010, yearID <= 2024) %>%
mutate(WinPct = W / (W + L)) -> d
fit <- lm(WinPct ~ I(R - RA), data = d)
summary(fit)
##
## Call:
## lm(formula = WinPct ~ I(R - RA), data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.134233 -0.017958 0.000799 0.016304 0.132188
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.000e-01 1.492e-03 335.06 <2e-16 ***
## I(R - RA) 6.213e-04 1.296e-05 47.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02947 on 388 degrees of freedom
## Multiple R-squared: 0.8555, Adjusted R-squared: 0.8551
## F-statistic: 2297 on 1 and 388 DF, p-value: < 2.2e-16
I find the residuals for each team/season. For each manager, I find the number of team-seasons managed (in this period) and the average standardized residual. I arrange the resulting data frame by the value of Mean_Residual. The head and tail of this table give the top overperforming and underperforming managers.
# Augment the team-season data with fitted values and residuals
out <- augment(fit, data = select(d, yearID, teamID, R, RA))
# Joining with Manager data
out <- out %>%
inner_join(Managers, by = c("yearID", "teamID"))
# Average the standardized residual for each manager across their team-seasons
output <- out %>%
group_by(playerID) %>%
summarize(N = n(), Mean_Residual = mean(.std.resid), .groups = 'drop') %>%
arrange(desc(Mean_Residual))
# Displaying the results
print("Top Overperforming Managers:")
## [1] "Top Overperforming Managers:"
print(head(output))
## # A tibble: 6 × 17
## yearID teamID R RA .fitted .hat .sigma .cooksd .std.resid playerID
## <int> <fct> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2010 ARI 713 836 0.424 0.00549 0.0295 0.00159 -0.760 hinchaj01
## 2 2010 ARI 713 836 0.424 0.00549 0.0295 0.00159 -0.760 gibsoki01
## 3 2010 ATL 738 629 0.568 0.00486 0.0295 0.000101 -0.203 coxbo01
## 4 2010 ATL 738 629 0.568 0.00486 0.0295 0.000101 -0.203 cadahch99
## 5 2010 BAL 613 785 0.393 0.00829 0.0295 0.000991 0.487 trembda99
## 6 2010 BAL 613 785 0.393 0.00829 0.0295 0.000991 0.487 samueju01
## # ℹ 7 more variables: lgID <fct>, inseason <int>, G <int>, W <int>, L <int>,
## # rank <int>, plyrMgr <fct>
print("Top Underperforming Managers:")
## [1] "Top Underperforming Managers:"
print(tail(output))
## # A tibble: 6 × 17
## yearID teamID R RA .fitted .hat .sigma .cooksd .std.resid playerID
## <int> <fct> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2022 TBA 666 614 0.532 0.00309 0.0295 3.64e-6 -0.0485 cashke01
## 2 2022 TEX 707 743 0.478 0.00281 0.0294 5.46e-3 -1.97 woodwch…
## 3 2022 TEX 707 743 0.478 0.00281 0.0294 5.46e-3 -1.97 beaslto…
## 4 2022 TOR 775 679 0.560 0.00435 0.0295 1.73e-4 0.281 montoch…
## 5 2022 TOR 775 679 0.560 0.00435 0.0295 1.73e-4 0.281 schnejo…
## 6 2022 WAS 603 855 0.343 0.0149 0.0295 1.35e-4 -0.134 martida…
## # ℹ 7 more variables: lgID <fct>, inseason <int>, G <int>, W <int>, L <int>,
## # rank <int>, plyrMgr <fct>
fit2 <- lm(Mean_Residual ~ N, data = output)
summary(fit2)
##
## Call:
## lm(formula = Mean_Residual ~ N, data = output)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.80851 -0.34557 -0.01774 0.43486 1.65354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.18387 0.07832 -2.348 0.0205 *
## N 0.02612 0.01600 1.633 0.1051
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5574 on 123 degrees of freedom
## Multiple R-squared: 0.02121, Adjusted R-squared: 0.01325
## F-statistic: 2.666 on 1 and 123 DF, p-value: 0.1051
ggplot(output, aes(x=N, y=Mean_Residual)) +
geom_point() +
geom_abline(intercept = 0, slope=0, color = "red")