library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
mlb <- read.csv("mlb.csv")
# Run the basic regression
model1 <- lm(log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr +
frstbase + scndbase + thrdbase + shrtstop + catcher, data = mlb)
# Report using modelsummary
summary(model1)
##
## Call:
## lm(formula = log(salary) ~ years + gamesyr + bavg + hrunsyr +
## rbisyr + frstbase + scndbase + thrdbase + shrtstop + catcher,
## data = mlb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0206 -0.4713 -0.0332 0.4562 2.9438
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.020184 0.352620 31.252 < 2e-16 ***
## years 0.067665 0.012666 5.342 1.72e-07 ***
## gamesyr 0.012759 0.002863 4.456 1.14e-05 ***
## bavg 0.001559 0.001397 1.116 0.265
## hrunsyr 0.011562 0.016579 0.697 0.486
## rbisyr 0.011775 0.007477 1.575 0.116
## frstbase -0.190745 0.132553 -1.439 0.151
## scndbase -0.134699 0.145501 -0.926 0.355
## thrdbase 0.012231 0.141637 0.086 0.931
## shrtstop -0.008383 0.129603 -0.065 0.948
## catcher 0.202499 0.126045 1.607 0.109
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7257 on 328 degrees of freedom
## Multiple R-squared: 0.628, Adjusted R-squared: 0.6166
## F-statistic: 55.37 on 10 and 328 DF, p-value: < 2.2e-16
# Extract and rank position coefficients
coef(summary(model1)) %>%
as.data.frame() %>%
rownames_to_column(var = "Variable") %>%
filter(Variable %in% c("D1B", "D2B", "D3B", "DSS", "DC")) %>%
arrange(Estimate)
## [1] Variable Estimate Std. Error t value Pr(>|t|)
## <0 rows> (or 0-length row.names)
log(salaryi) = B0 + B1yearsi + B2gamesyr + B3bavg + B4hrunsyr + B5rbisyr + B6frstbase + B7scndbase + B8thrdbase + B9shrtstop + B10catcher + y1(years x frstbaseBi) + y2(years x scndbaseBi) + y3(years x thrdbaseBi) + y4(years x shrtstopBi) + ei
# Run the interaction regression
model2 <- lm(log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr +
frstbase + scndbase + thrdbase + shrtstop + catcher +
years:frstbase + years:scndbase + years:thrdbase + years:shrtstop
+ years:catcher, data = mlb)
# Output regression results
summary(model2, output = "markdown")
##
## Call:
## lm(formula = log(salary) ~ years + gamesyr + bavg + hrunsyr +
## rbisyr + frstbase + scndbase + thrdbase + shrtstop + catcher +
## years:frstbase + years:scndbase + years:thrdbase + years:shrtstop +
## years:catcher, data = mlb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.75778 -0.41739 -0.03735 0.44433 2.90950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.099991 0.356512 31.135 < 2e-16 ***
## years 0.038451 0.017469 2.201 0.0284 *
## gamesyr 0.012551 0.002865 4.381 1.6e-05 ***
## bavg 0.001930 0.001420 1.359 0.1750
## hrunsyr 0.014157 0.016542 0.856 0.3927
## rbisyr 0.012056 0.007477 1.612 0.1078
## frstbase -0.257126 0.230681 -1.115 0.2658
## scndbase -0.478296 0.278717 -1.716 0.0871 .
## thrdbase -0.421422 0.312445 -1.349 0.1784
## shrtstop -0.295563 0.234409 -1.261 0.2083
## catcher -0.338273 0.243794 -1.388 0.1662
## years:frstbase 0.007264 0.030641 0.237 0.8127
## years:scndbase 0.054945 0.036438 1.508 0.1326
## years:thrdbase 0.067901 0.044390 1.530 0.1271
## years:shrtstop 0.048142 0.032376 1.487 0.1380
## years:catcher 0.090585 0.035021 2.587 0.0101 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7204 on 323 degrees of freedom
## Multiple R-squared: 0.639, Adjusted R-squared: 0.6222
## F-statistic: 38.11 on 15 and 323 DF, p-value: < 2.2e-16
Run an F-test comparing model2
(interaction model) and model1:
anova(model1, model2)
## Analysis of Variance Table
##
## Model 1: log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr + frstbase +
## scndbase + thrdbase + shrtstop + catcher
## Model 2: log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr + frstbase +
## scndbase + thrdbase + shrtstop + catcher + years:frstbase +
## years:scndbase + years:thrdbase + years:shrtstop + years:catcher
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 328 172.73
## 2 323 167.62 5 5.112 1.9701 0.08269 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model without position dummies
model_no_positions <- lm(log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr, data = mlb)
# F-test comparing models
anova(model_no_positions, model1)
## Analysis of Variance Table
##
## Model 1: log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr
## Model 2: log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr + frstbase +
## scndbase + thrdbase + shrtstop + catcher
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 333 176.52
## 2 328 172.73 5 3.7826 1.4365 0.2106