Load Packages and Data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
mlb <- read.csv("mlb.csv")

Question 1: Estimate the first regression

# Run the basic regression
model1 <- lm(log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr + 
               frstbase + scndbase + thrdbase + shrtstop + catcher, data = mlb)

# Report using modelsummary
summary(model1)
## 
## Call:
## lm(formula = log(salary) ~ years + gamesyr + bavg + hrunsyr + 
##     rbisyr + frstbase + scndbase + thrdbase + shrtstop + catcher, 
##     data = mlb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0206 -0.4713 -0.0332  0.4562  2.9438 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.020184   0.352620  31.252  < 2e-16 ***
## years        0.067665   0.012666   5.342 1.72e-07 ***
## gamesyr      0.012759   0.002863   4.456 1.14e-05 ***
## bavg         0.001559   0.001397   1.116    0.265    
## hrunsyr      0.011562   0.016579   0.697    0.486    
## rbisyr       0.011775   0.007477   1.575    0.116    
## frstbase    -0.190745   0.132553  -1.439    0.151    
## scndbase    -0.134699   0.145501  -0.926    0.355    
## thrdbase     0.012231   0.141637   0.086    0.931    
## shrtstop    -0.008383   0.129603  -0.065    0.948    
## catcher      0.202499   0.126045   1.607    0.109    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7257 on 328 degrees of freedom
## Multiple R-squared:  0.628,  Adjusted R-squared:  0.6166 
## F-statistic: 55.37 on 10 and 328 DF,  p-value: < 2.2e-16

Question 2: Explain why log(salary)

Question 3: Rank the positions

# Extract and rank position coefficients
coef(summary(model1)) %>%
  as.data.frame() %>%
  rownames_to_column(var = "Variable") %>%
  filter(Variable %in% c("D1B", "D2B", "D3B", "DSS", "DC")) %>%
  arrange(Estimate)
## [1] Variable   Estimate   Std. Error t value    Pr(>|t|)  
## <0 rows> (or 0-length row.names)

Question 4: Write down the regression

log(salaryi) = B0 + B1yearsi + B2gamesyr + B3bavg + B4hrunsyr + B5rbisyr + B6frstbase + B7scndbase + B8thrdbase + B9shrtstop + B10catcher + y1(years x frstbaseBi) + y2(years x scndbaseBi) + y3(years x thrdbaseBi) + y4(years x shrtstopBi) + ei

Question 5: Estimate the new regression in R

# Run the interaction regression
model2 <- lm(log(salary) ~  years + gamesyr + bavg + hrunsyr + rbisyr + 
               frstbase + scndbase + thrdbase + shrtstop + catcher +
               years:frstbase + years:scndbase + years:thrdbase + years:shrtstop
             + years:catcher, data = mlb)

# Output regression results
summary(model2, output = "markdown")
## 
## Call:
## lm(formula = log(salary) ~ years + gamesyr + bavg + hrunsyr + 
##     rbisyr + frstbase + scndbase + thrdbase + shrtstop + catcher + 
##     years:frstbase + years:scndbase + years:thrdbase + years:shrtstop + 
##     years:catcher, data = mlb)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.75778 -0.41739 -0.03735  0.44433  2.90950 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    11.099991   0.356512  31.135  < 2e-16 ***
## years           0.038451   0.017469   2.201   0.0284 *  
## gamesyr         0.012551   0.002865   4.381  1.6e-05 ***
## bavg            0.001930   0.001420   1.359   0.1750    
## hrunsyr         0.014157   0.016542   0.856   0.3927    
## rbisyr          0.012056   0.007477   1.612   0.1078    
## frstbase       -0.257126   0.230681  -1.115   0.2658    
## scndbase       -0.478296   0.278717  -1.716   0.0871 .  
## thrdbase       -0.421422   0.312445  -1.349   0.1784    
## shrtstop       -0.295563   0.234409  -1.261   0.2083    
## catcher        -0.338273   0.243794  -1.388   0.1662    
## years:frstbase  0.007264   0.030641   0.237   0.8127    
## years:scndbase  0.054945   0.036438   1.508   0.1326    
## years:thrdbase  0.067901   0.044390   1.530   0.1271    
## years:shrtstop  0.048142   0.032376   1.487   0.1380    
## years:catcher   0.090585   0.035021   2.587   0.0101 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7204 on 323 degrees of freedom
## Multiple R-squared:  0.639,  Adjusted R-squared:  0.6222 
## F-statistic: 38.11 on 15 and 323 DF,  p-value: < 2.2e-16

Question 6: Test if years effect is same for all positions

Run an F-test comparing model2 (interaction model) and model1:

anova(model1, model2)
## Analysis of Variance Table
## 
## Model 1: log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr + frstbase + 
##     scndbase + thrdbase + shrtstop + catcher
## Model 2: log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr + frstbase + 
##     scndbase + thrdbase + shrtstop + catcher + years:frstbase + 
##     years:scndbase + years:thrdbase + years:shrtstop + years:catcher
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1    328 172.73                              
## 2    323 167.62  5     5.112 1.9701 0.08269 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Question 7: How to test if all positions have same salary

# Model without position dummies
model_no_positions <- lm(log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr, data = mlb)

# F-test comparing models
anova(model_no_positions, model1)
## Analysis of Variance Table
## 
## Model 1: log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr
## Model 2: log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr + frstbase + 
##     scndbase + thrdbase + shrtstop + catcher
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    333 176.52                           
## 2    328 172.73  5    3.7826 1.4365 0.2106