1 Setting Up

# Reset the session before the analysis starts.
# NOTE(review): `cd` is not defined anywhere in the visible source; presumably
# it holds the project directory and already exists in the calling session.
# Confirm this, because the call fails if `cd` is missing.
setwd(cd)
# Remove every object in the current environment. This has to run after
# setwd(cd): if `cd` lives in this environment it is removed here as well.
rm(list = ls()) 
# Run garbage collection; at top level the memory-usage table is printed.
gc()

##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 518214 27.7    1145942 61.2         NA   669337 35.8
## Vcells 953920  7.3    8388608 64.0      16384  1839833 14.1

# Emit a form feed, which clears the console in front ends that honour it.
cat("\f")

# Close every open graphics device. The bare symbol `dev.off` (no parentheses)
# never called the function; it only printed the function's source.
# graphics.off() performs the intended reset and, unlike dev.off(), does not
# fail when only the null device is open.
graphics.off()

# Packages the analysis depends on; any that are missing are installed first.
packages <- c("psych", "tidyverse")

# Look up the installed-package table once instead of once per iteration,
# and install everything that is missing in a single call (over HTTPS).
missing_packages <- setdiff(packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(
    missing_packages,
    repos = "https://cran.rstudio.com/",
    dependencies = TRUE
  )
}
rm(missing_packages)

# Attach in the listed order so the masking order stays the same.
# Iterating over the values avoids the 1:length() pitfall on empty vectors.
for (pkg in packages) {
  library(pkg, character.only = TRUE)
}
rm(pkg)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()

# The package-name vector is no longer needed once the packages are attached.
rm(packages)

# MASS provides stepAIC(), used in the model-building section.
# It is attached after the tidyverse, so MASS::select() masks dplyr::select()
# (see the attach message that follows).
library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

library(stargazer)

## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

2 Data Exploration

Raw data figures shown below.

# Read the training and evaluation sets (paths are relative to the working
# directory).
df_train <- read.csv(file = "moneyball-training-data.csv")
df_eval <- read.csv(file = "moneyball-evaluation-data.csv")

# Per-column descriptive statistics for the training data.
psych::describe(df_train)

##                  vars    n    mean      sd median trimmed    mad  min   max
## INDEX               1 2276 1268.46  736.35 1270.5 1268.57 952.57    1  2535
## TARGET_WINS         2 2276   80.79   15.75   82.0   81.31  14.83    0   146
## TEAM_BATTING_H      3 2276 1469.27  144.59 1454.0 1459.04 114.16  891  2554
## TEAM_BATTING_2B     4 2276  241.25   46.80  238.0  240.40  47.44   69   458
## TEAM_BATTING_3B     5 2276   55.25   27.94   47.0   52.18  23.72    0   223
## TEAM_BATTING_HR     6 2276   99.61   60.55  102.0   97.39  78.58    0   264
## TEAM_BATTING_BB     7 2276  501.56  122.67  512.0  512.18  94.89    0   878
## TEAM_BATTING_SO     8 2174  735.61  248.53  750.0  742.31 284.66    0  1399
## TEAM_BASERUN_SB     9 2145  124.76   87.79  101.0  110.81  60.79    0   697
## TEAM_BASERUN_CS    10 1504   52.80   22.96   49.0   50.36  17.79    0   201
## TEAM_BATTING_HBP   11  191   59.36   12.97   58.0   58.86  11.86   29    95
## TEAM_PITCHING_H    12 2276 1779.21 1406.84 1518.0 1555.90 174.95 1137 30132
## TEAM_PITCHING_HR   13 2276  105.70   61.30  107.0  103.16  74.13    0   343
## TEAM_PITCHING_BB   14 2276  553.01  166.36  536.5  542.62  98.59    0  3645
## TEAM_PITCHING_SO   15 2174  817.73  553.09  813.5  796.93 257.23    0 19278
## TEAM_FIELDING_E    16 2276  246.48  227.77  159.0  193.44  62.27   65  1898
## TEAM_FIELDING_DP   17 1990  146.39   26.23  149.0  147.58  23.72   52   228
##                  range  skew kurtosis    se
## INDEX             2534  0.00    -1.22 15.43
## TARGET_WINS        146 -0.40     1.03  0.33
## TEAM_BATTING_H    1663  1.57     7.28  3.03
## TEAM_BATTING_2B    389  0.22     0.01  0.98
## TEAM_BATTING_3B    223  1.11     1.50  0.59
## TEAM_BATTING_HR    264  0.19    -0.96  1.27
## TEAM_BATTING_BB    878 -1.03     2.18  2.57
## TEAM_BATTING_SO   1399 -0.30    -0.32  5.33
## TEAM_BASERUN_SB    697  1.97     5.49  1.90
## TEAM_BASERUN_CS    201  1.98     7.62  0.59
## TEAM_BATTING_HBP    66  0.32    -0.11  0.94
## TEAM_PITCHING_H  28995 10.33   141.84 29.49
## TEAM_PITCHING_HR   343  0.29    -0.60  1.28
## TEAM_PITCHING_BB  3645  6.74    96.97  3.49
## TEAM_PITCHING_SO 19278 22.17   671.19 11.86
## TEAM_FIELDING_E   1833  2.99    10.97  4.77
## TEAM_FIELDING_DP   176 -0.39     0.18  0.59

# Compact column-by-column preview of the evaluation data.
dplyr::glimpse(df_eval)

## Rows: 259
## Columns: 16
## $ INDEX            <int> 9, 10, 14, 47, 60, 63, 74, 83, 98, 120, 123, 135, 138…
## $ TEAM_BATTING_H   <int> 1209, 1221, 1395, 1539, 1445, 1431, 1430, 1385, 1259,…
## $ TEAM_BATTING_2B  <int> 170, 151, 183, 309, 203, 236, 219, 158, 177, 212, 243…
## $ TEAM_BATTING_3B  <int> 33, 29, 29, 29, 68, 53, 55, 42, 78, 42, 40, 55, 57, 2…
## $ TEAM_BATTING_HR  <int> 83, 88, 93, 159, 5, 10, 37, 33, 23, 58, 50, 164, 186,…
## $ TEAM_BATTING_BB  <int> 447, 516, 509, 486, 95, 215, 568, 356, 466, 452, 495,…
## $ TEAM_BATTING_SO  <int> 1080, 929, 816, 914, 416, 377, 527, 609, 689, 584, 64…
## $ TEAM_BASERUN_SB  <int> 62, 54, 59, 148, NA, NA, 365, 185, 150, 52, 64, 48, 3…
## $ TEAM_BASERUN_CS  <int> 50, 39, 47, 57, NA, NA, NA, NA, NA, NA, NA, 28, 21, 8…
## $ TEAM_BATTING_HBP <int> NA, NA, NA, 42, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ TEAM_PITCHING_H  <int> 1209, 1221, 1395, 1539, 3902, 2793, 1544, 1626, 1342,…
## $ TEAM_PITCHING_HR <int> 83, 88, 93, 159, 14, 20, 40, 39, 25, 62, 53, 173, 196…
## $ TEAM_PITCHING_BB <int> 447, 516, 509, 486, 257, 420, 613, 418, 497, 482, 521…
## $ TEAM_PITCHING_SO <int> 1080, 929, 816, 914, 1123, 736, 569, 715, 734, 622, 6…
## $ TEAM_FIELDING_E  <int> 140, 135, 156, 124, 616, 572, 490, 328, 226, 184, 200…
## $ TEAM_FIELDING_DP <int> 156, 164, 153, 154, 130, 105, NA, 104, 132, 145, 183,…

# Number of years from 1871 through 2006, both ends included.
# NOTE(review): presumably the span of seasons covered by the data; confirm.
print(length(1871:2006), digits = 1)

## [1] 136

# 136 (the year count printed above) multiplied by 162.
# NOTE(review): 162 is presumably the number of games per season; confirm.
prod(136, 162)

## [1] 22032

2.1 Missing Data

Finding any missing data, visualizing it, and imputing or omitting values where necessary.

library(visdat)

# Overview of column types and missing cells in the raw training data.
vis_dat(df_train)

# Work on a copy so the raw training data stays untouched.
df_no_missing <- df_train

# TEAM_BATTING_HBP is observed in only 191 of 2276 rows, so the column is
# dropped rather than imputed.
# The script previously also assigned NULL to `TEAM_BATTING_CS`; no column of
# that name exists (the data has TEAM_BASERUN_CS, which is imputed below), so
# that statement was a no-op and has been removed.
df_no_missing$TEAM_BATTING_HBP <- NULL
vis_miss(df_no_missing)

# Distribution of TEAM_BASERUN_CS before imputation.
hist(df_no_missing$TEAM_BASERUN_CS)

# Mean-impute the missing TEAM_BASERUN_CS values. The mean is taken over the
# observed values only (the replacement value is computed before the
# assignment happens).
df_no_missing$TEAM_BASERUN_CS <- replace(
  df_no_missing$TEAM_BASERUN_CS,
  is.na(df_no_missing$TEAM_BASERUN_CS),
  mean(df_no_missing$TEAM_BASERUN_CS, na.rm = TRUE)
)

# Distribution after imputation, first with base graphics, then with ggplot2
# using the default bin count.
hist(df_no_missing$TEAM_BASERUN_CS)

ggplot(df_no_missing, aes(x = TEAM_BASERUN_CS)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of TEAM_BASERUN_CS with an explicit bin width. The title and axis
# label name the plotted variable instead of the placeholder "X".
ggplot(df_no_missing, aes(x = TEAM_BASERUN_CS)) +
  geom_histogram(color = "black", fill = "blue", binwidth = 10) +
  labs(
    title = "Histogram of TEAM_BASERUN_CS",
    x = "TEAM_BASERUN_CS",
    y = "Count"
  )

# Drop every row that still has a missing value in any remaining column.
df_no_missing <- na.omit(df_no_missing)

2.2 Finding Correlation

Identifying which variables have correlation.

library(ggcorrplot)

# Pairwise correlations and their p-values across every column of the data
# frame (selecting columns 1..ncol is the same as passing the frame itself).
mycorr <- cor(df_no_missing)
p.mat <- cor_pmat(df_no_missing)

# First rows of the p-value matrix.
head(p.mat)

##                       INDEX  TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## INDEX           0.000000000 3.085723e-01   2.870071e-01    1.012176e-01
## TARGET_WINS     0.308572334 0.000000e+00   1.013043e-54    2.490498e-20
## TEAM_BATTING_H  0.287007118 1.013043e-54   0.000000e+00   9.355168e-233
## TEAM_BATTING_2B 0.101217596 2.490498e-20  9.355168e-233    0.000000e+00
## TEAM_BATTING_3B 0.769316672 1.340586e-07   2.896946e-60    2.894398e-02
## TEAM_BATTING_HR 0.006420957 1.416443e-21   1.602541e-06    1.786146e-62
##                 TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## INDEX              7.693167e-01    6.420957e-03    2.259876e-02    7.106396e-07
## TARGET_WINS        1.340586e-07    1.416443e-21    6.788724e-40    1.082744e-02
## TEAM_BATTING_H     2.896946e-60    1.602541e-06    1.402162e-05    3.110665e-54
## TEAM_BATTING_2B    2.894398e-02    1.786146e-62    2.281359e-23    1.601577e-07
## TEAM_BATTING_3B    0.000000e+00   3.146352e-208    5.493361e-25   1.722999e-271
## TEAM_BATTING_HR   3.146352e-208    0.000000e+00    2.833152e-68   4.262615e-267
##                 TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_H
## INDEX              1.177395e-04    9.919907e-01    9.517913e-01
## TARGET_WINS        1.846788e-07    6.773671e-01    8.836481e-22
## TEAM_BATTING_H     1.497875e-02    6.096341e-01   7.485250e-277
## TEAM_BATTING_2B    9.931836e-07    9.663607e-06    6.214542e-71
## TEAM_BATTING_3B    8.653120e-30    2.117662e-35    5.645504e-73
## TEAM_BATTING_HR    1.327937e-41    9.273809e-63    2.113987e-05
##                 TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO
## INDEX               7.419790e-03     1.014059e-02     1.744343e-06
## TARGET_WINS         1.685830e-21     2.011167e-32     4.962297e-03
## TEAM_BATTING_H      4.769017e-08     3.435879e-08     7.730597e-49
## TEAM_BATTING_2B     1.151450e-63     2.211572e-17     5.592219e-07
## TEAM_BATTING_3B    2.009525e-190     1.798207e-07    2.195339e-225
## TEAM_BATTING_HR     0.000000e+00     2.039060e-29    1.157630e-220
##                 TEAM_FIELDING_E TEAM_FIELDING_DP
## INDEX              1.287249e-02     7.993341e-01
## TARGET_WINS        1.454931e-15     1.165960e-01
## TEAM_BATTING_H     2.969523e-04     1.173988e-11
## TEAM_BATTING_2B    9.295027e-35     1.901141e-14
## TEAM_BATTING_3B   2.596789e-244     7.211648e-21
## TEAM_BATTING_HR   4.191484e-271     2.530492e-36

  # Correlation heat map: coefficients are printed in each tile, and pairs
  # flagged as insignificant via `p.mat` are marked with a cross (pch 4).
  # Variables are reordered by hierarchical clustering.
  myplot <- ggcorrplot(
    corr = mycorr,
    p.mat = p.mat,
    method = "square",
    type = "full",
    hc.order = TRUE,
    title = "Correlation Plot",
    colors = c("red", "white", "green"),
    lab = TRUE,
    lab_size = 2,
    digits = 2,
    insig = "pch",
    pch = 4,
    tl.cex = 8,
    tl.col = "black"
  )

myplot

Box plots to find patterns in data.

# One box plot per variable, each facet on its own scale. INDEX is outside
# the selected column range and is therefore not plotted.
# pivot_longer() replaces the superseded gather(); labs(... = NULL) is the
# documented way to remove an axis label.
df_no_missing %>%
  pivot_longer(
    cols = TARGET_WINS:TEAM_FIELDING_DP,
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot() +
  facet_wrap(~variable, scales = "free", ncol = 4) +
  labs(x = NULL, y = NULL)

3 Data Preparation

Filtering out outlier observations (teams with fewer than 20 or more than 120 wins) so that they do not distort the rest of the data.

# Keep only rows whose TARGET_WINS lies strictly between 19 and 121.
df_no_missing <- filter(
  df_no_missing,
  TARGET_WINS > 19,
  TARGET_WINS < 121
)

Reviewing prepped data set.

# Scatter plot of TARGET_WINS against every other column (INDEX included),
# one facet per column, each with a fitted straight line.
# pivot_longer() replaces the superseded gather(); labs(x = NULL) is the
# documented way to remove an axis label.
df_no_missing %>%
  pivot_longer(
    cols = -TARGET_WINS,
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = value, y = TARGET_WINS)) +
  geom_point(fill = "blue", color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  facet_wrap(~variable, scales = "free", ncol = 4) +
  labs(x = NULL, y = "Wins")

## `geom_smooth()` using formula = 'y ~ x'

# Recompute the correlation and p-value matrices on the filtered data
# (all columns of the data frame).
mycorr <- cor(df_no_missing)
p.mat <- ggcorrplot::cor_pmat(df_no_missing)

# First rows of the p-value matrix.
head(p.mat)

##                       INDEX  TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## INDEX           0.000000000 3.085723e-01   2.870071e-01    1.012176e-01
## TARGET_WINS     0.308572334 0.000000e+00   1.013043e-54    2.490498e-20
## TEAM_BATTING_H  0.287007118 1.013043e-54   0.000000e+00   9.355168e-233
## TEAM_BATTING_2B 0.101217596 2.490498e-20  9.355168e-233    0.000000e+00
## TEAM_BATTING_3B 0.769316672 1.340586e-07   2.896946e-60    2.894398e-02
## TEAM_BATTING_HR 0.006420957 1.416443e-21   1.602541e-06    1.786146e-62
##                 TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## INDEX              7.693167e-01    6.420957e-03    2.259876e-02    7.106396e-07
## TARGET_WINS        1.340586e-07    1.416443e-21    6.788724e-40    1.082744e-02
## TEAM_BATTING_H     2.896946e-60    1.602541e-06    1.402162e-05    3.110665e-54
## TEAM_BATTING_2B    2.894398e-02    1.786146e-62    2.281359e-23    1.601577e-07
## TEAM_BATTING_3B    0.000000e+00   3.146352e-208    5.493361e-25   1.722999e-271
## TEAM_BATTING_HR   3.146352e-208    0.000000e+00    2.833152e-68   4.262615e-267
##                 TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_H
## INDEX              1.177395e-04    9.919907e-01    9.517913e-01
## TARGET_WINS        1.846788e-07    6.773671e-01    8.836481e-22
## TEAM_BATTING_H     1.497875e-02    6.096341e-01   7.485250e-277
## TEAM_BATTING_2B    9.931836e-07    9.663607e-06    6.214542e-71
## TEAM_BATTING_3B    8.653120e-30    2.117662e-35    5.645504e-73
## TEAM_BATTING_HR    1.327937e-41    9.273809e-63    2.113987e-05
##                 TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO
## INDEX               7.419790e-03     1.014059e-02     1.744343e-06
## TARGET_WINS         1.685830e-21     2.011167e-32     4.962297e-03
## TEAM_BATTING_H      4.769017e-08     3.435879e-08     7.730597e-49
## TEAM_BATTING_2B     1.151450e-63     2.211572e-17     5.592219e-07
## TEAM_BATTING_3B    2.009525e-190     1.798207e-07    2.195339e-225
## TEAM_BATTING_HR     0.000000e+00     2.039060e-29    1.157630e-220
##                 TEAM_FIELDING_E TEAM_FIELDING_DP
## INDEX              1.287249e-02     7.993341e-01
## TARGET_WINS        1.454931e-15     1.165960e-01
## TEAM_BATTING_H     2.969523e-04     1.173988e-11
## TEAM_BATTING_2B    9.295027e-35     1.901141e-14
## TEAM_BATTING_3B   2.596789e-244     7.211648e-21
## TEAM_BATTING_HR   4.191484e-271     2.530492e-36

  # Correlation heat map for the filtered data: coefficients are printed in
  # each tile, and pairs flagged as insignificant via `p.mat` are marked with
  # a cross (pch 4). Variables are reordered by hierarchical clustering.
  myplot <- ggcorrplot(
    corr = mycorr,
    p.mat = p.mat,
    method = "square",
    type = "full",
    hc.order = TRUE,
    title = "Correlation Plot",
    colors = c("red", "white", "green"),
    lab = TRUE,
    lab_size = 2,
    digits = 2,
    insig = "pch",
    pch = 4,
    tl.cex = 8,
    tl.col = "black"
  )

myplot

4 Build Models

# Model 1: every other column as a predictor.
# NOTE(review): `.` also pulls in INDEX, which looks like a row identifier;
# confirm it is meant to be a predictor.
model1 <- lm(TARGET_WINS ~ ., data = df_no_missing)

# Model 2: three hand-picked predictors.
model2 <- lm(
  TARGET_WINS ~ TEAM_PITCHING_H + TEAM_FIELDING_E + TEAM_PITCHING_BB,
  data = df_no_missing
)

# Model 3: backward stepwise selection by AIC, starting from the full model.
# stepAIC() prints its trace at each step.
model3 <- stepAIC(
  lm(TARGET_WINS ~ ., data = df_no_missing),
  direction = "backward"
)

## Start:  AIC=8526.62
## TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + 
##     TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## - TEAM_PITCHING_HR  1      11.1 187978 8524.7
## - INDEX             1      21.5 187989 8524.8
## - TEAM_BATTING_SO   1     104.2 188071 8525.6
## <none>                          187967 8526.6
## - TEAM_BATTING_HR   1     235.9 188203 8526.9
## - TEAM_BATTING_H    1     272.3 188239 8527.3
## - TEAM_PITCHING_BB  1     353.0 188320 8528.1
## - TEAM_PITCHING_SO  1     436.0 188403 8528.9
## - TEAM_BASERUN_CS   1     670.4 188637 8531.2
## - TEAM_BATTING_BB   1     716.8 188684 8531.6
## - TEAM_PITCHING_H   1    1312.2 189279 8537.4
## - TEAM_BATTING_2B   1    3237.3 191204 8556.0
## - TEAM_FIELDING_DP  1    8408.7 196376 8604.9
## - TEAM_BATTING_3B   1    9723.5 197691 8617.2
## - TEAM_BASERUN_SB   1   15739.9 203707 8672.2
## - TEAM_FIELDING_E   1   29328.6 217296 8790.7
## 
## Step:  AIC=8524.73
## TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## - INDEX             1      22.7 188001 8523.0
## <none>                          187978 8524.7
## - TEAM_BATTING_SO   1     229.8 188208 8525.0
## - TEAM_BATTING_H    1     326.6 188305 8525.9
## - TEAM_PITCHING_BB  1     404.6 188383 8526.7
## - TEAM_BASERUN_CS   1     685.1 188663 8529.4
## - TEAM_BATTING_BB   1     799.6 188778 8530.5
## - TEAM_PITCHING_SO  1     822.9 188801 8530.7
## - TEAM_PITCHING_H   1    1494.3 189473 8537.3
## - TEAM_BATTING_2B   1    3235.3 191214 8554.0
## - TEAM_FIELDING_DP  1    8431.8 196410 8603.2
## - TEAM_BATTING_3B   1    9719.2 197697 8615.2
## - TEAM_BATTING_HR   1   11281.0 199259 8629.7
## - TEAM_BASERUN_SB   1   15854.4 203833 8671.3
## - TEAM_FIELDING_E   1   29390.8 217369 8789.3
## 
## Step:  AIC=8522.95
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          188001 8523.0
## - TEAM_BATTING_SO   1     229.1 188230 8523.2
## - TEAM_BATTING_H    1     325.8 188327 8524.1
## - TEAM_PITCHING_BB  1     400.4 188401 8524.9
## - TEAM_BASERUN_CS   1     677.5 188679 8527.6
## - TEAM_BATTING_BB   1     795.1 188796 8528.7
## - TEAM_PITCHING_SO  1     825.2 188826 8529.0
## - TEAM_PITCHING_H   1    1488.2 189489 8535.4
## - TEAM_BATTING_2B   1    3223.9 191225 8552.2
## - TEAM_FIELDING_DP  1    8462.1 196463 8601.7
## - TEAM_BATTING_3B   1    9711.3 197712 8613.4
## - TEAM_BATTING_HR   1   11284.0 199285 8627.9
## - TEAM_BASERUN_SB   1   15856.4 203857 8669.5
## - TEAM_FIELDING_E   1   29396.6 217398 8787.5

# Side-by-side regression table for the three models, rendered as plain text.
stargazer(model1, model2, model3, type = "text")

## 
## ================================================================================================
##                                                 Dependent variable:                             
##                     ----------------------------------------------------------------------------
##                                                     TARGET_WINS                                 
##                                (1)                      (2)                       (3)           
## ------------------------------------------------------------------------------------------------
## INDEX                        -0.0002                                                            
##                             (0.0003)                                                            
##                                                                                                 
## TEAM_BATTING_H               -0.027                                             -0.028*         
##                              (0.017)                                            (0.016)         
##                                                                                                 
## TEAM_BATTING_2B             -0.050***                                          -0.050***        
##                              (0.009)                                            (0.009)         
##                                                                                                 
## TEAM_BATTING_3B             0.185***                                           0.184***         
##                              (0.019)                                            (0.019)         
##                                                                                                 
## TEAM_BATTING_HR               0.123                                            0.097***         
##                              (0.082)                                            (0.009)         
##                                                                                                 
## TEAM_BATTING_BB             0.112***                                           0.115***         
##                              (0.042)                                            (0.041)         
##                                                                                                 
## TEAM_BATTING_SO               0.022                                              0.026          
##                              (0.022)                                            (0.018)         
##                                                                                                 
## TEAM_BASERUN_SB             0.077***                                           0.077***         
##                              (0.006)                                            (0.006)         
##                                                                                                 
## TEAM_BASERUN_CS             -0.036**                                           -0.036**         
##                              (0.014)                                            (0.014)         
##                                                                                                 
## TEAM_PITCHING_H             0.054***                  0.017***                 0.055***         
##                              (0.015)                  (0.002)                   (0.014)         
##                                                                                                 
## TEAM_PITCHING_HR             -0.026                                                             
##                              (0.078)                                                            
##                                                                                                 
## TEAM_PITCHING_BB             -0.075*                  0.020***                 -0.077**         
##                              (0.040)                  (0.003)                   (0.039)         
##                                                                                                 
## TEAM_PITCHING_SO            -0.043**                                           -0.047***        
##                              (0.021)                                            (0.017)         
##                                                                                                 
## TEAM_FIELDING_E             -0.121***                -0.048***                 -0.120***        
##                              (0.007)                  (0.005)                   (0.007)         
##                                                                                                 
## TEAM_FIELDING_DP            -0.111***                                          -0.111***        
##                              (0.012)                                            (0.012)         
##                                                                                                 
## Constant                    60.797***                51.788***                 60.856***        
##                              (6.066)                  (2.688)                   (6.057)         
##                                                                                                 
## ------------------------------------------------------------------------------------------------
## Observations                  1,835                    1,835                     1,835          
## R2                            0.408                    0.129                     0.408          
## Adjusted R2                   0.403                    0.128                     0.404          
## Residual Std. Error    10.165 (df = 1819)        12.289 (df = 1831)       10.161 (df = 1821)    
## F Statistic         83.597*** (df = 15; 1819) 90.539*** (df = 3; 1831) 96.522*** (df = 13; 1821)
## ================================================================================================
## Note:                                                                *p<0.1; **p<0.05; ***p<0.01

5 Select Models

Models 1 and 3 produced similar results, but I would select Model 3 as the best model: it has the highest adjusted R-squared (0.404, with an R-squared of about 41%) and a higher F-statistic (96.5) than Model 1 (83.6), while using fewer predictors. Model 1 comes in second place for similar reasons, and Model 2 comes last, with an R-squared of only about 13%.

# Draw the lm diagnostic plots for model 3 in a 2 x 2 grid.
# The stray `?par` help lookup was removed: it only opens a help page and has
# no place in a rendered report. The previous graphics parameters are saved
# and restored so later plots are not forced into the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(model3)
par(old_par)

In the residuals vs. fitted plot, the residuals are evenly distributed around the line. The points in the normal Q-Q plot follow a straight, upward-sloping line, which indicates that the residuals are approximately normally distributed (not uniformly distributed). The three plots that include a trend line all show a similar trend, meaning there is a consistent linear relationship between the two plotted quantities in each case. The slope of the trend line is about the same in all three plots, indicating that the change in the quantity on the y-axis for a unit change in the quantity on the x-axis is consistent across the plots. However, one possible issue to investigate further is the clumping on the left of the residuals vs. leverage plot: there could be influential observations in the dataset that are having a strong impact on the regression model.

Homework 1

Bella Starlin

2023-04-10