Section 1 - Data Exploration

The original data set was loaded and, after dropping the INDEX column, contains 2276 rows and 16 columns (the response TARGET_WINS plus 15 candidate predictors) related to batting, base running, pitching, and fielding. A summary and boxplots of all variables suggest that some columns have missing data and that some may contain outliers.

The target variable (TARGET_WINS) was plotted against each of the candidate predictor variables. The correlation values suggest that the individual relationships with the target variable are not very strong, and the scatterplots show neither an obvious positive nor an obvious negative linear trend.

In contrast, a correlation matrix among possible predictors suggests some variables with strong relationships (e.g. TEAM_BATTING_SO and TEAM_PITCHING_SO).

library(ggcorrplot)
library(dplyr)
library(car)

# Load the training CSV directly from its GitHub URL; text columns stay as
# character vectors instead of being converted to factors.
df <- read.csv(
  file = "https://raw.githubusercontent.com/L-Velasco/DATA621_FA18/master/HW1/moneyball-training-data.csv",
  stringsAsFactors = FALSE
)

# Drop the first column (INDEX); every other column is kept as-is
tr <- df[, -1, drop = FALSE]

1.1 Size, Summary and Boxplots of the Data

dim(tr)
## [1] 2276   16
summary(tr)
##   TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B 
##  Min.   :  0.00   Min.   : 891   Min.   : 69.0   Min.   :  0.00  
##  1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0   1st Qu.: 34.00  
##  Median : 82.00   Median :1454   Median :238.0   Median : 47.00  
##  Mean   : 80.79   Mean   :1469   Mean   :241.2   Mean   : 55.25  
##  3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0   3rd Qu.: 72.00  
##  Max.   :146.00   Max.   :2554   Max.   :458.0   Max.   :223.00  
##                                                                  
##  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO  TEAM_BASERUN_SB
##  Min.   :  0.00   Min.   :  0.0   Min.   :   0.0   Min.   :  0.0  
##  1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0   1st Qu.: 66.0  
##  Median :102.00   Median :512.0   Median : 750.0   Median :101.0  
##  Mean   : 99.61   Mean   :501.6   Mean   : 735.6   Mean   :124.8  
##  3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0   3rd Qu.:156.0  
##  Max.   :264.00   Max.   :878.0   Max.   :1399.0   Max.   :697.0  
##                                   NA's   :102      NA's   :131    
##  TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
##  Min.   :  0.0   Min.   :29.00    Min.   : 1137   Min.   :  0.0   
##  1st Qu.: 38.0   1st Qu.:50.50    1st Qu.: 1419   1st Qu.: 50.0   
##  Median : 49.0   Median :58.00    Median : 1518   Median :107.0   
##  Mean   : 52.8   Mean   :59.36    Mean   : 1779   Mean   :105.7   
##  3rd Qu.: 62.0   3rd Qu.:67.00    3rd Qu.: 1682   3rd Qu.:150.0   
##  Max.   :201.0   Max.   :95.00    Max.   :30132   Max.   :343.0   
##  NA's   :772     NA's   :2085                                     
##  TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E  TEAM_FIELDING_DP
##  Min.   :   0.0   Min.   :    0.0   Min.   :  65.0   Min.   : 52.0   
##  1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0   1st Qu.:131.0   
##  Median : 536.5   Median :  813.5   Median : 159.0   Median :149.0   
##  Mean   : 553.0   Mean   :  817.7   Mean   : 246.5   Mean   :146.4   
##  3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2   3rd Qu.:164.0   
##  Max.   :3645.0   Max.   :19278.0   Max.   :1898.0   Max.   :228.0   
##                   NA's   :102                        NA's   :286
# Response variable on its own full-size panel
boxplot(tr$TARGET_WINS, xlab = "TARGET_WINS")

# One boxplot for each of columns 2 to 16, arranged on a 3 x 5 grid
par(mfrow = c(3, 5))

for (col_name in names(tr)[2:16]) {
  boxplot(tr[[col_name]], xlab = col_name)
}

# Back to one plot per page
par(mfrow = c(1, 1))

1.2 Relationships and Correlations with Response Variable (TARGET WINS)

#### Target Wins vs all variables
# Scatterplot of TARGET_WINS (y axis) against each of columns 2 to 16 (x axis),
# arranged on a 3 x 5 grid.
par(mfrow = c(3, 5))

for (col_name in names(tr)[2:16]) {
  plot(tr[[col_name]], tr$TARGET_WINS, xlab = col_name)
}

# Back to one plot per page
par(mfrow = c(1, 1))

# Correlation of every column with TARGET_WINS, read off the TARGET_WINS
# column of the full correlation matrix.
# NOTE(review): use = "complete.obs" keeps only rows with no NA in any column.
# TEAM_BATTING_HBP alone is NA in 2085 of the 2276 rows, so these values rest
# on at most 191 rows -- consider use = "pairwise.complete.obs".
cor(tr, use = "complete.obs")[, "TARGET_WINS"]
##      TARGET_WINS   TEAM_BATTING_H  TEAM_BATTING_2B  TEAM_BATTING_3B 
##       1.00000000       0.46994665       0.31298400      -0.12434586 
##  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB 
##       0.42241683       0.46868793      -0.22889273       0.01483639 
##  TEAM_BASERUN_CS TEAM_BATTING_HBP  TEAM_PITCHING_H TEAM_PITCHING_HR 
##      -0.17875598       0.07350424       0.47123431       0.42246683 
## TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP 
##       0.46839882      -0.22936481      -0.38668800      -0.19586601

1.3 Correlations Among Possible Predictor Variables

# Correlation matrix of the predictors only (column 1, TARGET_WINS, left out),
# rounded to one decimal place and drawn as a labelled heat map.
# NOTE(review): use = "complete.obs" drops every row with an NA in any column;
# with TEAM_BATTING_HBP NA in 2085 of 2276 rows this matrix is based on at
# most 191 rows -- consider use = "pairwise.complete.obs".
corr <- round(cor(tr[, -1], use = "complete.obs"), digits = 1)
ggcorrplot(corr, lab = TRUE)

# Echo the rounded matrix itself
corr
##                  TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## TEAM_BATTING_H              1.0             0.6             0.2
## TEAM_BATTING_2B             0.6             1.0             0.0
## TEAM_BATTING_3B             0.2             0.0             1.0
## TEAM_BATTING_HR             0.4             0.3            -0.2
## TEAM_BATTING_BB             0.2             0.2            -0.2
## TEAM_BATTING_SO            -0.3            -0.1            -0.2
## TEAM_BASERUN_SB             0.1            -0.2             0.2
## TEAM_BASERUN_CS            -0.1            -0.2             0.2
## TEAM_BATTING_HBP            0.0             0.0            -0.2
## TEAM_PITCHING_H             1.0             0.6             0.2
## TEAM_PITCHING_HR            0.4             0.2            -0.2
## TEAM_PITCHING_BB            0.2             0.2            -0.2
## TEAM_PITCHING_SO           -0.3            -0.1            -0.2
## TEAM_FIELDING_E            -0.3            -0.2            -0.1
## TEAM_FIELDING_DP            0.0             0.0             0.1
##                  TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## TEAM_BATTING_H               0.4             0.2            -0.3
## TEAM_BATTING_2B              0.3             0.2            -0.1
## TEAM_BATTING_3B             -0.2            -0.2            -0.2
## TEAM_BATTING_HR              1.0             0.5             0.2
## TEAM_BATTING_BB              0.5             1.0             0.2
## TEAM_BATTING_SO              0.2             0.2             1.0
## TEAM_BASERUN_SB             -0.2            -0.1            -0.1
## TEAM_BASERUN_CS             -0.3            -0.2            -0.1
## TEAM_BATTING_HBP             0.1             0.0             0.2
## TEAM_PITCHING_H              0.4             0.2            -0.3
## TEAM_PITCHING_HR             1.0             0.5             0.2
## TEAM_PITCHING_BB             0.5             1.0             0.2
## TEAM_PITCHING_SO             0.2             0.2             1.0
## TEAM_FIELDING_E              0.0            -0.1             0.3
## TEAM_FIELDING_DP            -0.1            -0.1            -0.1
##                  TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP
## TEAM_BATTING_H               0.1            -0.1              0.0
## TEAM_BATTING_2B             -0.2            -0.2              0.0
## TEAM_BATTING_3B              0.2             0.2             -0.2
## TEAM_BATTING_HR             -0.2            -0.3              0.1
## TEAM_BATTING_BB             -0.1            -0.2              0.0
## TEAM_BATTING_SO             -0.1            -0.1              0.2
## TEAM_BASERUN_SB              1.0             0.6             -0.1
## TEAM_BASERUN_CS              0.6             1.0             -0.1
## TEAM_BATTING_HBP            -0.1            -0.1              1.0
## TEAM_PITCHING_H              0.1            -0.1              0.0
## TEAM_PITCHING_HR            -0.2            -0.3              0.1
## TEAM_PITCHING_BB            -0.1            -0.2              0.0
## TEAM_PITCHING_SO            -0.1            -0.1              0.2
## TEAM_FIELDING_E              0.0             0.2              0.0
## TEAM_FIELDING_DP            -0.1             0.0             -0.1
##                  TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## TEAM_BATTING_H               1.0              0.4              0.2
## TEAM_BATTING_2B              0.6              0.2              0.2
## TEAM_BATTING_3B              0.2             -0.2             -0.2
## TEAM_BATTING_HR              0.4              1.0              0.5
## TEAM_BATTING_BB              0.2              0.5              1.0
## TEAM_BATTING_SO             -0.3              0.2              0.2
## TEAM_BASERUN_SB              0.1             -0.2             -0.1
## TEAM_BASERUN_CS             -0.1             -0.3             -0.2
## TEAM_BATTING_HBP             0.0              0.1              0.0
## TEAM_PITCHING_H              1.0              0.4              0.2
## TEAM_PITCHING_HR             0.4              1.0              0.5
## TEAM_PITCHING_BB             0.2              0.5              1.0
## TEAM_PITCHING_SO            -0.3              0.2              0.2
## TEAM_FIELDING_E             -0.3              0.0             -0.1
## TEAM_FIELDING_DP             0.0             -0.1             -0.1
##                  TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## TEAM_BATTING_H               -0.3            -0.3              0.0
## TEAM_BATTING_2B              -0.1            -0.2              0.0
## TEAM_BATTING_3B              -0.2            -0.1              0.1
## TEAM_BATTING_HR               0.2             0.0             -0.1
## TEAM_BATTING_BB               0.2            -0.1             -0.1
## TEAM_BATTING_SO               1.0             0.3             -0.1
## TEAM_BASERUN_SB              -0.1             0.0             -0.1
## TEAM_BASERUN_CS              -0.1             0.2              0.0
## TEAM_BATTING_HBP              0.2             0.0             -0.1
## TEAM_PITCHING_H              -0.3            -0.3              0.0
## TEAM_PITCHING_HR              0.2             0.0             -0.1
## TEAM_PITCHING_BB              0.2            -0.1             -0.1
## TEAM_PITCHING_SO              1.0             0.3             -0.1
## TEAM_FIELDING_E               0.3             1.0              0.0
## TEAM_FIELDING_DP             -0.1             0.0              1.0

Section 2 - Data Preparation

To prepare the data for modeling, variables will be added, removed, imputed, and treated for outliers.

# Start the prepared data set as a copy of tr so that tr itself stays unchanged
tr_prep <- tr

2.1 Add variable

A new variable, TEAM_TOTAL_BASES, will be added to represent the number of total bases, using a formula derived from https://baseballtips.com/glossary/index.html/

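Total Bases = [H + 2B + (2 X 3B) + (3 X HR)], where H already counts every hit (including doubles, triples and home runs) once, so only the extra bases of each extra-base hit are added.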

Below shows the first few rows after adding the new variable at the end:

# Add TEAM_TOTAL_BASES as the last column:
# hits + doubles + 2 * triples + 3 * home runs
tr_prep$TEAM_TOTAL_BASES <- with(
  tr_prep,
  TEAM_BATTING_H + TEAM_BATTING_2B + 2 * TEAM_BATTING_3B + 3 * TEAM_BATTING_HR
)

# Preview the first six rows, new column included
head(tr_prep)
##   TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1          39           1445             194              39
## 2          70           1339             219              22
## 3          86           1377             232              35
## 4          70           1387             209              38
## 5          82           1297             186              27
## 6          75           1279             200              36
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1              13             143             842              NA
## 2             190             685            1075              37
## 3             137             602             917              46
## 4              96             451             922              43
## 5             102             472             920              49
## 6              92             443             973             107
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1              NA               NA            9364               84
## 2              28               NA            1347              191
## 3              27               NA            1377              137
## 4              30               NA            1396               97
## 5              39               NA            1297              102
## 6              59               NA            1279               92
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              927             5456            1011               NA
## 2              689             1082             193              155
## 3              602              917             175              153
## 4              454              928             164              156
## 5              472              920             138              168
## 6              443              973             123              149
##   TEAM_TOTAL_BASES
## 1             1756
## 2             2172
## 3             2090
## 4             1960
## 5             1843
## 6             1827

2.2 Remove variables

Because almost 92% of its values are missing (2085 of 2276), TEAM_BATTING_HBP will be removed from the dataset.

Due to the addition of aggregate variable TEAM_TOTAL_BASES, the contributing variables will be removed: TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_3B, TEAM_BATTING_HR.

Below shows the dataset’s size and variables after this step:

# Drop the four hit-type columns that feed TEAM_TOTAL_BASES, together with
# TEAM_BATTING_HBP; the remaining columns keep their order.
tr_prep <- tr_prep %>%
  select(-c(
    TEAM_BATTING_H,
    TEAM_BATTING_2B,
    TEAM_BATTING_3B,
    TEAM_BATTING_HR,
    TEAM_BATTING_HBP
  ))

# Rows and columns left after the removal
dim(tr_prep)
## [1] 2276   12
names(tr_prep)
##  [1] "TARGET_WINS"      "TEAM_BATTING_BB"  "TEAM_BATTING_SO" 
##  [4] "TEAM_BASERUN_SB"  "TEAM_BASERUN_CS"  "TEAM_PITCHING_H" 
##  [7] "TEAM_PITCHING_HR" "TEAM_PITCHING_BB" "TEAM_PITCHING_SO"
## [10] "TEAM_FIELDING_E"  "TEAM_FIELDING_DP" "TEAM_TOTAL_BASES"

2.3 Impute missing data

Below are the variables with missing data which will be populated with the variable’s mean.

colnames(tr_prep)[colSums(is.na(tr_prep)) > 0]
## [1] "TEAM_BATTING_SO"  "TEAM_BASERUN_SB"  "TEAM_BASERUN_CS" 
## [4] "TEAM_PITCHING_SO" "TEAM_FIELDING_DP"

Below is a summary of the dataset after filling up the missing values:

# Mean imputation: in each of the five columns below, every NA is replaced by
# the mean of that column's non-missing values. Each expression only reads its
# own column, so all five can be filled in one mutate() call.
tr_prep <- mutate(
  tr_prep,
  TEAM_BASERUN_CS = replace(
    TEAM_BASERUN_CS,
    is.na(TEAM_BASERUN_CS),
    mean(TEAM_BASERUN_CS, na.rm = TRUE)
  ),
  TEAM_BASERUN_SB = replace(
    TEAM_BASERUN_SB,
    is.na(TEAM_BASERUN_SB),
    mean(TEAM_BASERUN_SB, na.rm = TRUE)
  ),
  TEAM_PITCHING_SO = replace(
    TEAM_PITCHING_SO,
    is.na(TEAM_PITCHING_SO),
    mean(TEAM_PITCHING_SO, na.rm = TRUE)
  ),
  TEAM_BATTING_SO = replace(
    TEAM_BATTING_SO,
    is.na(TEAM_BATTING_SO),
    mean(TEAM_BATTING_SO, na.rm = TRUE)
  ),
  TEAM_FIELDING_DP = replace(
    TEAM_FIELDING_DP,
    is.na(TEAM_FIELDING_DP),
    mean(TEAM_FIELDING_DP, na.rm = TRUE)
  )
)

# Summary of the data set after imputation
summary(tr_prep)
##   TARGET_WINS     TEAM_BATTING_BB TEAM_BATTING_SO  TEAM_BASERUN_SB
##  Min.   :  0.00   Min.   :  0.0   Min.   :   0.0   Min.   :  0.0  
##  1st Qu.: 71.00   1st Qu.:451.0   1st Qu.: 556.8   1st Qu.: 67.0  
##  Median : 82.00   Median :512.0   Median : 735.6   Median :106.0  
##  Mean   : 80.79   Mean   :501.6   Mean   : 735.6   Mean   :124.8  
##  3rd Qu.: 92.00   3rd Qu.:580.0   3rd Qu.: 925.0   3rd Qu.:151.0  
##  Max.   :146.00   Max.   :878.0   Max.   :1399.0   Max.   :697.0  
##  TEAM_BASERUN_CS  TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
##  Min.   :  0.00   Min.   : 1137   Min.   :  0.0    Min.   :   0.0  
##  1st Qu.: 44.00   1st Qu.: 1419   1st Qu.: 50.0    1st Qu.: 476.0  
##  Median : 52.80   Median : 1518   Median :107.0    Median : 536.5  
##  Mean   : 52.80   Mean   : 1779   Mean   :105.7    Mean   : 553.0  
##  3rd Qu.: 54.25   3rd Qu.: 1682   3rd Qu.:150.0    3rd Qu.: 611.0  
##  Max.   :201.00   Max.   :30132   Max.   :343.0    Max.   :3645.0  
##  TEAM_PITCHING_SO  TEAM_FIELDING_E  TEAM_FIELDING_DP TEAM_TOTAL_BASES
##  Min.   :    0.0   Min.   :  65.0   Min.   : 52.0    Min.   :1026    
##  1st Qu.:  626.0   1st Qu.: 127.0   1st Qu.:134.0    1st Qu.:1947    
##  Median :  817.7   Median : 159.0   Median :146.4    Median :2126    
##  Mean   :  817.7   Mean   : 246.5   Mean   :146.4    Mean   :2120    
##  3rd Qu.:  957.0   3rd Qu.: 249.2   3rd Qu.:161.2    3rd Qu.:2285    
##  Max.   :19278.0   Max.   :1898.0   Max.   :228.0    Max.   :3290

2.4 Cap Outliers

Observations below Q1 - 1.5 x IQR are capped at the 5th percentile, and observations above Q3 + 1.5 x IQR are capped at the 95th percentile. Reference: http://r-statistics.co/Outlier-Treatment-With-R.html

# Outlier Capping
#
# For every predictor (all columns except the first, TARGET_WINS):
#   * values below Q1 - 1.5 * IQR are replaced by the 5th percentile
#   * values above Q3 + 1.5 * IQR are replaced by the 95th percentile
# Quartiles, percentiles and IQR are all computed on the column before any
# value is replaced. The result is stored in tr_prep2; tr_prep is not changed.

tr_prep2 <- tr_prep

# Take the predictor columns from the data instead of a fixed 2:12 range, so
# the loop stays correct if columns are added or removed upstream.
for (col_name in names(tr_prep2)[-1]) {
  values <- tr_prep2[[col_name]]

  quartiles <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE)
  caps <- quantile(values, probs = c(0.05, 0.95), na.rm = TRUE)
  fence <- 1.5 * IQR(values, na.rm = TRUE)

  # Both masks are taken from the untouched values; a value moved to the 5th
  # percentile can never exceed the upper fence, so the order is irrelevant.
  too_low <- values < (quartiles[1] - fence)
  too_high <- values > (quartiles[2] + fence)

  values[too_low] <- caps[1]
  values[too_high] <- caps[2]

  tr_prep2[[col_name]] <- values
}

After these 4 data preparation tasks, the correlations among all candidate predictor variables are displayed below:

# Correlation matrix of the prepared predictors (column 1, TARGET_WINS, left
# out), rounded to one decimal place and drawn as a labelled heat map.
corr <- round(cor(tr_prep2[, -1]), digits = 1)
ggcorrplot(corr, lab = TRUE)

# Echo the rounded matrix itself
corr
##                  TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## TEAM_BATTING_BB              1.0             0.3            -0.1
## TEAM_BATTING_SO              0.3             1.0            -0.2
## TEAM_BASERUN_SB             -0.1            -0.2             1.0
## TEAM_BASERUN_CS             -0.1            -0.2             0.4
## TEAM_PITCHING_H             -0.4            -0.5             0.3
## TEAM_PITCHING_HR             0.5             0.6            -0.4
## TEAM_PITCHING_BB             0.8             0.0             0.1
## TEAM_PITCHING_SO             0.1             0.9            -0.1
## TEAM_FIELDING_E             -0.5            -0.6             0.5
## TEAM_FIELDING_DP             0.3             0.1            -0.4
## TEAM_TOTAL_BASES             0.3             0.1            -0.2
##                  TEAM_BASERUN_CS TEAM_PITCHING_H TEAM_PITCHING_HR
## TEAM_BATTING_BB             -0.1            -0.4              0.5
## TEAM_BATTING_SO             -0.2            -0.5              0.6
## TEAM_BASERUN_SB              0.4             0.3             -0.4
## TEAM_BASERUN_CS              1.0             0.0             -0.3
## TEAM_PITCHING_H              0.0             1.0             -0.2
## TEAM_PITCHING_HR            -0.3            -0.2              1.0
## TEAM_PITCHING_BB            -0.1             0.1              0.3
## TEAM_PITCHING_SO            -0.2            -0.2              0.5
## TEAM_FIELDING_E              0.1             0.7             -0.6
## TEAM_FIELDING_DP            -0.1            -0.1              0.4
## TEAM_TOTAL_BASES            -0.2             0.2              0.7
##                  TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## TEAM_BATTING_BB               0.8              0.1            -0.5
## TEAM_BATTING_SO               0.0              0.9            -0.6
## TEAM_BASERUN_SB               0.1             -0.1             0.5
## TEAM_BASERUN_CS              -0.1             -0.2             0.1
## TEAM_PITCHING_H               0.1             -0.2             0.7
## TEAM_PITCHING_HR              0.3              0.5            -0.6
## TEAM_PITCHING_BB              1.0              0.1            -0.1
## TEAM_PITCHING_SO              0.1              1.0            -0.3
## TEAM_FIELDING_E              -0.1             -0.3             1.0
## TEAM_FIELDING_DP              0.3              0.0            -0.3
## TEAM_TOTAL_BASES              0.3              0.1            -0.2
##                  TEAM_FIELDING_DP TEAM_TOTAL_BASES
## TEAM_BATTING_BB               0.3              0.3
## TEAM_BATTING_SO               0.1              0.1
## TEAM_BASERUN_SB              -0.4             -0.2
## TEAM_BASERUN_CS              -0.1             -0.2
## TEAM_PITCHING_H              -0.1              0.2
## TEAM_PITCHING_HR              0.4              0.7
## TEAM_PITCHING_BB              0.3              0.3
## TEAM_PITCHING_SO              0.0              0.1
## TEAM_FIELDING_E              -0.3             -0.2
## TEAM_FIELDING_DP              1.0              0.4
## TEAM_TOTAL_BASES              0.4              1.0

Section 3 - Build Models

3.1 All variables Included

Below shows the summary, vif and diagnostics plot when all variables are included.

### Model 1 - All variables included
# Linear model of TARGET_WINS on every other column of tr_prep2
# (the `.` on the right-hand side stands for all remaining columns).
model1 <- lm(TARGET_WINS ~., data = tr_prep2)
# Coefficient table, residual quantiles and overall fit statistics
summary(model1)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = tr_prep2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.040  -8.368   0.167   8.333  64.423 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      30.947444   5.025463   6.158 8.68e-10 ***
## TEAM_BATTING_BB   0.053735   0.008897   6.040 1.80e-09 ***
## TEAM_BATTING_SO  -0.015663   0.005052  -3.100  0.00196 ** 
## TEAM_BASERUN_SB   0.060483   0.006071   9.962  < 2e-16 ***
## TEAM_BASERUN_CS  -0.058439   0.023006  -2.540  0.01115 *  
## TEAM_PITCHING_H   0.008907   0.002095   4.253 2.20e-05 ***
## TEAM_PITCHING_HR -0.004984   0.012015  -0.415  0.67832    
## TEAM_PITCHING_BB -0.034361   0.007981  -4.305 1.74e-05 ***
## TEAM_PITCHING_SO  0.005110   0.004562   1.120  0.26285    
## TEAM_FIELDING_E  -0.022199   0.002982  -7.444 1.38e-13 ***
## TEAM_FIELDING_DP -0.109811   0.014798  -7.421 1.64e-13 ***
## TEAM_TOTAL_BASES  0.024669   0.002417  10.208  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.43 on 2264 degrees of freedom
## Multiple R-squared:  0.2765, Adjusted R-squared:  0.273 
## F-statistic: 78.66 on 11 and 2264 DF,  p-value: < 2.2e-16
vif(model1)
##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS 
##        11.582096        17.849283         2.294852         1.326458 
##  TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO 
##         6.475854         6.725598         8.705859        13.284509 
##  TEAM_FIELDING_E TEAM_FIELDING_DP TEAM_TOTAL_BASES 
##         4.325928         1.419217         4.555244
# Draw the diagnostic plots for model1 on a 2 x 2 grid
par(mfrow=c(2,2))
plot(model1)

# Back to one plot per page
par(mfrow=c(1,1))

3.2 Excludes variables based on possible Multicollinearity

Below shows the summary, vif and diagnostics plot when TEAM_BATTING_SO, TEAM_PITCHING_BB, TEAM_PITCHING_H, TEAM_PITCHING_HR variables are excluded.

### Model 2 - Excludes multicollinearity
# Same formula as Model 1 with four predictors subtracted. Their VIFs in
# Model 1 were about 17.8 (TEAM_BATTING_SO), 8.7 (TEAM_PITCHING_BB),
# 6.5 (TEAM_PITCHING_H) and 6.7 (TEAM_PITCHING_HR).
model2 <- lm(TARGET_WINS ~ .
             - TEAM_BATTING_SO
             - TEAM_PITCHING_BB
             - TEAM_PITCHING_H
             - TEAM_PITCHING_HR, data = tr_prep2)

# Coefficient table, residual quantiles and overall fit statistics
summary(model2)
## 
## Call:
## lm(formula = TARGET_WINS ~ . - TEAM_BATTING_SO - TEAM_PITCHING_BB - 
##     TEAM_PITCHING_H - TEAM_PITCHING_HR, data = tr_prep2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.259  -8.687   0.165   8.520  58.288 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      35.777053   3.731338   9.588  < 2e-16 ***
## TEAM_BATTING_BB   0.014466   0.003454   4.189 2.91e-05 ***
## TEAM_BASERUN_SB   0.051170   0.005770   8.869  < 2e-16 ***
## TEAM_BASERUN_CS  -0.030529   0.022776  -1.340     0.18    
## TEAM_PITCHING_SO -0.009154   0.001334  -6.864 8.59e-12 ***
## TEAM_FIELDING_E  -0.010721   0.002169  -4.944 8.22e-07 ***
## TEAM_FIELDING_DP -0.123429   0.014658  -8.421  < 2e-16 ***
## TEAM_TOTAL_BASES  0.028888   0.001281  22.559  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.57 on 2268 degrees of freedom
## Multiple R-squared:  0.2604, Adjusted R-squared:  0.2581 
## F-statistic:   114 on 7 and 2268 DF,  p-value: < 2.2e-16
vif(model2)
##  TEAM_BATTING_BB  TEAM_BASERUN_SB  TEAM_BASERUN_CS TEAM_PITCHING_SO 
##         1.710103         2.030921         1.273957         1.112149 
##  TEAM_FIELDING_E TEAM_FIELDING_DP TEAM_TOTAL_BASES 
##         2.241362         1.364467         1.253293
# Draw the diagnostic plots for model2 on a 2 x 2 grid
par(mfrow=c(2,2))
plot(model2)

# Back to one plot per page
par(mfrow=c(1,1))

3.3 Excludes variable based on insignificant P-value

Below shows the summary, vif and diagnostics plot when TEAM_BASERUN_CS variable is excluded.

### Model 3 - Excludes Insignificant variables
# Model 2 with TEAM_BASERUN_CS also subtracted; it was the only Model 2 term
# without a significance star (p = 0.18).
model3 <- lm(TARGET_WINS ~ .
             - TEAM_BATTING_SO
             - TEAM_PITCHING_BB
             - TEAM_PITCHING_H
             - TEAM_PITCHING_HR
             - TEAM_BASERUN_CS, data = tr_prep2)

# Coefficient table, residual quantiles and overall fit statistics
summary(model3)
## 
## Call:
## lm(formula = TARGET_WINS ~ . - TEAM_BATTING_SO - TEAM_PITCHING_BB - 
##     TEAM_PITCHING_H - TEAM_PITCHING_HR - TEAM_BASERUN_CS, data = tr_prep2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.045  -8.733   0.227   8.558  58.198 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      33.637350   3.373213   9.972  < 2e-16 ***
## TEAM_BATTING_BB   0.015397   0.003384   4.551 5.63e-06 ***
## TEAM_BASERUN_SB   0.047982   0.005257   9.127  < 2e-16 ***
## TEAM_PITCHING_SO -0.008846   0.001314  -6.733 2.10e-11 ***
## TEAM_FIELDING_E  -0.009889   0.002078  -4.758 2.07e-06 ***
## TEAM_FIELDING_DP -0.124931   0.014617  -8.547  < 2e-16 ***
## TEAM_TOTAL_BASES  0.028997   0.001278  22.687  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.57 on 2269 degrees of freedom
## Multiple R-squared:  0.2598, Adjusted R-squared:  0.2578 
## F-statistic: 132.7 on 6 and 2269 DF,  p-value: < 2.2e-16
vif(model3)
##  TEAM_BATTING_BB  TEAM_BASERUN_SB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##         1.640901         1.685747         1.079243         2.058003 
## TEAM_FIELDING_DP TEAM_TOTAL_BASES 
##         1.356490         1.248182
# Draw the diagnostic plots for model3 on a 2 x 2 grid
par(mfrow=c(2,2))
plot(model3)

# Back to one plot per page
par(mfrow=c(1,1))

Section 4 - Select Model

Of the 3 models, the 3rd one will be selected even though its R-squared value is not the highest, because possible multicollinearity is addressed and all included variables appear to contribute significantly to the model. The predictor coefficients make sense.

The TARGET_WINS of the evaluation data set will be predicted using this Model 3.