Introduction

Loading necessary packages

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(corrplot)
## corrplot 0.84 loaded
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.5.2
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(psych)
## Warning: package 'psych' was built under R version 3.5.2
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following object is masked from 'package:car':
## 
##     logit
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library (MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(faraway)
## 
## Attaching package: 'faraway'
## The following object is masked from 'package:psych':
## 
##     logit
## The following objects are masked from 'package:survival':
## 
##     rats, solder
## The following object is masked from 'package:lattice':
## 
##     melanoma
## The following objects are masked from 'package:car':
## 
##     logit, vif
# install.packages("faraway", dependencies = TRUE)

1-2. Data Exploration and Data Preparation

  1. Mean/St dev/ Median
# Absolute path of the training CSV.
train_data <- "/Users/Olga/Desktop/DataMining/assignment1/moneyball-training-data.csv"
# Read the file with a header row, keep strings as character vectors and
# decode the file as latin1.
moneyball_data <- read.csv(
  file = train_data,
  header = TRUE,
  stringsAsFactors = FALSE,
  fileEncoding = "latin1"
)
# Preview the first six rows.
head(moneyball_data, n = 6L)
##   INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1     1          39           1445             194              39
## 2     2          70           1339             219              22
## 3     3          86           1377             232              35
## 4     4          70           1387             209              38
## 5     5          82           1297             186              27
## 6     6          75           1279             200              36
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1              13             143             842              NA
## 2             190             685            1075              37
## 3             137             602             917              46
## 4              96             451             922              43
## 5             102             472             920              49
## 6              92             443             973             107
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1              NA               NA            9364               84
## 2              28               NA            1347              191
## 3              27               NA            1377              137
## 4              30               NA            1396               97
## 5              39               NA            1297              102
## 6              59               NA            1279               92
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              927             5456            1011               NA
## 2              689             1082             193              155
## 3              602              917             175              153
## 4              454              928             164              156
## 5              472              920             138              168
## 6              443              973             123              149
# List the column names of the raw data frame.
names(moneyball_data)
##  [1] "INDEX"            "TARGET_WINS"      "TEAM_BATTING_H"  
##  [4] "TEAM_BATTING_2B"  "TEAM_BATTING_3B"  "TEAM_BATTING_HR" 
##  [7] "TEAM_BATTING_BB"  "TEAM_BATTING_SO"  "TEAM_BASERUN_SB" 
## [10] "TEAM_BASERUN_CS"  "TEAM_BATTING_HBP" "TEAM_PITCHING_H" 
## [13] "TEAM_PITCHING_HR" "TEAM_PITCHING_BB" "TEAM_PITCHING_SO"
## [16] "TEAM_FIELDING_E"  "TEAM_FIELDING_DP"
# Number of rows and columns of the raw data frame.
c(nrow(moneyball_data), ncol(moneyball_data))
## [1] 2276   17
# Per-column quartiles, mean and NA counts.
summary(object = moneyball_data)
##      INDEX         TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B
##  Min.   :   1.0   Min.   :  0.00   Min.   : 891   Min.   : 69.0  
##  1st Qu.: 630.8   1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0  
##  Median :1270.5   Median : 82.00   Median :1454   Median :238.0  
##  Mean   :1268.5   Mean   : 80.79   Mean   :1469   Mean   :241.2  
##  3rd Qu.:1915.5   3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0  
##  Max.   :2535.0   Max.   :146.00   Max.   :2554   Max.   :458.0  
##                                                                  
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO 
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.: 34.00   1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0  
##  Median : 47.00   Median :102.00   Median :512.0   Median : 750.0  
##  Mean   : 55.25   Mean   : 99.61   Mean   :501.6   Mean   : 735.6  
##  3rd Qu.: 72.00   3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0  
##  Max.   :223.00   Max.   :264.00   Max.   :878.0   Max.   :1399.0  
##                                                    NA's   :102     
##  TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
##  Min.   :  0.0   Min.   :  0.0   Min.   :29.00    Min.   : 1137  
##  1st Qu.: 66.0   1st Qu.: 38.0   1st Qu.:50.50    1st Qu.: 1419  
##  Median :101.0   Median : 49.0   Median :58.00    Median : 1518  
##  Mean   :124.8   Mean   : 52.8   Mean   :59.36    Mean   : 1779  
##  3rd Qu.:156.0   3rd Qu.: 62.0   3rd Qu.:67.00    3rd Qu.: 1682  
##  Max.   :697.0   Max.   :201.0   Max.   :95.00    Max.   :30132  
##  NA's   :131     NA's   :772     NA's   :2085                    
##  TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##  Min.   :  0.0    Min.   :   0.0   Min.   :    0.0   Min.   :  65.0  
##  1st Qu.: 50.0    1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0  
##  Median :107.0    Median : 536.5   Median :  813.5   Median : 159.0  
##  Mean   :105.7    Mean   : 553.0   Mean   :  817.7   Mean   : 246.5  
##  3rd Qu.:150.0    3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2  
##  Max.   :343.0    Max.   :3645.0   Max.   :19278.0   Max.   :1898.0  
##                                    NA's   :102                       
##  TEAM_FIELDING_DP
##  Min.   : 52.0   
##  1st Qu.:131.0   
##  Median :149.0   
##  Mean   :146.4   
##  3rd Qu.:164.0   
##  Max.   :228.0   
##  NA's   :286
# Column types and the first few values of each column.
str(object = moneyball_data)
## 'data.frame':    2276 obs. of  17 variables:
##  $ INDEX           : int  1 2 3 4 5 6 7 8 11 12 ...
##  $ TARGET_WINS     : int  39 70 86 70 82 75 80 85 86 76 ...
##  $ TEAM_BATTING_H  : int  1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
##  $ TEAM_BATTING_2B : int  194 219 232 209 186 200 179 171 197 213 ...
##  $ TEAM_BATTING_3B : int  39 22 35 38 27 36 54 37 40 18 ...
##  $ TEAM_BATTING_HR : int  13 190 137 96 102 92 122 115 114 96 ...
##  $ TEAM_BATTING_BB : int  143 685 602 451 472 443 525 456 447 441 ...
##  $ TEAM_BATTING_SO : int  842 1075 917 922 920 973 1062 1027 922 827 ...
##  $ TEAM_BASERUN_SB : int  NA 37 46 43 49 107 80 40 69 72 ...
##  $ TEAM_BASERUN_CS : int  NA 28 27 30 39 59 54 36 27 34 ...
##  $ TEAM_BATTING_HBP: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ TEAM_PITCHING_H : int  9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
##  $ TEAM_PITCHING_HR: int  84 191 137 97 102 92 122 116 114 96 ...
##  $ TEAM_PITCHING_BB: int  927 689 602 454 472 443 525 459 447 441 ...
##  $ TEAM_PITCHING_SO: int  5456 1082 917 928 920 973 1062 1033 922 827 ...
##  $ TEAM_FIELDING_E : int  1011 193 175 164 138 123 136 112 127 131 ...
##  $ TEAM_FIELDING_DP: int  NA 155 153 156 168 149 186 136 169 159 ...
  1. missing variables and missing values handling process
# Percentage of missing values per column: is.na() flags the NA cells and
# colMeans() gives the share of flagged cells in each column.
colMeans(is.na(moneyball_data)) * 100
##            INDEX      TARGET_WINS   TEAM_BATTING_H  TEAM_BATTING_2B 
##         0.000000         0.000000         0.000000         0.000000 
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO 
##         0.000000         0.000000         0.000000         4.481547 
##  TEAM_BASERUN_SB  TEAM_BASERUN_CS TEAM_BATTING_HBP  TEAM_PITCHING_H 
##         5.755712        33.919156        91.608084         0.000000 
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##         0.000000         0.000000         4.481547         0.000000 
## TEAM_FIELDING_DP 
##        12.565905

Removing the “INDEX” and “TEAM_BATTING_HBP” columns; “TEAM_BATTING_HBP” has about 92% missing values.

# Drop the INDEX column from the raw data frame, overwriting it.
moneyball_data <- subset(moneyball_data, select = -INDEX)
# Working copy without TEAM_BATTING_HBP; moneyball_data keeps that column.
moneyball <- subset(moneyball_data, select = -TEAM_BATTING_HBP)

Replacing the remaining missing values with the column mean.

# Replace the missing entries of a vector with the mean of its observed values.
#
# x: a vector. It is converted to character and then to numeric, so entries
#    that cannot be parsed as numbers become NA and are filled as well.
# Returns a numeric vector of the same length as x. When x has no observed
# value at all the mean is NaN, so the entries are filled with NaN.
replace_mean <- function(x) {
  x <- as.numeric(as.character(x))
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
}
# Impute every column with replace_mean(). lapply() walks the columns of the
# data frame directly, so the frame is not first coerced to a matrix the way
# apply() would do; the resulting list is turned back into a data frame.
moneyball_filled <- as.data.frame(lapply(moneyball, replace_mean))
  1. Bar Chart/ Box Plot/ Histogram
# Density plot of every variable, drawn on one 3 x 5 grid and titled with the
# column name.
density_vars <- c(
  "TARGET_WINS",
  "TEAM_BATTING_H", "TEAM_BATTING_2B", "TEAM_BATTING_3B",
  "TEAM_BATTING_HR", "TEAM_BATTING_BB", "TEAM_BATTING_SO",
  "TEAM_BASERUN_SB", "TEAM_BASERUN_CS",
  "TEAM_PITCHING_H", "TEAM_PITCHING_HR",
  "TEAM_PITCHING_BB", "TEAM_PITCHING_SO",
  "TEAM_FIELDING_E", "TEAM_FIELDING_DP"
)
# par() returns the previous settings, which are restored after the loop so
# that later plots are not drawn into the 3 x 5 grid.
old_par <- par(mfrow = c(3, 5))
for (var_name in density_vars) {
  plot(density(moneyball_filled[[var_name]]), main = var_name)
}
par(old_par)

Target variable plot

# Scatter of TARGET_WINS against row position.
plot(moneyball_filled$TARGET_WINS)

# Histogram of TARGET_WINS.
hist(moneyball_filled$TARGET_WINS)

# Box plot of TARGET_WINS.
boxplot(moneyball_filled$TARGET_WINS)

# Minimum, quartiles, mean and maximum of TARGET_WINS.
summary(moneyball_filled$TARGET_WINS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   71.00   82.00   80.79   92.00  146.00
  1. correlation variables to each other and to target

Checking for dependencies between variables. As all variables are numeric, we will rely on correlation.

# Pairwise Pearson correlations between all columns, target included.
corr_moneyball <- cor(x = moneyball_filled, method = "pearson")
# Show the matrix rounded to three decimals.
round(corr_moneyball, 3)
##                  TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## TARGET_WINS            1.000          0.389           0.289
## TEAM_BATTING_H         0.389          1.000           0.563
## TEAM_BATTING_2B        0.289          0.563           1.000
## TEAM_BATTING_3B        0.143          0.428          -0.107
## TEAM_BATTING_HR        0.176         -0.007           0.435
## TEAM_BATTING_BB        0.233         -0.072           0.256
## TEAM_BATTING_SO       -0.031         -0.451           0.155
## TEAM_BASERUN_SB        0.123          0.114          -0.190
## TEAM_BASERUN_CS        0.016          0.012          -0.074
## TEAM_PITCHING_H       -0.110          0.303           0.024
## TEAM_PITCHING_HR       0.189          0.073           0.455
## TEAM_PITCHING_BB       0.124          0.094           0.178
## TEAM_PITCHING_SO      -0.076         -0.245           0.062
## TEAM_FIELDING_E       -0.176          0.265          -0.235
## TEAM_FIELDING_DP      -0.029          0.115           0.263
##                  TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
## TARGET_WINS                0.143           0.176           0.233
## TEAM_BATTING_H             0.428          -0.007          -0.072
## TEAM_BATTING_2B           -0.107           0.435           0.256
## TEAM_BATTING_3B            1.000          -0.636          -0.287
## TEAM_BATTING_HR           -0.636           1.000           0.514
## TEAM_BATTING_BB           -0.287           0.514           1.000
## TEAM_BATTING_SO           -0.657           0.697           0.373
## TEAM_BASERUN_SB            0.501          -0.428          -0.082
## TEAM_BASERUN_CS            0.195          -0.291          -0.085
## TEAM_PITCHING_H            0.195          -0.250          -0.450
## TEAM_PITCHING_HR          -0.568           0.969           0.460
## TEAM_PITCHING_BB          -0.002           0.137           0.489
## TEAM_PITCHING_SO          -0.254           0.177          -0.020
## TEAM_FIELDING_E            0.510          -0.587          -0.656
## TEAM_FIELDING_DP          -0.246           0.406           0.340
##                  TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## TARGET_WINS               -0.031           0.123           0.016
## TEAM_BATTING_H            -0.451           0.114           0.012
## TEAM_BATTING_2B            0.155          -0.190          -0.074
## TEAM_BATTING_3B           -0.657           0.501           0.195
## TEAM_BATTING_HR            0.697          -0.428          -0.291
## TEAM_BATTING_BB            0.373          -0.082          -0.085
## TEAM_BATTING_SO            1.000          -0.232          -0.157
## TEAM_BASERUN_SB           -0.232           1.000           0.279
## TEAM_BASERUN_CS           -0.157           0.279           1.000
## TEAM_PITCHING_H           -0.375           0.061          -0.037
## TEAM_PITCHING_HR           0.637          -0.398          -0.290
## TEAM_PITCHING_BB           0.037           0.119          -0.054
## TEAM_PITCHING_SO           0.416          -0.055          -0.069
## TEAM_FIELDING_E           -0.583           0.369           0.024
## TEAM_FIELDING_DP           0.131          -0.302          -0.140
##                  TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## TARGET_WINS               -0.110            0.189            0.124
## TEAM_BATTING_H             0.303            0.073            0.094
## TEAM_BATTING_2B            0.024            0.455            0.178
## TEAM_BATTING_3B            0.195           -0.568           -0.002
## TEAM_BATTING_HR           -0.250            0.969            0.137
## TEAM_BATTING_BB           -0.450            0.460            0.489
## TEAM_BATTING_SO           -0.375            0.637            0.037
## TEAM_BASERUN_SB            0.061           -0.398            0.119
## TEAM_BASERUN_CS           -0.037           -0.290           -0.054
## TEAM_PITCHING_H            1.000           -0.142            0.321
## TEAM_PITCHING_HR          -0.142            1.000            0.222
## TEAM_PITCHING_BB           0.321            0.222            1.000
## TEAM_PITCHING_SO           0.267            0.196            0.482
## TEAM_FIELDING_E            0.668           -0.493           -0.023
## TEAM_FIELDING_DP          -0.058            0.401            0.188
##                  TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## TARGET_WINS                -0.076          -0.176           -0.029
## TEAM_BATTING_H             -0.245           0.265            0.115
## TEAM_BATTING_2B             0.062          -0.235            0.263
## TEAM_BATTING_3B            -0.254           0.510           -0.246
## TEAM_BATTING_HR             0.177          -0.587            0.406
## TEAM_BATTING_BB            -0.020          -0.656            0.340
## TEAM_BATTING_SO             0.416          -0.583            0.131
## TEAM_BASERUN_SB            -0.055           0.369           -0.302
## TEAM_BASERUN_CS            -0.069           0.024           -0.140
## TEAM_PITCHING_H             0.267           0.668           -0.058
## TEAM_PITCHING_HR            0.196          -0.493            0.401
## TEAM_PITCHING_BB            0.482          -0.023            0.188
## TEAM_PITCHING_SO            1.000          -0.023            0.009
## TEAM_FIELDING_E            -0.023           1.000           -0.253
## TEAM_FIELDING_DP            0.009          -0.253            1.000
# Draw the correlation matrix with circles.
corrplot(corr = corr_moneyball, method = "circle")

Possible variables to remove: TEAM_FIELDING_E, TEAM_BASERUN_CS, TEAM_BATTING_SO, TEAM_BATTING_3B, TEAM_BATTING_HR

3. Modeling

Backward elimination method:

# Starting point of the backward elimination: all 14 predictors.
model <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
##     TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, 
##     data = moneyball_filled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.994  -8.576   0.136   8.345  58.628 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.502e+01  5.397e+00   4.636 3.75e-06 ***
## TEAM_BATTING_H    4.824e-02  3.687e-03  13.085  < 2e-16 ***
## TEAM_BATTING_2B  -2.006e-02  9.152e-03  -2.192 0.028486 *  
## TEAM_BATTING_3B   6.047e-02  1.676e-02   3.608 0.000315 ***
## TEAM_BATTING_HR   5.299e-02  2.743e-02   1.932 0.053488 .  
## TEAM_BATTING_BB   1.042e-02  5.818e-03   1.790 0.073544 .  
## TEAM_BATTING_SO  -9.349e-03  2.551e-03  -3.665 0.000253 ***
## TEAM_BASERUN_SB   2.949e-02  4.462e-03   6.610 4.78e-11 ***
## TEAM_BASERUN_CS  -1.188e-02  1.614e-02  -0.736 0.461905    
## TEAM_PITCHING_H  -7.342e-04  3.676e-04  -1.997 0.045946 *  
## TEAM_PITCHING_HR  1.480e-02  2.432e-02   0.609 0.542877    
## TEAM_PITCHING_BB  8.891e-05  4.145e-03   0.021 0.982891    
## TEAM_PITCHING_SO  2.843e-03  9.187e-04   3.095 0.001994 ** 
## TEAM_FIELDING_E  -2.112e-02  2.480e-03  -8.516  < 2e-16 ***
## TEAM_FIELDING_DP -1.210e-01  1.302e-02  -9.297  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.04 on 2261 degrees of freedom
## Multiple R-squared:  0.3189, Adjusted R-squared:  0.3147 
## F-statistic: 75.63 on 14 and 2261 DF,  p-value: < 2.2e-16

Removing TEAM_PITCHING_BB

# Same model without TEAM_PITCHING_BB.
model <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
##     TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, data = moneyball_filled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.994  -8.576   0.136   8.345  58.626 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      25.0145796  5.3904993   4.640 3.67e-06 ***
## TEAM_BATTING_H    0.0482393  0.0036807  13.106  < 2e-16 ***
## TEAM_BATTING_2B  -0.0200575  0.0091490  -2.192 0.028457 *  
## TEAM_BATTING_3B   0.0604730  0.0167556   3.609 0.000314 ***
## TEAM_BATTING_HR   0.0527106  0.0240710   2.190 0.028641 *  
## TEAM_BATTING_BB   0.0105175  0.0033664   3.124 0.001805 ** 
## TEAM_BATTING_SO  -0.0093631  0.0024585  -3.809 0.000144 ***
## TEAM_BASERUN_SB   0.0295055  0.0044087   6.693 2.76e-11 ***
## TEAM_BASERUN_CS  -0.0118872  0.0161276  -0.737 0.461155    
## TEAM_PITCHING_H  -0.0007306  0.0003283  -2.225 0.026147 *  
## TEAM_PITCHING_HR  0.0150659  0.0209923   0.718 0.473025    
## TEAM_PITCHING_SO  0.0028567  0.0006717   4.253 2.20e-05 ***
## TEAM_FIELDING_E  -0.0211192  0.0024784  -8.521  < 2e-16 ***
## TEAM_FIELDING_DP -0.1210298  0.0130139  -9.300  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.04 on 2262 degrees of freedom
## Multiple R-squared:  0.3189, Adjusted R-squared:  0.315 
## F-statistic: 81.49 on 13 and 2262 DF,  p-value: < 2.2e-16

Removing TEAM_BASERUN_CS

# Same model without TEAM_BASERUN_CS.
model <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP, data = moneyball_filled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.905  -8.584   0.124   8.406  58.593 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      24.2348098  5.2851330   4.585 4.78e-06 ***
## TEAM_BATTING_H    0.0482055  0.0036800  13.099  < 2e-16 ***
## TEAM_BATTING_2B  -0.0203302  0.0091405  -2.224 0.026235 *  
## TEAM_BATTING_3B   0.0608466  0.0167463   3.633 0.000286 ***
## TEAM_BATTING_HR   0.0543985  0.0239594   2.270 0.023274 *  
## TEAM_BATTING_BB   0.0107643  0.0033494   3.214 0.001328 ** 
## TEAM_BATTING_SO  -0.0093418  0.0024580  -3.800 0.000148 ***
## TEAM_BASERUN_SB   0.0287600  0.0042906   6.703 2.57e-11 ***
## TEAM_PITCHING_H  -0.0007390  0.0003281  -2.253 0.024372 *  
## TEAM_PITCHING_HR  0.0147103  0.0209846   0.701 0.483372    
## TEAM_PITCHING_SO  0.0028640  0.0006716   4.265 2.08e-05 ***
## TEAM_FIELDING_E  -0.0207217  0.0024188  -8.567  < 2e-16 ***
## TEAM_FIELDING_DP -0.1211603  0.0130114  -9.312  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.04 on 2263 degrees of freedom
## Multiple R-squared:  0.3188, Adjusted R-squared:  0.3152 
## F-statistic: 88.25 on 12 and 2263 DF,  p-value: < 2.2e-16

Removing TEAM_PITCHING_HR

# Same model without TEAM_PITCHING_HR.
model <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB +
    TEAM_PITCHING_H + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_FIELDING_DP, data = moneyball_filled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.899  -8.568   0.091   8.397  58.651 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      23.6666983  5.2220414   4.532 6.14e-06 ***
## TEAM_BATTING_H    0.0484570  0.0036621  13.232  < 2e-16 ***
## TEAM_BATTING_2B  -0.0205123  0.0091358  -2.245 0.024847 *  
## TEAM_BATTING_3B   0.0624661  0.0165843   3.767 0.000170 ***
## TEAM_BATTING_HR   0.0697785  0.0096266   7.249 5.75e-13 ***
## TEAM_BATTING_BB   0.0107446  0.0033489   3.208 0.001354 ** 
## TEAM_BATTING_SO  -0.0093019  0.0024571  -3.786 0.000157 ***
## TEAM_BASERUN_SB   0.0287708  0.0042901   6.706 2.51e-11 ***
## TEAM_PITCHING_H  -0.0006920  0.0003211  -2.155 0.031253 *  
## TEAM_PITCHING_SO  0.0028867  0.0006707   4.304 1.75e-05 ***
## TEAM_FIELDING_E  -0.0205973  0.0024120  -8.540  < 2e-16 ***
## TEAM_FIELDING_DP -0.1210083  0.0130082  -9.302  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.03 on 2264 degrees of freedom
## Multiple R-squared:  0.3186, Adjusted R-squared:  0.3153 
## F-statistic: 96.25 on 11 and 2264 DF,  p-value: < 2.2e-16

Checking for non-linearity

# Component + residual plots of the current model (car package).
car::crPlots(model)

TEAM_PITCHING_H and TEAM_PITCHING_SO did not pass the check for non-linearity.

Let’s transform TEAM_PITCHING_H, TEAM_PITCHING_SO and re-fit the model

# Overwrite the two pitching columns with log10(value + 0.1). The columns are
# replaced in place, so running this step again applies the log a second time.
log_cols <- c("TEAM_PITCHING_H", "TEAM_PITCHING_SO")
moneyball_filled[log_cols] <- lapply(
  moneyball_filled[log_cols],
  function(v) log10(v + 0.1)
)
# Refit the same 11-predictor model on the transformed columns.
model <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB +
    TEAM_PITCHING_H + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_FIELDING_DP, data = moneyball_filled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -53.500  -8.353   0.050   8.276  63.152 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -18.385299  13.648651  -1.347 0.178102    
## TEAM_BATTING_H     0.041874   0.003784  11.065  < 2e-16 ***
## TEAM_BATTING_2B   -0.020476   0.009106  -2.249 0.024630 *  
## TEAM_BATTING_3B    0.087638   0.016862   5.197 2.20e-07 ***
## TEAM_BATTING_HR    0.058540   0.009697   6.037 1.83e-09 ***
## TEAM_BATTING_BB    0.012944   0.003388   3.821 0.000137 ***
## TEAM_BATTING_SO   -0.001186   0.002534  -0.468 0.639742    
## TEAM_BASERUN_SB    0.031437   0.004300   7.311 3.65e-13 ***
## TEAM_PITCHING_H   17.140905   4.594173   3.731 0.000195 ***
## TEAM_PITCHING_SO  -2.656620   0.914734  -2.904 0.003717 ** 
## TEAM_FIELDING_E   -0.030455   0.002954 -10.309  < 2e-16 ***
## TEAM_FIELDING_DP  -0.120505   0.013004  -9.267  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.03 on 2264 degrees of freedom
## Multiple R-squared:  0.319,  Adjusted R-squared:  0.3157 
## F-statistic: 96.42 on 11 and 2264 DF,  p-value: < 2.2e-16
# Component + residual plots of the refitted model (car package).
car::crPlots(model)

Removing TEAM_BATTING_SO with p-value > 0.05

# Same model without TEAM_BATTING_SO.
model <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB +
    TEAM_BASERUN_SB +
    TEAM_PITCHING_H + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, 
##     data = moneyball_filled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -53.382  -8.328   0.025   8.211  62.933 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -20.516367  12.864856  -1.595 0.110905    
## TEAM_BATTING_H     0.042628   0.003424  12.448  < 2e-16 ***
## TEAM_BATTING_2B   -0.021583   0.008792  -2.455 0.014167 *  
## TEAM_BATTING_3B    0.089227   0.016513   5.403 7.23e-08 ***
## TEAM_BATTING_HR    0.055774   0.007688   7.255 5.50e-13 ***
## TEAM_BATTING_BB    0.013293   0.003304   4.023 5.93e-05 ***
## TEAM_BASERUN_SB    0.030879   0.004130   7.476 1.09e-13 ***
## TEAM_PITCHING_H   17.440250   4.548668   3.834 0.000129 ***
## TEAM_PITCHING_SO  -2.847123   0.819083  -3.476 0.000519 ***
## TEAM_FIELDING_E   -0.030494   0.002953 -10.328  < 2e-16 ***
## TEAM_FIELDING_DP  -0.119878   0.012932  -9.270  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.03 on 2265 degrees of freedom
## Multiple R-squared:  0.319,  Adjusted R-squared:  0.316 
## F-statistic: 106.1 on 10 and 2265 DF,  p-value: < 2.2e-16

Eliminating extreme values

(Using the Cook’s distance plot with a cutoff of 4/(n-k-1), identify the observations whose D value exceeds the cutoff.)

# Cook's distance cutoff 4 / (n - k - 1), with n rows and k predictors.
# length(coef(model)) counts the intercept plus the k predictors, so
# n - k - 1 equals nrow(...) - length(coef(model)).
cutoff <- 4 / (nrow(moneyball_filled) - length(coef(model)))
plot(model, which = 4, cook.levels = cutoff)

plot(model, which = 5, cook.levels = cutoff)

# Drop the rows labelled 1828, 1342 and 2233. A logical mask is used instead
# of -which(): when no label matches, -which() yields an empty index and the
# subset would silently lose every row.
extreme_rows <- c("1828", "1342", "2233")
keep_rows <- !(rownames(moneyball_filled) %in% extreme_rows)
moneyball_filled <- moneyball_filled[keep_rows, ]

refit the model:

# Refit the current model on the data without the removed rows.
# TEAM_BATTING_SO stays out of the formula: the preceding selection step
# eliminated it (p-value > 0.05), so the refit must not bring it back.
# NOTE: the printed summary and VIF output that follow in this document were
# rendered with TEAM_BATTING_SO still in the formula; re-run to refresh them.
model <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB +
    TEAM_BASERUN_SB +
    TEAM_PITCHING_H + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_FIELDING_DP, data = moneyball_filled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.117  -8.396   0.026   8.238  64.496 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -18.936246  14.300248  -1.324 0.185574    
## TEAM_BATTING_H     0.039743   0.003901  10.187  < 2e-16 ***
## TEAM_BATTING_2B   -0.021808   0.009043  -2.412 0.015957 *  
## TEAM_BATTING_3B    0.101540   0.016844   6.028 1.93e-09 ***
## TEAM_BATTING_HR    0.065471   0.009640   6.791 1.41e-11 ***
## TEAM_BATTING_BB    0.012322   0.003354   3.674 0.000245 ***
## TEAM_BATTING_SO   -0.001465   0.002507  -0.584 0.559106    
## TEAM_BASERUN_SB    0.032302   0.004275   7.556 5.99e-14 ***
## TEAM_PITCHING_H   18.926032   4.961698   3.814 0.000140 ***
## TEAM_PITCHING_SO  -3.560746   0.932952  -3.817 0.000139 ***
## TEAM_FIELDING_E   -0.031367   0.003048 -10.291  < 2e-16 ***
## TEAM_FIELDING_DP  -0.120207   0.012872  -9.339  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.89 on 2261 degrees of freedom
## Multiple R-squared:  0.3245, Adjusted R-squared:  0.3212 
## F-statistic: 98.72 on 11 and 2261 DF,  p-value: < 2.2e-16

Check for and eliminate further extreme values, if any

# Cook's distance cutoff 4 / (n - k - 1), with n rows and k predictors.
# length(coef(model)) counts the intercept plus the k predictors, so
# n - k - 1 equals nrow(...) - length(coef(model)).
cutoff <- 4 / (nrow(moneyball_filled) - length(coef(model)))
plot(model, which = 4, cook.levels = cutoff)

plot(model, which = 5, cook.levels = cutoff)

# Further row removal is left disabled:
# moneyball_filled<-moneyball_filled[-which(rownames(moneyball_filled)
#                                           %in% c ("1211","299","2012")),]
# summary(model)

Checking for multicollinearity

# Variance inflation factors. faraway was attached after car and masks
# car::vif, so the unqualified vif() call resolves to faraway::vif.
faraway::vif(model)
##   TEAM_BATTING_H  TEAM_BATTING_2B  TEAM_BATTING_3B  TEAM_BATTING_HR 
##         4.322088         2.438948         3.007291         4.650721 
##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H 
##         2.294133         5.045176         1.812464         5.617980 
## TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP 
##         1.722976         6.490171         1.364366
# Standard lm diagnostic plots; 1, 2, 3 and 5 are the panels plot() draws by
# default for a fitted lm.
plot(model, which = c(1, 2, 3, 5))

The VIF of TEAM_FIELDING_E is within the range 5-10, but eliminating TEAM_FIELDING_E does not improve the model.

# Baseline fit on moneyball_data, the frame that was neither mean-imputed nor
# log-transformed; rows with missing predictors are dropped by lm().
model_basic <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB +
    TEAM_PITCHING_H + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_data
)
summary(model_basic)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_FIELDING_DP, data = moneyball_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.317  -7.199   0.121   7.045  29.766 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      58.312951   6.019406   9.687  < 2e-16 ***
## TEAM_BATTING_H   -0.010007   0.010615  -0.943  0.34594    
## TEAM_BATTING_2B  -0.049989   0.008875  -5.633 2.05e-08 ***
## TEAM_BATTING_3B   0.181788   0.018982   9.577  < 2e-16 ***
## TEAM_BATTING_HR   0.100845   0.009158  11.012  < 2e-16 ***
## TEAM_BATTING_BB   0.034055   0.003133  10.870  < 2e-16 ***
## TEAM_BATTING_SO   0.045928   0.016420   2.797  0.00521 ** 
## TEAM_BASERUN_SB   0.069889   0.005535  12.626  < 2e-16 ***
## TEAM_PITCHING_H   0.037438   0.009239   4.052 5.29e-05 ***
## TEAM_PITCHING_SO -0.065427   0.015514  -4.217 2.59e-05 ***
## TEAM_FIELDING_E  -0.116444   0.007029 -16.566  < 2e-16 ***
## TEAM_FIELDING_DP -0.112850   0.012279  -9.190  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.19 on 1823 degrees of freedom
##   (441 observations deleted due to missingness)
## Multiple R-squared:  0.4045, Adjusted R-squared:  0.4009 
## F-statistic: 112.6 on 11 and 1823 DF,  p-value: < 2.2e-16