Loading necessary packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(corrplot)
## corrplot 0.84 loaded
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.5.2
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(psych)
## Warning: package 'psych' was built under R version 3.5.2
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following object is masked from 'package:car':
##
## logit
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library (MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(faraway)
##
## Attaching package: 'faraway'
## The following object is masked from 'package:psych':
##
## logit
## The following objects are masked from 'package:survival':
##
## rats, solder
## The following object is masked from 'package:lattice':
##
## melanoma
## The following objects are masked from 'package:car':
##
## logit, vif
# install.packages("faraway", dependencies = TRUE)
1-2. Data Exploration and Data Preparation
# Location of the training CSV (absolute path; adjust for the local machine).
train_data <- "/Users/Olga/Desktop/DataMining/assignment1/moneyball-training-data.csv"
# Read the file with a header row, keep strings as character vectors and
# decode the bytes as Latin-1.
moneyball_data <- read.csv(
  file = train_data,
  header = TRUE,
  stringsAsFactors = FALSE,
  fileEncoding = "latin1"
)
# Preview the first rows.
head(moneyball_data)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 1 39 1445 194 39
## 2 2 70 1339 219 22
## 3 3 86 1377 232 35
## 4 4 70 1387 209 38
## 5 5 82 1297 186 27
## 6 6 75 1279 200 36
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 13 143 842 NA
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 NA NA 9364 84
## 2 28 NA 1347 191
## 3 27 NA 1377 137
## 4 30 NA 1396 97
## 5 39 NA 1297 102
## 6 59 NA 1279 92
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 NA
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
# List the column names of the raw training data.
colnames(moneyball_data)
## [1] "INDEX" "TARGET_WINS" "TEAM_BATTING_H"
## [4] "TEAM_BATTING_2B" "TEAM_BATTING_3B" "TEAM_BATTING_HR"
## [7] "TEAM_BATTING_BB" "TEAM_BATTING_SO" "TEAM_BASERUN_SB"
## [10] "TEAM_BASERUN_CS" "TEAM_BATTING_HBP" "TEAM_PITCHING_H"
## [13] "TEAM_PITCHING_HR" "TEAM_PITCHING_BB" "TEAM_PITCHING_SO"
## [16] "TEAM_FIELDING_E" "TEAM_FIELDING_DP"
# Number of rows and columns in the raw training data.
dim(moneyball_data)
## [1] 2276 17
# Per-column five-number summary, mean and NA count.
summary(moneyball_data)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## Min. : 1.0 Min. : 0.00 Min. : 891 Min. : 69.0
## 1st Qu.: 630.8 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0
## Median :1270.5 Median : 82.00 Median :1454 Median :238.0
## Mean :1268.5 Mean : 80.79 Mean :1469 Mean :241.2
## 3rd Qu.:1915.5 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0
## Max. :2535.0 Max. :146.00 Max. :2554 Max. :458.0
##
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0
## Median : 47.00 Median :102.00 Median :512.0 Median : 750.0
## Mean : 55.25 Mean : 99.61 Mean :501.6 Mean : 735.6
## 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0
## Max. :223.00 Max. :264.00 Max. :878.0 Max. :1399.0
## NA's :102
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## Min. : 0.0 Min. : 0.0 Min. :29.00 Min. : 1137
## 1st Qu.: 66.0 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419
## Median :101.0 Median : 49.0 Median :58.00 Median : 1518
## Mean :124.8 Mean : 52.8 Mean :59.36 Mean : 1779
## 3rd Qu.:156.0 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682
## Max. :697.0 Max. :201.0 Max. :95.00 Max. :30132
## NA's :131 NA's :772 NA's :2085
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 65.0
## 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0
## Median :107.0 Median : 536.5 Median : 813.5 Median : 159.0
## Mean :105.7 Mean : 553.0 Mean : 817.7 Mean : 246.5
## 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2
## Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0
## NA's :102
## TEAM_FIELDING_DP
## Min. : 52.0
## 1st Qu.:131.0
## Median :149.0
## Mean :146.4
## 3rd Qu.:164.0
## Max. :228.0
## NA's :286
# Compact view of each column's type and first values.
str(moneyball_data)
## 'data.frame': 2276 obs. of 17 variables:
## $ INDEX : int 1 2 3 4 5 6 7 8 11 12 ...
## $ TARGET_WINS : int 39 70 86 70 82 75 80 85 86 76 ...
## $ TEAM_BATTING_H : int 1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
## $ TEAM_BATTING_2B : int 194 219 232 209 186 200 179 171 197 213 ...
## $ TEAM_BATTING_3B : int 39 22 35 38 27 36 54 37 40 18 ...
## $ TEAM_BATTING_HR : int 13 190 137 96 102 92 122 115 114 96 ...
## $ TEAM_BATTING_BB : int 143 685 602 451 472 443 525 456 447 441 ...
## $ TEAM_BATTING_SO : int 842 1075 917 922 920 973 1062 1027 922 827 ...
## $ TEAM_BASERUN_SB : int NA 37 46 43 49 107 80 40 69 72 ...
## $ TEAM_BASERUN_CS : int NA 28 27 30 39 59 54 36 27 34 ...
## $ TEAM_BATTING_HBP: int NA NA NA NA NA NA NA NA NA NA ...
## $ TEAM_PITCHING_H : int 9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
## $ TEAM_PITCHING_HR: int 84 191 137 97 102 92 122 116 114 96 ...
## $ TEAM_PITCHING_BB: int 927 689 602 454 472 443 525 459 447 441 ...
## $ TEAM_PITCHING_SO: int 5456 1082 917 928 920 973 1062 1033 922 827 ...
## $ TEAM_FIELDING_E : int 1011 193 175 164 138 123 136 112 127 131 ...
## $ TEAM_FIELDING_DP: int NA 155 153 156 168 149 186 136 169 159 ...
# Percentage of missing values per column.
# vapply() pins the result to one number per column (sapply()'s return type
# depends on its input), and sum(is.na(.)) counts NAs directly instead of the
# redundant sum(length(which(is.na(.)))).
vapply(moneyball_data, function(column) sum(is.na(column)), numeric(1)) /
  nrow(moneyball_data) * 100
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## 0.000000 0.000000 0.000000 0.000000
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## 0.000000 0.000000 0.000000 4.481547
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## 5.755712 33.919156 91.608084 0.000000
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## 0.000000 0.000000 4.481547 0.000000
## TEAM_FIELDING_DP
## 12.565905
Removing the “INDEX” and “TEAM_BATTING_HBP” columns, as “TEAM_BATTING_HBP” has about 92% missing values.
# Drop the INDEX column from the raw data frame.
moneyball_data <- moneyball_data[
  , names(moneyball_data) != "INDEX", drop = FALSE
]
# Working copy without TEAM_BATTING_HBP; moneyball_data itself keeps it.
moneyball <- moneyball_data[
  , names(moneyball_data) != "TEAM_BATTING_HBP", drop = FALSE
]
Replacing the remaining missing values with the column mean
# Replace the missing entries of a vector with the mean of its observed values.
#
# x: a vector whose elements can be read as numbers (numeric, character or
#    factor); it is converted via character so factor labels, not level
#    codes, are used.
# Returns a numeric vector of the same length with every NA replaced by the
# mean of the non-missing values.
replace_mean <- function(x) {
  values <- as.numeric(as.character(x))
  is_missing <- is.na(values)
  values[is_missing] <- mean(values, na.rm = TRUE)
  values
}
# Mean-impute every column of `moneyball`.
# lapply() works column by column on the data frame itself; apply() would
# first coerce the whole frame to a matrix, which forces a single type on all
# columns. Assigning into `[]` keeps the data-frame shape, column names and
# row names.
moneyball_filled <- moneyball
moneyball_filled[] <- lapply(moneyball, replace_mean)
# Density plot of every column of moneyball_filled in one 3 x 5 grid.
# par() returns the previous settings, which are restored afterwards so the
# grid layout does not leak into later plots. Each panel is titled with its
# column name instead of the default "density.default(x = ...)" call text.
old_par <- par(mfrow = c(3, 5))
for (column_name in names(moneyball_filled)) {
  plot(density(moneyball_filled[[column_name]]), main = column_name)
}
par(old_par)
Target variable plot
# Index plot of the response: one point per row, in row order.
plot(moneyball_filled$TARGET_WINS)
# Histogram of the response.
hist(moneyball_filled$TARGET_WINS)
# Box plot of the response to show spread and outlying values.
boxplot(moneyball_filled$TARGET_WINS)
# Numeric summary of the response.
summary(moneyball_filled$TARGET_WINS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 71.00 82.00 80.79 92.00 146.00
Checking for variable dependencies; as all variables are numeric, we will rely on correlation.
# Pairwise correlation matrix of all columns (cor() default method).
corr_moneyball <- cor(x = moneyball_filled)
# Print the matrix rounded to three decimals.
round(x = corr_moneyball, digits = 3)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## TARGET_WINS 1.000 0.389 0.289
## TEAM_BATTING_H 0.389 1.000 0.563
## TEAM_BATTING_2B 0.289 0.563 1.000
## TEAM_BATTING_3B 0.143 0.428 -0.107
## TEAM_BATTING_HR 0.176 -0.007 0.435
## TEAM_BATTING_BB 0.233 -0.072 0.256
## TEAM_BATTING_SO -0.031 -0.451 0.155
## TEAM_BASERUN_SB 0.123 0.114 -0.190
## TEAM_BASERUN_CS 0.016 0.012 -0.074
## TEAM_PITCHING_H -0.110 0.303 0.024
## TEAM_PITCHING_HR 0.189 0.073 0.455
## TEAM_PITCHING_BB 0.124 0.094 0.178
## TEAM_PITCHING_SO -0.076 -0.245 0.062
## TEAM_FIELDING_E -0.176 0.265 -0.235
## TEAM_FIELDING_DP -0.029 0.115 0.263
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
## TARGET_WINS 0.143 0.176 0.233
## TEAM_BATTING_H 0.428 -0.007 -0.072
## TEAM_BATTING_2B -0.107 0.435 0.256
## TEAM_BATTING_3B 1.000 -0.636 -0.287
## TEAM_BATTING_HR -0.636 1.000 0.514
## TEAM_BATTING_BB -0.287 0.514 1.000
## TEAM_BATTING_SO -0.657 0.697 0.373
## TEAM_BASERUN_SB 0.501 -0.428 -0.082
## TEAM_BASERUN_CS 0.195 -0.291 -0.085
## TEAM_PITCHING_H 0.195 -0.250 -0.450
## TEAM_PITCHING_HR -0.568 0.969 0.460
## TEAM_PITCHING_BB -0.002 0.137 0.489
## TEAM_PITCHING_SO -0.254 0.177 -0.020
## TEAM_FIELDING_E 0.510 -0.587 -0.656
## TEAM_FIELDING_DP -0.246 0.406 0.340
## TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## TARGET_WINS -0.031 0.123 0.016
## TEAM_BATTING_H -0.451 0.114 0.012
## TEAM_BATTING_2B 0.155 -0.190 -0.074
## TEAM_BATTING_3B -0.657 0.501 0.195
## TEAM_BATTING_HR 0.697 -0.428 -0.291
## TEAM_BATTING_BB 0.373 -0.082 -0.085
## TEAM_BATTING_SO 1.000 -0.232 -0.157
## TEAM_BASERUN_SB -0.232 1.000 0.279
## TEAM_BASERUN_CS -0.157 0.279 1.000
## TEAM_PITCHING_H -0.375 0.061 -0.037
## TEAM_PITCHING_HR 0.637 -0.398 -0.290
## TEAM_PITCHING_BB 0.037 0.119 -0.054
## TEAM_PITCHING_SO 0.416 -0.055 -0.069
## TEAM_FIELDING_E -0.583 0.369 0.024
## TEAM_FIELDING_DP 0.131 -0.302 -0.140
## TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## TARGET_WINS -0.110 0.189 0.124
## TEAM_BATTING_H 0.303 0.073 0.094
## TEAM_BATTING_2B 0.024 0.455 0.178
## TEAM_BATTING_3B 0.195 -0.568 -0.002
## TEAM_BATTING_HR -0.250 0.969 0.137
## TEAM_BATTING_BB -0.450 0.460 0.489
## TEAM_BATTING_SO -0.375 0.637 0.037
## TEAM_BASERUN_SB 0.061 -0.398 0.119
## TEAM_BASERUN_CS -0.037 -0.290 -0.054
## TEAM_PITCHING_H 1.000 -0.142 0.321
## TEAM_PITCHING_HR -0.142 1.000 0.222
## TEAM_PITCHING_BB 0.321 0.222 1.000
## TEAM_PITCHING_SO 0.267 0.196 0.482
## TEAM_FIELDING_E 0.668 -0.493 -0.023
## TEAM_FIELDING_DP -0.058 0.401 0.188
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## TARGET_WINS -0.076 -0.176 -0.029
## TEAM_BATTING_H -0.245 0.265 0.115
## TEAM_BATTING_2B 0.062 -0.235 0.263
## TEAM_BATTING_3B -0.254 0.510 -0.246
## TEAM_BATTING_HR 0.177 -0.587 0.406
## TEAM_BATTING_BB -0.020 -0.656 0.340
## TEAM_BATTING_SO 0.416 -0.583 0.131
## TEAM_BASERUN_SB -0.055 0.369 -0.302
## TEAM_BASERUN_CS -0.069 0.024 -0.140
## TEAM_PITCHING_H 0.267 0.668 -0.058
## TEAM_PITCHING_HR 0.196 -0.493 0.401
## TEAM_PITCHING_BB 0.482 -0.023 0.188
## TEAM_PITCHING_SO 1.000 -0.023 0.009
## TEAM_FIELDING_E -0.023 1.000 -0.253
## TEAM_FIELDING_DP 0.009 -0.253 1.000
# Visualise the correlation matrix, one circle per pair of variables.
corrplot(corr_moneyball, method = "circle")
poor correlation with target (p<0.1): TEAM_FIELDING_E, TEAM_BASERUN_CS, TEAM_BATTING_SO, TEAM_BATTING_3B
Strong correlation between predictors (|r| > 0.6): TEAM_PITCHING_HR vs TEAM_BATTING_HR (0.969); TEAM_BATTING_HR vs TEAM_BATTING_SO (0.697); TEAM_BATTING_3B vs TEAM_BATTING_SO (-0.657)
Possible variables to remove: TEAM_FIELDING_E, TEAM_BASERUN_CS, TEAM_BATTING_SO, TEAM_BATTING_3B, TEAM_BATTING_HR
Backward elimination method:
# Starting point for backward elimination: regress TARGET_WINS on all 14
# predictors of the mean-imputed data.
model <- lm(
  formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
    TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR +
    TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
## data = moneyball_filled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.994 -8.576 0.136 8.345 58.628
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.502e+01 5.397e+00 4.636 3.75e-06 ***
## TEAM_BATTING_H 4.824e-02 3.687e-03 13.085 < 2e-16 ***
## TEAM_BATTING_2B -2.006e-02 9.152e-03 -2.192 0.028486 *
## TEAM_BATTING_3B 6.047e-02 1.676e-02 3.608 0.000315 ***
## TEAM_BATTING_HR 5.299e-02 2.743e-02 1.932 0.053488 .
## TEAM_BATTING_BB 1.042e-02 5.818e-03 1.790 0.073544 .
## TEAM_BATTING_SO -9.349e-03 2.551e-03 -3.665 0.000253 ***
## TEAM_BASERUN_SB 2.949e-02 4.462e-03 6.610 4.78e-11 ***
## TEAM_BASERUN_CS -1.188e-02 1.614e-02 -0.736 0.461905
## TEAM_PITCHING_H -7.342e-04 3.676e-04 -1.997 0.045946 *
## TEAM_PITCHING_HR 1.480e-02 2.432e-02 0.609 0.542877
## TEAM_PITCHING_BB 8.891e-05 4.145e-03 0.021 0.982891
## TEAM_PITCHING_SO 2.843e-03 9.187e-04 3.095 0.001994 **
## TEAM_FIELDING_E -2.112e-02 2.480e-03 -8.516 < 2e-16 ***
## TEAM_FIELDING_DP -1.210e-01 1.302e-02 -9.297 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.04 on 2261 degrees of freedom
## Multiple R-squared: 0.3189, Adjusted R-squared: 0.3147
## F-statistic: 75.63 on 14 and 2261 DF, p-value: < 2.2e-16
Removing TEAM_PITCHING_BB
# Backward elimination step: refit without TEAM_PITCHING_BB.
model <- lm(
  formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
    TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR +
    TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, data = moneyball_filled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.994 -8.576 0.136 8.345 58.626
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.0145796 5.3904993 4.640 3.67e-06 ***
## TEAM_BATTING_H 0.0482393 0.0036807 13.106 < 2e-16 ***
## TEAM_BATTING_2B -0.0200575 0.0091490 -2.192 0.028457 *
## TEAM_BATTING_3B 0.0604730 0.0167556 3.609 0.000314 ***
## TEAM_BATTING_HR 0.0527106 0.0240710 2.190 0.028641 *
## TEAM_BATTING_BB 0.0105175 0.0033664 3.124 0.001805 **
## TEAM_BATTING_SO -0.0093631 0.0024585 -3.809 0.000144 ***
## TEAM_BASERUN_SB 0.0295055 0.0044087 6.693 2.76e-11 ***
## TEAM_BASERUN_CS -0.0118872 0.0161276 -0.737 0.461155
## TEAM_PITCHING_H -0.0007306 0.0003283 -2.225 0.026147 *
## TEAM_PITCHING_HR 0.0150659 0.0209923 0.718 0.473025
## TEAM_PITCHING_SO 0.0028567 0.0006717 4.253 2.20e-05 ***
## TEAM_FIELDING_E -0.0211192 0.0024784 -8.521 < 2e-16 ***
## TEAM_FIELDING_DP -0.1210298 0.0130139 -9.300 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.04 on 2262 degrees of freedom
## Multiple R-squared: 0.3189, Adjusted R-squared: 0.315
## F-statistic: 81.49 on 13 and 2262 DF, p-value: < 2.2e-16
Removing TEAM_BASERUN_CS
# Backward elimination step: refit without TEAM_BASERUN_CS.
model <- lm(
  formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
    TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP, data = moneyball_filled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.905 -8.584 0.124 8.406 58.593
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.2348098 5.2851330 4.585 4.78e-06 ***
## TEAM_BATTING_H 0.0482055 0.0036800 13.099 < 2e-16 ***
## TEAM_BATTING_2B -0.0203302 0.0091405 -2.224 0.026235 *
## TEAM_BATTING_3B 0.0608466 0.0167463 3.633 0.000286 ***
## TEAM_BATTING_HR 0.0543985 0.0239594 2.270 0.023274 *
## TEAM_BATTING_BB 0.0107643 0.0033494 3.214 0.001328 **
## TEAM_BATTING_SO -0.0093418 0.0024580 -3.800 0.000148 ***
## TEAM_BASERUN_SB 0.0287600 0.0042906 6.703 2.57e-11 ***
## TEAM_PITCHING_H -0.0007390 0.0003281 -2.253 0.024372 *
## TEAM_PITCHING_HR 0.0147103 0.0209846 0.701 0.483372
## TEAM_PITCHING_SO 0.0028640 0.0006716 4.265 2.08e-05 ***
## TEAM_FIELDING_E -0.0207217 0.0024188 -8.567 < 2e-16 ***
## TEAM_FIELDING_DP -0.1211603 0.0130114 -9.312 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.04 on 2263 degrees of freedom
## Multiple R-squared: 0.3188, Adjusted R-squared: 0.3152
## F-statistic: 88.25 on 12 and 2263 DF, p-value: < 2.2e-16
Removing TEAM_PITCHING_HR
# Backward elimination step: refit without TEAM_PITCHING_HR.
model <- lm(
  formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
    TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
    TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP, data = moneyball_filled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.899 -8.568 0.091 8.397 58.651
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.6666983 5.2220414 4.532 6.14e-06 ***
## TEAM_BATTING_H 0.0484570 0.0036621 13.232 < 2e-16 ***
## TEAM_BATTING_2B -0.0205123 0.0091358 -2.245 0.024847 *
## TEAM_BATTING_3B 0.0624661 0.0165843 3.767 0.000170 ***
## TEAM_BATTING_HR 0.0697785 0.0096266 7.249 5.75e-13 ***
## TEAM_BATTING_BB 0.0107446 0.0033489 3.208 0.001354 **
## TEAM_BATTING_SO -0.0093019 0.0024571 -3.786 0.000157 ***
## TEAM_BASERUN_SB 0.0287708 0.0042901 6.706 2.51e-11 ***
## TEAM_PITCHING_H -0.0006920 0.0003211 -2.155 0.031253 *
## TEAM_PITCHING_SO 0.0028867 0.0006707 4.304 1.75e-05 ***
## TEAM_FIELDING_E -0.0205973 0.0024120 -8.540 < 2e-16 ***
## TEAM_FIELDING_DP -0.1210083 0.0130082 -9.302 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.03 on 2264 degrees of freedom
## Multiple R-squared: 0.3186, Adjusted R-squared: 0.3153
## F-statistic: 96.25 on 11 and 2264 DF, p-value: < 2.2e-16
# Component-plus-residual plots (car) to check each predictor for
# non-linearity.
crPlots(model)
TEAM_PITCHING_H and TEAM_PITCHING_SO did not pass the check for non-linearity.
Let’s transform TEAM_PITCHING_H, TEAM_PITCHING_SO and re-fit the model
# Overwrite the two pitching columns with their log10 values; the 0.1 offset
# avoids log10(0). These columns stay on the log scale for all later code.
moneyball_filled[["TEAM_PITCHING_H"]] <-
  log10(moneyball_filled[["TEAM_PITCHING_H"]] + 0.1)
moneyball_filled[["TEAM_PITCHING_SO"]] <-
  log10(moneyball_filled[["TEAM_PITCHING_SO"]] + 0.1)
# Refit the same 11-predictor model on the transformed data.
model <- lm(
  formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
    TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
    TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP, data = moneyball_filled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.500 -8.353 0.050 8.276 63.152
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -18.385299 13.648651 -1.347 0.178102
## TEAM_BATTING_H 0.041874 0.003784 11.065 < 2e-16 ***
## TEAM_BATTING_2B -0.020476 0.009106 -2.249 0.024630 *
## TEAM_BATTING_3B 0.087638 0.016862 5.197 2.20e-07 ***
## TEAM_BATTING_HR 0.058540 0.009697 6.037 1.83e-09 ***
## TEAM_BATTING_BB 0.012944 0.003388 3.821 0.000137 ***
## TEAM_BATTING_SO -0.001186 0.002534 -0.468 0.639742
## TEAM_BASERUN_SB 0.031437 0.004300 7.311 3.65e-13 ***
## TEAM_PITCHING_H 17.140905 4.594173 3.731 0.000195 ***
## TEAM_PITCHING_SO -2.656620 0.914734 -2.904 0.003717 **
## TEAM_FIELDING_E -0.030455 0.002954 -10.309 < 2e-16 ***
## TEAM_FIELDING_DP -0.120505 0.013004 -9.267 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.03 on 2264 degrees of freedom
## Multiple R-squared: 0.319, Adjusted R-squared: 0.3157
## F-statistic: 96.42 on 11 and 2264 DF, p-value: < 2.2e-16
# Re-draw the component-plus-residual plots for the refitted model.
crPlots(model)
Removing TEAM_BATTING_SO with p-value > 0.05
# Backward elimination step: refit without TEAM_BATTING_SO.
model <- lm(
  formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
    TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB +
    TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
## data = moneyball_filled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.382 -8.328 0.025 8.211 62.933
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20.516367 12.864856 -1.595 0.110905
## TEAM_BATTING_H 0.042628 0.003424 12.448 < 2e-16 ***
## TEAM_BATTING_2B -0.021583 0.008792 -2.455 0.014167 *
## TEAM_BATTING_3B 0.089227 0.016513 5.403 7.23e-08 ***
## TEAM_BATTING_HR 0.055774 0.007688 7.255 5.50e-13 ***
## TEAM_BATTING_BB 0.013293 0.003304 4.023 5.93e-05 ***
## TEAM_BASERUN_SB 0.030879 0.004130 7.476 1.09e-13 ***
## TEAM_PITCHING_H 17.440250 4.548668 3.834 0.000129 ***
## TEAM_PITCHING_SO -2.847123 0.819083 -3.476 0.000519 ***
## TEAM_FIELDING_E -0.030494 0.002953 -10.328 < 2e-16 ***
## TEAM_FIELDING_DP -0.119878 0.012932 -9.270 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.03 on 2265 degrees of freedom
## Multiple R-squared: 0.319, Adjusted R-squared: 0.316
## F-statistic: 106.1 on 10 and 2265 DF, p-value: < 2.2e-16
(using Cook’s distance plot as 4/(n-k-1), identify D values > cutoff)
# Rule-of-thumb Cook's distance cutoff 4 / (n - k - 1). n - k - 1 is the
# residual degrees of freedom of the fit, so take it from the model rather
# than hand-computing it (the previous expression evaluated to n - k - 3).
cutoff <- 4 / df.residual(model)
plot(model, which = 4, cook.levels = cutoff)
plot(model, which = 5, cook.levels = cutoff)
# Drop the observations singled out as influential (row labels hard-coded).
# A logical mask is used instead of -which(...): when no label matches,
# -which(...) is an empty index and would silently remove every row.
influential_rows <- c("1828", "1342", "2233")
moneyball_filled <- moneyball_filled[
  !(rownames(moneyball_filled) %in% influential_rows),
]
refit the model:
# Refit after removing the influential rows.
# TEAM_BATTING_SO was eliminated in the preceding backward-elimination step
# (p-value > 0.05), so it is left out here as well; the earlier version of
# this call re-introduced it by mistake.
model <- lm(
  formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
    TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB +
    TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP, data = moneyball_filled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.117 -8.396 0.026 8.238 64.496
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -18.936246 14.300248 -1.324 0.185574
## TEAM_BATTING_H 0.039743 0.003901 10.187 < 2e-16 ***
## TEAM_BATTING_2B -0.021808 0.009043 -2.412 0.015957 *
## TEAM_BATTING_3B 0.101540 0.016844 6.028 1.93e-09 ***
## TEAM_BATTING_HR 0.065471 0.009640 6.791 1.41e-11 ***
## TEAM_BATTING_BB 0.012322 0.003354 3.674 0.000245 ***
## TEAM_BATTING_SO -0.001465 0.002507 -0.584 0.559106
## TEAM_BASERUN_SB 0.032302 0.004275 7.556 5.99e-14 ***
## TEAM_PITCHING_H 18.926032 4.961698 3.814 0.000140 ***
## TEAM_PITCHING_SO -3.560746 0.932952 -3.817 0.000139 ***
## TEAM_FIELDING_E -0.031367 0.003048 -10.291 < 2e-16 ***
## TEAM_FIELDING_DP -0.120207 0.012872 -9.339 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.89 on 2261 degrees of freedom
## Multiple R-squared: 0.3245, Adjusted R-squared: 0.3212
## F-statistic: 98.72 on 11 and 2261 DF, p-value: < 2.2e-16
Check for and eliminate further extremes, if any
# Recompute the Cook's distance cutoff 4 / (n - k - 1) for the refitted model;
# df.residual() gives n - k - 1 directly.
cutoff <- 4 / df.residual(model)
plot(model, which = 4, cook.levels = cutoff)
plot(model, which = 5, cook.levels = cutoff)
# Optional second round of removals, kept disabled. Written with a logical
# mask so that an unmatched label cannot empty the data frame.
# moneyball_filled <- moneyball_filled[
#   !(rownames(moneyball_filled) %in% c("1211", "299", "2012")),
# ]
# summary(model)
# Variance inflation factors of the predictors.
# NOTE(review): the attach messages show faraway masking car::vif, so this
# presumably dispatches to faraway::vif - confirm that is the intended one.
vif(model)
## TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 4.322088 2.438948 3.007291 4.650721
## TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_H
## 2.294133 5.045176 1.812464 5.617980
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1.722976 6.490171 1.364366
# Standard diagnostic plots for the fitted linear model.
plot(model)
The VIF of TEAM_FIELDING_E is within the range 5-10, but eliminating TEAM_FIELDING_E does not improve the model.
# Baseline fit of the same 11 predictors on `moneyball_data`, i.e. the frame
# that was neither mean-imputed nor log-transformed; lm() drops rows with
# missing values.
# NOTE(review): this model is fitted on different rows and scales than
# `model`, so their R-squared values are presumably not directly comparable -
# confirm the intent.
model_basic <- lm(
  formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
    TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
    TEAM_FIELDING_DP,
  data = moneyball_data
)
summary(model_basic)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP, data = moneyball_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.317 -7.199 0.121 7.045 29.766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.312951 6.019406 9.687 < 2e-16 ***
## TEAM_BATTING_H -0.010007 0.010615 -0.943 0.34594
## TEAM_BATTING_2B -0.049989 0.008875 -5.633 2.05e-08 ***
## TEAM_BATTING_3B 0.181788 0.018982 9.577 < 2e-16 ***
## TEAM_BATTING_HR 0.100845 0.009158 11.012 < 2e-16 ***
## TEAM_BATTING_BB 0.034055 0.003133 10.870 < 2e-16 ***
## TEAM_BATTING_SO 0.045928 0.016420 2.797 0.00521 **
## TEAM_BASERUN_SB 0.069889 0.005535 12.626 < 2e-16 ***
## TEAM_PITCHING_H 0.037438 0.009239 4.052 5.29e-05 ***
## TEAM_PITCHING_SO -0.065427 0.015514 -4.217 2.59e-05 ***
## TEAM_FIELDING_E -0.116444 0.007029 -16.566 < 2e-16 ***
## TEAM_FIELDING_DP -0.112850 0.012279 -9.190 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.19 on 1823 degrees of freedom
## (441 observations deleted due to missingness)
## Multiple R-squared: 0.4045, Adjusted R-squared: 0.4009
## F-statistic: 112.6 on 11 and 1823 DF, p-value: < 2.2e-16