library(psych)
library(dplyr)
library(corrplot)
library(knitr)
train <- read.csv("https://raw.githubusercontent.com/bkreis84/Business-Analytics/master/HW1/moneyball-training-data.csv")
#remove the leading text "TEAM_" on the variable names to make our plots look less cluttered
colnames(train) = gsub("TEAM_", "", colnames(train))
kable(describe(train))
| vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| INDEX | 1 | 2276 | 1268.46353 | 736.34904 | 1270.5 | 1268.56970 | 952.5705 | 1 | 2535 | 2534 | 0.0042149 | -1.2167564 | 15.4346788 |
| TARGET_WINS | 2 | 2276 | 80.79086 | 15.75215 | 82.0 | 81.31229 | 14.8260 | 0 | 146 | 146 | -0.3987232 | 1.0274757 | 0.3301823 |
| BATTING_H | 3 | 2276 | 1469.26977 | 144.59120 | 1454.0 | 1459.04116 | 114.1602 | 891 | 2554 | 1663 | 1.5713335 | 7.2785261 | 3.0307891 |
| BATTING_2B | 4 | 2276 | 241.24692 | 46.80141 | 238.0 | 240.39627 | 47.4432 | 69 | 458 | 389 | 0.2151018 | 0.0061609 | 0.9810087 |
| BATTING_3B | 5 | 2276 | 55.25000 | 27.93856 | 47.0 | 52.17563 | 23.7216 | 0 | 223 | 223 | 1.1094652 | 1.5032418 | 0.5856226 |
| BATTING_HR | 6 | 2276 | 99.61204 | 60.54687 | 102.0 | 97.38529 | 78.5778 | 0 | 264 | 264 | 0.1860421 | -0.9631189 | 1.2691285 |
| BATTING_BB | 7 | 2276 | 501.55888 | 122.67086 | 512.0 | 512.18331 | 94.8864 | 0 | 878 | 878 | -1.0257599 | 2.1828544 | 2.5713150 |
| BATTING_SO | 8 | 2174 | 735.60534 | 248.52642 | 750.0 | 742.31322 | 284.6592 | 0 | 1399 | 1399 | -0.2978001 | -0.3207992 | 5.3301912 |
| BASERUN_SB | 9 | 2145 | 124.76177 | 87.79117 | 101.0 | 110.81188 | 60.7866 | 0 | 697 | 697 | 1.9724140 | 5.4896754 | 1.8955584 |
| BASERUN_CS | 10 | 1504 | 52.80386 | 22.95634 | 49.0 | 50.35963 | 17.7912 | 0 | 201 | 201 | 1.9762180 | 7.6203818 | 0.5919414 |
| BATTING_HBP | 11 | 191 | 59.35602 | 12.96712 | 58.0 | 58.86275 | 11.8608 | 29 | 95 | 66 | 0.3185754 | -0.1119828 | 0.9382681 |
| PITCHING_H | 12 | 2276 | 1779.21046 | 1406.84293 | 1518.0 | 1555.89517 | 174.9468 | 1137 | 30132 | 28995 | 10.3295111 | 141.8396985 | 29.4889618 |
| PITCHING_HR | 13 | 2276 | 105.69859 | 61.29875 | 107.0 | 103.15697 | 74.1300 | 0 | 343 | 343 | 0.2877877 | -0.6046311 | 1.2848886 |
| PITCHING_BB | 14 | 2276 | 553.00791 | 166.35736 | 536.5 | 542.62459 | 98.5929 | 0 | 3645 | 3645 | 6.7438995 | 96.9676398 | 3.4870317 |
| PITCHING_SO | 15 | 2174 | 817.73045 | 553.08503 | 813.5 | 796.93391 | 257.2311 | 0 | 19278 | 19278 | 22.1745535 | 671.1891292 | 11.8621151 |
| FIELDING_E | 16 | 2276 | 246.48067 | 227.77097 | 159.0 | 193.43798 | 62.2692 | 65 | 1898 | 1833 | 2.9904656 | 10.9702717 | 4.7743279 |
| FIELDING_DP | 17 | 1990 | 146.38794 | 26.22639 | 149.0 | 147.57789 | 23.7216 | 52 | 228 | 176 | -0.3889390 | 0.1817397 | 0.5879114 |
#There are some extreme outliers that should be addressed. Consulting the baseball almanac (http://www.baseball-almanac.com/rb_menu.shtml),
#we were able to find historical teams minimum and maximums values for a number of historical team stats. These were then adjusted to be
#comparable to a 162 game season. Because of the unlikelihood of these values, we question the vailidity of the entire observation and remove it
train <- train %>%
filter(TARGET_WINS >=22 & TARGET_WINS <= 124 & BATTING_H <= 1876 & BATTING_2B >= 116 & BATTING_2B <= 376 & BATTING_BB >= 292 &
BATTING_BB <= 879 & BATTING_SO >= 326 & BATTING_SO <= 1535 & PITCHING_HR <= 258 & PITCHING_SO <= 1450)
describe(train)
## vars n mean sd median trimmed mad min max range
## INDEX 1 1982 1281.51 732.36 1287.5 1284.57 936.26 2 2534 2532
## TARGET_WINS 2 1982 80.94 13.90 82.0 81.28 14.83 27 123 96
## BATTING_H 3 1982 1461.22 112.85 1452.0 1456.92 105.26 1137 1876 739
## BATTING_2B 4 1982 244.25 43.29 241.0 243.34 44.48 118 373 255
## BATTING_3B 5 1982 51.98 25.59 44.0 48.93 22.24 11 165 154
## BATTING_HR 6 1982 109.48 56.11 112.0 108.49 65.23 4 257 253
## BATTING_BB 7 1982 527.58 88.18 524.0 526.51 87.47 294 878 584
## BATTING_SO 8 1982 762.89 222.29 783.5 762.80 272.06 326 1399 1073
## BASERUN_SB 9 1966 118.05 83.88 97.0 104.18 56.34 18 697 679
## BASERUN_CS 10 1466 53.01 22.91 50.0 50.48 17.79 11 201 190
## BATTING_HBP 11 190 59.42 12.97 58.0 58.93 11.86 29 95 66
## PITCHING_H 12 1982 1548.83 198.68 1505.0 1523.04 151.97 1137 2656 1519
## PITCHING_HR 13 1982 113.63 56.39 115.0 112.55 63.75 4 257 253
## PITCHING_BB 14 1982 557.29 100.20 545.5 550.19 91.18 325 1123 798
## PITCHING_SO 15 1982 799.20 216.89 811.0 796.57 247.59 345 1434 1089
## FIELDING_E 16 1982 192.81 124.43 149.0 163.82 47.44 65 965 900
## FIELDING_DP 17 1811 150.43 22.69 151.0 150.76 22.24 72 228 156
## skew kurtosis se
## INDEX -0.02 -1.19 16.45
## TARGET_WINS -0.23 -0.03 0.31
## BATTING_H 0.38 0.26 2.53
## BATTING_2B 0.21 -0.35 0.97
## BATTING_3B 1.06 0.75 0.57
## BATTING_HR 0.08 -0.84 1.26
## BATTING_BB 0.19 0.24 1.98
## BATTING_SO 0.00 -1.00 4.99
## BASERUN_SB 2.24 7.25 1.89
## BASERUN_CS 2.03 7.80 0.60
## BATTING_HBP 0.31 -0.11 0.94
## PITCHING_H 1.44 2.70 4.46
## PITCHING_HR 0.10 -0.79 1.27
## PITCHING_BB 0.84 1.43 2.25
## PITCHING_SO 0.12 -0.63 4.87
## FIELDING_E 2.43 5.91 2.79
## FIELDING_DP -0.12 0.22 0.53
#There are a number of valus with a minimum of 0. A 0 value is equivalent to an NA value for each variable in the data set.
train[train == 0] <- NA
#https://stackoverflow.com/questions/4787332/how-to-remove-outliers-from-a-dataset
kable(describe(train))
| vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| INDEX | 1 | 1982 | 1281.51413 | 732.35847 | 1287.5 | 1284.56620 | 936.2619 | 2 | 2534 | 2532 | -0.0222668 | -1.1935109 | 16.4502265 |
| TARGET_WINS | 2 | 1982 | 80.94147 | 13.90015 | 82.0 | 81.27995 | 14.8260 | 27 | 123 | 96 | -0.2318537 | -0.0317418 | 0.3122250 |
| BATTING_H | 3 | 1982 | 1461.22200 | 112.85252 | 1452.0 | 1456.91677 | 105.2646 | 1137 | 1876 | 739 | 0.3833650 | 0.2584343 | 2.5348918 |
| BATTING_2B | 4 | 1982 | 244.25429 | 43.28734 | 241.0 | 243.33733 | 44.4780 | 118 | 373 | 255 | 0.2061959 | -0.3473562 | 0.9723197 |
| BATTING_3B | 5 | 1982 | 51.97830 | 25.59300 | 44.0 | 48.92749 | 22.2390 | 11 | 165 | 154 | 1.0646465 | 0.7519322 | 0.5748696 |
| BATTING_HR | 6 | 1982 | 109.47931 | 56.10561 | 112.0 | 108.48802 | 65.2344 | 4 | 257 | 253 | 0.0766060 | -0.8395430 | 1.2602436 |
| BATTING_BB | 7 | 1982 | 527.58426 | 88.18040 | 524.0 | 526.51324 | 87.4734 | 294 | 878 | 584 | 0.1929183 | 0.2403791 | 1.9807070 |
| BATTING_SO | 8 | 1982 | 762.89455 | 222.29120 | 783.5 | 762.79887 | 272.0571 | 326 | 1399 | 1073 | -0.0003035 | -1.0015071 | 4.9931021 |
| BASERUN_SB | 9 | 1966 | 118.05137 | 83.88476 | 97.0 | 104.17662 | 56.3388 | 18 | 697 | 679 | 2.2434765 | 7.2507184 | 1.8918700 |
| BASERUN_CS | 10 | 1466 | 53.00955 | 22.91444 | 50.0 | 50.48041 | 17.7912 | 11 | 201 | 190 | 2.0259505 | 7.7988460 | 0.5984698 |
| BATTING_HBP | 11 | 190 | 59.41579 | 12.97498 | 58.0 | 58.93421 | 11.8608 | 29 | 95 | 66 | 0.3094528 | -0.1127227 | 0.9413037 |
| PITCHING_H | 12 | 1982 | 1548.82846 | 198.68054 | 1505.0 | 1523.03594 | 151.9665 | 1137 | 2656 | 1519 | 1.4410110 | 2.7010412 | 4.4627598 |
| PITCHING_HR | 13 | 1982 | 113.62563 | 56.38617 | 115.0 | 112.55359 | 63.7518 | 4 | 257 | 253 | 0.0956767 | -0.7914657 | 1.2665455 |
| PITCHING_BB | 14 | 1982 | 557.29162 | 100.19764 | 545.5 | 550.18726 | 91.1799 | 325 | 1123 | 798 | 0.8418509 | 1.4287787 | 2.2506382 |
| PITCHING_SO | 15 | 1982 | 799.19728 | 216.88816 | 811.0 | 796.56620 | 247.5942 | 345 | 1434 | 1089 | 0.1180771 | -0.6339600 | 4.8717391 |
| FIELDING_E | 16 | 1982 | 192.81282 | 124.42942 | 149.0 | 163.82346 | 47.4432 | 65 | 965 | 900 | 2.4348990 | 5.9141829 | 2.7949321 |
| FIELDING_DP | 17 | 1811 | 150.43457 | 22.68564 | 151.0 | 150.75500 | 22.2390 | 72 | 228 | 156 | -0.1228728 | 0.2216871 | 0.5330792 |
trainA <- train %>%
select_if(colSums(is.na(train))>0)
library(ggplot2)
plot_Missing <- function(data_in, title = NULL){
temp_df <- as.data.frame(ifelse(is.na(data_in), 0, 1))
temp_df <- temp_df[,order(colSums(temp_df))]
data_temp <- expand.grid(list(x = 1:nrow(temp_df), y = colnames(temp_df)))
data_temp$m <- as.vector(as.matrix(temp_df))
data_temp <- data.frame(x = unlist(data_temp$x), y = unlist(data_temp$y), m = unlist(data_temp$m))
ggplot(data_temp) + geom_tile(aes(x=x, y=y, fill=factor(m))) + scale_fill_manual(values=c("red", "white"), name="Missing\n(0=Yes, 1=No)") + theme_light() + ylab("") + xlab("") + ggtitle(title)
}
#https://www.kaggle.com/notaapple/detailed-exploratory-data-analysis-using-r
plot_Missing(trainA)
#Percentage of missing values per variable
kable(colSums(is.na(train))/colSums(count(train))*100)
| x | |
|---|---|
| INDEX | 0.0000000 |
| TARGET_WINS | 0.0000000 |
| BATTING_H | 0.0000000 |
| BATTING_2B | 0.0000000 |
| BATTING_3B | 0.0000000 |
| BATTING_HR | 0.0000000 |
| BATTING_BB | 0.0000000 |
| BATTING_SO | 0.0000000 |
| BASERUN_SB | 0.8072654 |
| BASERUN_CS | 26.0343088 |
| BATTING_HBP | 90.4137235 |
| PITCHING_H | 0.0000000 |
| PITCHING_HR | 0.0000000 |
| PITCHING_BB | 0.0000000 |
| PITCHING_SO | 0.0000000 |
| FIELDING_E | 0.0000000 |
| FIELDING_DP | 8.6276488 |
| Less than 1 pe | rcent of the HR and BB data is missing, while 91.6% of the hit by pitch data is missing. The extent to which this data is missing leads me to suggest removing the variable altogether |
For the other data, we can impute values using k nearest neighbors, which takes the average of the 10 nearest values weighted by their euclidean distance
library(DMwR)
newData <- train %>%
select(-BATTING_HBP)
newData <- knnImputation(newData, k=10)
anyNA(newData)
## [1] FALSE
ggplot(stack(newData[2:16]), aes(values))+
facet_wrap(~ind, scales = "free") +
geom_histogram() +
theme(legend.position="none")
ggplot(stack(newData[2:16]), aes(x = ind, y = values, fill=ind))+
facet_wrap(~ind, scales = "free") +
geom_boxplot() +
theme(legend.position="none")
One of the questions that comes to mind is whether our test set contains years that are more current and our training set contains all of the earlier data. If that is the case, the game of baseball was much different in the past, with less power-hitters just being one example.
#correlation plot
par(mfrow=c(1,1))
corr <- cor(newData[2:16])
corrplot(corr, order = "hclust")
Home runs for hitters and home runs allowed by pitchers are highly correlated (0.97). In baseball there are what are referred to as pitchers parks and hitters parks. Because teams play half their games at home, if they are in a hitters park, you would expect the team to hit more home runs and for their pitchers to give up more home runs. The variables with a positive correlation to wins appear to be hits, doubles and walks to a lesser degree
Total bases is an additional stat that we can add. It seems most other additional statistics can not be achieved without knowing the number of at bats or innings pitched. Total bases takes into account the production underlying each hit giving a heavier weighting to doubles, triples and home runs
Baseball reference has a list of the average at bats per game since 1871. We average these values and multiply by 162 to get an approximation of the average at bats.
newData <- newData %>%
mutate(TB = (BATTING_H - BATTING_2B - BATTING_3B - BATTING_HR + BATTING_2B * 2 + BATTING_3B * 3 + BATTING_HR * 4)) %>%
mutate(WHGP = (PITCHING_H + PITCHING_BB)/162) %>%
mutate(PITCHING_SO_BB = PITCHING_SO/PITCHING_BB) %>%
mutate(BATTING_BB_SO = BATTING_BB/BATTING_SO)
#Baseball Reference data
AVG <- read.csv("https://raw.githubusercontent.com/bkreis84/Business-Analytics/master/HW1/Baseball%20Reference%20AVG.csv")
avgAB <- mean(AVG$AB, na.rm = TRUE)*162
newData <- newData %>%
mutate(BsR = (((BATTING_H + BATTING_BB - BATTING_HR) * ((1.4*TB - .6*BATTING_H -3*BATTING_HR +0.1*BATTING_BB)*1.02)) /
(((1.4*TB - .6*BATTING_H -3*BATTING_HR +.1*BATTING_BB)*1.02) + avgAB - BATTING_H)) + BATTING_HR)
#correlation plot
par(mfrow=c(1,1))
corr <- cor(newData[2:18])
corrplot(corr, order = "hclust")
The new variables have some collinearity which we can investigate further
Next we can check to see which variables should be removed based on having realtively few unique values compared to the sample size and the large ratio of the most common value to the second most common value. There were none to remove in this case
library(caret)
nearZeroVar(newData)
## integer(0)
BoxCox transformation applied
#BoxCox transformation applied
set.seed(4234)
trans <- preProcess(newData, method = "BoxCox")
transformed <- predict(trans, newData)
ggplot(stack(transformed[2:21]), aes(values))+
facet_wrap(~ind, scales = "free") +
geom_histogram() +
theme(legend.position="none")
ggplot(stack(transformed[2:21]), aes(x = ind, y = values, fill=ind))+
facet_wrap(~ind, scales = "free") +
geom_boxplot() +
theme(legend.position="none")
The following creates a model from all of the candidate variables and then a stepwise method is used to find out which variables improve the model
#All variables
model <- lm(TARGET_WINS ~ . -INDEX, data = transformed)
summary(model)
##
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX, data = transformed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -230.128 -43.745 -1.317 42.153 310.821
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.505e+04 1.975e+04 2.787 0.005363 **
## BATTING_H -2.595e+04 1.308e+04 -1.984 0.047441 *
## BATTING_2B -6.173e+00 1.338e+00 -4.614 4.20e-06 ***
## BATTING_3B 2.952e+01 8.555e+00 3.451 0.000571 ***
## BATTING_HR 8.997e+00 2.683e+00 3.353 0.000816 ***
## BATTING_BB 1.323e+00 1.034e+00 1.279 0.200899
## BATTING_SO -5.770e-01 9.734e-02 -5.928 3.61e-09 ***
## BASERUN_SB 1.485e+01 4.155e+00 3.574 0.000360 ***
## BASERUN_CS 2.258e+01 6.242e+00 3.617 0.000306 ***
## PITCHING_H NA NA NA NA
## PITCHING_HR -7.975e+00 2.315e+00 -3.445 0.000584 ***
## PITCHING_BB -1.382e+03 9.998e+02 -1.382 0.167099
## PITCHING_SO 2.396e+00 9.406e-01 2.548 0.010917 *
## FIELDING_E -1.633e+04 1.202e+03 -13.591 < 2e-16 ***
## FIELDING_DP -6.199e-01 7.777e-02 -7.971 2.65e-15 ***
## TB -2.177e-01 1.435e-01 -1.516 0.129605
## WHGP 1.345e+04 1.531e+04 0.879 0.379716
## PITCHING_SO_BB -6.295e+03 2.391e+03 -2.633 0.008526 **
## BATTING_BB_SO -6.379e+03 2.392e+03 -2.667 0.007712 **
## BsR 7.516e+00 2.390e+00 3.144 0.001689 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 65.08 on 1963 degrees of freedom
## Multiple R-squared: 0.34, Adjusted R-squared: 0.334
## F-statistic: 56.18 on 18 and 1963 DF, p-value: < 2.2e-16
#stepwise approach
step <- lm(TARGET_WINS ~ BsR + TB + BASERUN_CS + FIELDING_E + FIELDING_DP + BATTING_2B + BATTING_H + BATTING_SO + BATTING_BB_SO + BASERUN_SB + BATTING_3B +BASERUN_SB + PITCHING_SO_BB + BATTING_BB_SO, data = transformed)
summary(step)
##
## Call:
## lm(formula = TARGET_WINS ~ BsR + TB + BASERUN_CS + FIELDING_E +
## FIELDING_DP + BATTING_2B + BATTING_H + BATTING_SO + BATTING_BB_SO +
## BASERUN_SB + BATTING_3B + BASERUN_SB + PITCHING_SO_BB + BATTING_BB_SO,
## data = transformed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -241.06 -43.18 -0.59 41.32 349.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.767e+04 1.035e+04 7.504 9.34e-14 ***
## BsR 9.258e+00 1.139e+00 8.128 7.64e-16 ***
## TB -2.296e-01 5.510e-02 -4.166 3.23e-05 ***
## BASERUN_CS 2.439e+01 6.073e+00 4.016 6.13e-05 ***
## FIELDING_E -1.587e+04 1.125e+03 -14.106 < 2e-16 ***
## FIELDING_DP -6.188e-01 7.695e-02 -8.041 1.52e-15 ***
## BATTING_2B -7.176e+00 8.600e-01 -8.344 < 2e-16 ***
## BATTING_H -3.775e+04 6.273e+03 -6.018 2.10e-09 ***
## BATTING_SO -2.843e-01 4.060e-02 -7.002 3.45e-12 ***
## BATTING_BB_SO -6.454e+03 2.400e+03 -2.689 0.007230 **
## BASERUN_SB 1.428e+01 3.973e+00 3.596 0.000332 ***
## BATTING_3B 2.261e+01 5.444e+00 4.154 3.41e-05 ***
## PITCHING_SO_BB -6.342e+03 2.399e+03 -2.643 0.008281 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 65.46 on 1969 degrees of freedom
## Multiple R-squared: 0.3303, Adjusted R-squared: 0.3262
## F-statistic: 80.92 on 12 and 1969 DF, p-value: < 2.2e-16
plot(step)