Attempt

library(psych)
library(dplyr)
library(corrplot)
library(knitr)


train <- read.csv("https://raw.githubusercontent.com/bkreis84/Business-Analytics/master/HW1/moneyball-training-data.csv")

#remove the leading text "TEAM_" on the variable names to make our plots look less cluttered
colnames(train) = gsub("TEAM_", "", colnames(train))


kable(describe(train))
vars n mean sd median trimmed mad min max range skew kurtosis se
INDEX 1 2276 1268.46353 736.34904 1270.5 1268.56970 952.5705 1 2535 2534 0.0042149 -1.2167564 15.4346788
TARGET_WINS 2 2276 80.79086 15.75215 82.0 81.31229 14.8260 0 146 146 -0.3987232 1.0274757 0.3301823
BATTING_H 3 2276 1469.26977 144.59120 1454.0 1459.04116 114.1602 891 2554 1663 1.5713335 7.2785261 3.0307891
BATTING_2B 4 2276 241.24692 46.80141 238.0 240.39627 47.4432 69 458 389 0.2151018 0.0061609 0.9810087
BATTING_3B 5 2276 55.25000 27.93856 47.0 52.17563 23.7216 0 223 223 1.1094652 1.5032418 0.5856226
BATTING_HR 6 2276 99.61204 60.54687 102.0 97.38529 78.5778 0 264 264 0.1860421 -0.9631189 1.2691285
BATTING_BB 7 2276 501.55888 122.67086 512.0 512.18331 94.8864 0 878 878 -1.0257599 2.1828544 2.5713150
BATTING_SO 8 2174 735.60534 248.52642 750.0 742.31322 284.6592 0 1399 1399 -0.2978001 -0.3207992 5.3301912
BASERUN_SB 9 2145 124.76177 87.79117 101.0 110.81188 60.7866 0 697 697 1.9724140 5.4896754 1.8955584
BASERUN_CS 10 1504 52.80386 22.95634 49.0 50.35963 17.7912 0 201 201 1.9762180 7.6203818 0.5919414
BATTING_HBP 11 191 59.35602 12.96712 58.0 58.86275 11.8608 29 95 66 0.3185754 -0.1119828 0.9382681
PITCHING_H 12 2276 1779.21046 1406.84293 1518.0 1555.89517 174.9468 1137 30132 28995 10.3295111 141.8396985 29.4889618
PITCHING_HR 13 2276 105.69859 61.29875 107.0 103.15697 74.1300 0 343 343 0.2877877 -0.6046311 1.2848886
PITCHING_BB 14 2276 553.00791 166.35736 536.5 542.62459 98.5929 0 3645 3645 6.7438995 96.9676398 3.4870317
PITCHING_SO 15 2174 817.73045 553.08503 813.5 796.93391 257.2311 0 19278 19278 22.1745535 671.1891292 11.8621151
FIELDING_E 16 2276 246.48067 227.77097 159.0 193.43798 62.2692 65 1898 1833 2.9904656 10.9702717 4.7743279
FIELDING_DP 17 1990 146.38794 26.22639 149.0 147.57789 23.7216 52 228 176 -0.3889390 0.1817397 0.5879114
#There are some extreme outliers that should be addressed. Consulting the baseball almanac (http://www.baseball-almanac.com/rb_menu.shtml), 
#we were able to find historical teams minimum and maximums values for a number of historical team stats. These were then adjusted to be 
#comparable to a 162 game season. Because of the unlikelihood of these values, we question the vailidity of the entire observation and remove it

train <- train %>% 
  filter(TARGET_WINS >=22 & TARGET_WINS <= 124 & BATTING_H <= 1876 & BATTING_2B >= 116 & BATTING_2B <= 376 & BATTING_BB >= 292 &
           BATTING_BB <= 879 & BATTING_SO >= 326 & BATTING_SO <= 1535 & PITCHING_HR <= 258 & PITCHING_SO <= 1450)

describe(train)
##             vars    n    mean     sd median trimmed    mad  min  max range
## INDEX          1 1982 1281.51 732.36 1287.5 1284.57 936.26    2 2534  2532
## TARGET_WINS    2 1982   80.94  13.90   82.0   81.28  14.83   27  123    96
## BATTING_H      3 1982 1461.22 112.85 1452.0 1456.92 105.26 1137 1876   739
## BATTING_2B     4 1982  244.25  43.29  241.0  243.34  44.48  118  373   255
## BATTING_3B     5 1982   51.98  25.59   44.0   48.93  22.24   11  165   154
## BATTING_HR     6 1982  109.48  56.11  112.0  108.49  65.23    4  257   253
## BATTING_BB     7 1982  527.58  88.18  524.0  526.51  87.47  294  878   584
## BATTING_SO     8 1982  762.89 222.29  783.5  762.80 272.06  326 1399  1073
## BASERUN_SB     9 1966  118.05  83.88   97.0  104.18  56.34   18  697   679
## BASERUN_CS    10 1466   53.01  22.91   50.0   50.48  17.79   11  201   190
## BATTING_HBP   11  190   59.42  12.97   58.0   58.93  11.86   29   95    66
## PITCHING_H    12 1982 1548.83 198.68 1505.0 1523.04 151.97 1137 2656  1519
## PITCHING_HR   13 1982  113.63  56.39  115.0  112.55  63.75    4  257   253
## PITCHING_BB   14 1982  557.29 100.20  545.5  550.19  91.18  325 1123   798
## PITCHING_SO   15 1982  799.20 216.89  811.0  796.57 247.59  345 1434  1089
## FIELDING_E    16 1982  192.81 124.43  149.0  163.82  47.44   65  965   900
## FIELDING_DP   17 1811  150.43  22.69  151.0  150.76  22.24   72  228   156
##              skew kurtosis    se
## INDEX       -0.02    -1.19 16.45
## TARGET_WINS -0.23    -0.03  0.31
## BATTING_H    0.38     0.26  2.53
## BATTING_2B   0.21    -0.35  0.97
## BATTING_3B   1.06     0.75  0.57
## BATTING_HR   0.08    -0.84  1.26
## BATTING_BB   0.19     0.24  1.98
## BATTING_SO   0.00    -1.00  4.99
## BASERUN_SB   2.24     7.25  1.89
## BASERUN_CS   2.03     7.80  0.60
## BATTING_HBP  0.31    -0.11  0.94
## PITCHING_H   1.44     2.70  4.46
## PITCHING_HR  0.10    -0.79  1.27
## PITCHING_BB  0.84     1.43  2.25
## PITCHING_SO  0.12    -0.63  4.87
## FIELDING_E   2.43     5.91  2.79
## FIELDING_DP -0.12     0.22  0.53
#There are a number of valus with a minimum of 0. A 0 value is equivalent to an NA value for each variable in the data set. 

train[train == 0] <- NA


#https://stackoverflow.com/questions/4787332/how-to-remove-outliers-from-a-dataset


kable(describe(train))
vars n mean sd median trimmed mad min max range skew kurtosis se
INDEX 1 1982 1281.51413 732.35847 1287.5 1284.56620 936.2619 2 2534 2532 -0.0222668 -1.1935109 16.4502265
TARGET_WINS 2 1982 80.94147 13.90015 82.0 81.27995 14.8260 27 123 96 -0.2318537 -0.0317418 0.3122250
BATTING_H 3 1982 1461.22200 112.85252 1452.0 1456.91677 105.2646 1137 1876 739 0.3833650 0.2584343 2.5348918
BATTING_2B 4 1982 244.25429 43.28734 241.0 243.33733 44.4780 118 373 255 0.2061959 -0.3473562 0.9723197
BATTING_3B 5 1982 51.97830 25.59300 44.0 48.92749 22.2390 11 165 154 1.0646465 0.7519322 0.5748696
BATTING_HR 6 1982 109.47931 56.10561 112.0 108.48802 65.2344 4 257 253 0.0766060 -0.8395430 1.2602436
BATTING_BB 7 1982 527.58426 88.18040 524.0 526.51324 87.4734 294 878 584 0.1929183 0.2403791 1.9807070
BATTING_SO 8 1982 762.89455 222.29120 783.5 762.79887 272.0571 326 1399 1073 -0.0003035 -1.0015071 4.9931021
BASERUN_SB 9 1966 118.05137 83.88476 97.0 104.17662 56.3388 18 697 679 2.2434765 7.2507184 1.8918700
BASERUN_CS 10 1466 53.00955 22.91444 50.0 50.48041 17.7912 11 201 190 2.0259505 7.7988460 0.5984698
BATTING_HBP 11 190 59.41579 12.97498 58.0 58.93421 11.8608 29 95 66 0.3094528 -0.1127227 0.9413037
PITCHING_H 12 1982 1548.82846 198.68054 1505.0 1523.03594 151.9665 1137 2656 1519 1.4410110 2.7010412 4.4627598
PITCHING_HR 13 1982 113.62563 56.38617 115.0 112.55359 63.7518 4 257 253 0.0956767 -0.7914657 1.2665455
PITCHING_BB 14 1982 557.29162 100.19764 545.5 550.18726 91.1799 325 1123 798 0.8418509 1.4287787 2.2506382
PITCHING_SO 15 1982 799.19728 216.88816 811.0 796.56620 247.5942 345 1434 1089 0.1180771 -0.6339600 4.8717391
FIELDING_E 16 1982 192.81282 124.42942 149.0 163.82346 47.4432 65 965 900 2.4348990 5.9141829 2.7949321
FIELDING_DP 17 1811 150.43457 22.68564 151.0 150.75500 22.2390 72 228 156 -0.1228728 0.2216871 0.5330792
trainA <- train %>% 
  select_if(colSums(is.na(train))>0)

library(ggplot2)

plot_Missing <- function(data_in, title = NULL){
  temp_df <- as.data.frame(ifelse(is.na(data_in), 0, 1))
  temp_df <- temp_df[,order(colSums(temp_df))]
  data_temp <- expand.grid(list(x = 1:nrow(temp_df), y = colnames(temp_df)))
  data_temp$m <- as.vector(as.matrix(temp_df))
  data_temp <- data.frame(x = unlist(data_temp$x), y = unlist(data_temp$y), m = unlist(data_temp$m))
  ggplot(data_temp) + geom_tile(aes(x=x, y=y, fill=factor(m))) + scale_fill_manual(values=c("red", "white"), name="Missing\n(0=Yes, 1=No)") + theme_light() + ylab("") + xlab("") + ggtitle(title)
}
#https://www.kaggle.com/notaapple/detailed-exploratory-data-analysis-using-r

plot_Missing(trainA)

#Percentage of missing values per variable
kable(colSums(is.na(train))/colSums(count(train))*100)
x
INDEX 0.0000000
TARGET_WINS 0.0000000
BATTING_H 0.0000000
BATTING_2B 0.0000000
BATTING_3B 0.0000000
BATTING_HR 0.0000000
BATTING_BB 0.0000000
BATTING_SO 0.0000000
BASERUN_SB 0.8072654
BASERUN_CS 26.0343088
BATTING_HBP 90.4137235
PITCHING_H 0.0000000
PITCHING_HR 0.0000000
PITCHING_BB 0.0000000
PITCHING_SO 0.0000000
FIELDING_E 0.0000000
FIELDING_DP 8.6276488
Less than 1 pe rcent of the HR and BB data is missing, while 91.6% of the hit by pitch data is missing. The extent to which this data is missing leads me to suggest removing the variable altogether

For the other data, we can impute values using k nearest neighbors, which takes the average of the 10 nearest values weighted by their euclidean distance

library(DMwR)

newData <- train %>% 
  select(-BATTING_HBP)
  
newData <- knnImputation(newData, k=10)
anyNA(newData)
## [1] FALSE
ggplot(stack(newData[2:16]), aes(values))+
  facet_wrap(~ind, scales = "free") + 
  geom_histogram() +
  theme(legend.position="none")

ggplot(stack(newData[2:16]), aes(x = ind, y = values, fill=ind))+
  facet_wrap(~ind, scales = "free") + 
  geom_boxplot() +
  theme(legend.position="none")

One of the questions that comes to mind is whether our test set contains years that are more current and our training set contains all of the earlier data. If that is the case, the game of baseball was much different in the past, with less power-hitters just being one example.

#correlation plot
par(mfrow=c(1,1)) 
corr <- cor(newData[2:16])
corrplot(corr, order = "hclust")

Home runs for hitters and home runs allowed by pitchers are highly correlated (0.97). In baseball there are what are referred to as pitchers parks and hitters parks. Because teams play half their games at home, if they are in a hitters park, you would expect the team to hit more home runs and for their pitchers to give up more home runs. The variables with a positive correlation to wins appear to be hits, doubles and walks to a lesser degree

Total bases is an additional stat that we can add. It seems most other additional statistics can not be achieved without knowing the number of at bats or innings pitched. Total bases takes into account the production underlying each hit giving a heavier weighting to doubles, triples and home runs

Baseball reference has a list of the average at bats per game since 1871. We average these values and multiply by 162 to get an approximation of the average at bats.

newData <- newData %>% 
  mutate(TB = (BATTING_H - BATTING_2B - BATTING_3B - BATTING_HR + BATTING_2B * 2 + BATTING_3B * 3 + BATTING_HR * 4)) %>% 
  mutate(WHGP = (PITCHING_H + PITCHING_BB)/162) %>% 
  mutate(PITCHING_SO_BB = PITCHING_SO/PITCHING_BB) %>% 
  mutate(BATTING_BB_SO = BATTING_BB/BATTING_SO)


#Baseball Reference data
AVG <- read.csv("https://raw.githubusercontent.com/bkreis84/Business-Analytics/master/HW1/Baseball%20Reference%20AVG.csv")
avgAB <- mean(AVG$AB, na.rm = TRUE)*162


newData <- newData %>% 
  mutate(BsR = (((BATTING_H + BATTING_BB - BATTING_HR) * ((1.4*TB - .6*BATTING_H -3*BATTING_HR +0.1*BATTING_BB)*1.02)) /
                  (((1.4*TB - .6*BATTING_H -3*BATTING_HR +.1*BATTING_BB)*1.02) + avgAB - BATTING_H)) + BATTING_HR)

#correlation plot
par(mfrow=c(1,1)) 
corr <- cor(newData[2:18])
corrplot(corr, order = "hclust")

The new variables have some collinearity which we can investigate further

Next we can check to see which variables should be removed based on having realtively few unique values compared to the sample size and the large ratio of the most common value to the second most common value. There were none to remove in this case

library(caret)
nearZeroVar(newData)
## integer(0)

BoxCox transformation applied

#BoxCox transformation applied
set.seed(4234)
trans <- preProcess(newData, method = "BoxCox")

transformed <- predict(trans, newData)


ggplot(stack(transformed[2:21]), aes(values))+
  facet_wrap(~ind, scales = "free") + 
  geom_histogram() +
  theme(legend.position="none")

ggplot(stack(transformed[2:21]), aes(x = ind, y = values, fill=ind))+
  facet_wrap(~ind, scales = "free") + 
  geom_boxplot() +
  theme(legend.position="none")

The following creates a model from all of the candidate variables and then a stepwise method is used to find out which variables improve the model

#All variables
model <- lm(TARGET_WINS ~ . -INDEX, data = transformed)
summary(model)
## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX, data = transformed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -230.128  -43.745   -1.317   42.153  310.821 
## 
## Coefficients: (1 not defined because of singularities)
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5.505e+04  1.975e+04   2.787 0.005363 ** 
## BATTING_H      -2.595e+04  1.308e+04  -1.984 0.047441 *  
## BATTING_2B     -6.173e+00  1.338e+00  -4.614 4.20e-06 ***
## BATTING_3B      2.952e+01  8.555e+00   3.451 0.000571 ***
## BATTING_HR      8.997e+00  2.683e+00   3.353 0.000816 ***
## BATTING_BB      1.323e+00  1.034e+00   1.279 0.200899    
## BATTING_SO     -5.770e-01  9.734e-02  -5.928 3.61e-09 ***
## BASERUN_SB      1.485e+01  4.155e+00   3.574 0.000360 ***
## BASERUN_CS      2.258e+01  6.242e+00   3.617 0.000306 ***
## PITCHING_H             NA         NA      NA       NA    
## PITCHING_HR    -7.975e+00  2.315e+00  -3.445 0.000584 ***
## PITCHING_BB    -1.382e+03  9.998e+02  -1.382 0.167099    
## PITCHING_SO     2.396e+00  9.406e-01   2.548 0.010917 *  
## FIELDING_E     -1.633e+04  1.202e+03 -13.591  < 2e-16 ***
## FIELDING_DP    -6.199e-01  7.777e-02  -7.971 2.65e-15 ***
## TB             -2.177e-01  1.435e-01  -1.516 0.129605    
## WHGP            1.345e+04  1.531e+04   0.879 0.379716    
## PITCHING_SO_BB -6.295e+03  2.391e+03  -2.633 0.008526 ** 
## BATTING_BB_SO  -6.379e+03  2.392e+03  -2.667 0.007712 ** 
## BsR             7.516e+00  2.390e+00   3.144 0.001689 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 65.08 on 1963 degrees of freedom
## Multiple R-squared:   0.34,  Adjusted R-squared:  0.334 
## F-statistic: 56.18 on 18 and 1963 DF,  p-value: < 2.2e-16
#stepwise approach
step <- lm(TARGET_WINS ~ BsR + TB + BASERUN_CS + FIELDING_E + FIELDING_DP + BATTING_2B + BATTING_H + BATTING_SO + BATTING_BB_SO + BASERUN_SB + BATTING_3B +BASERUN_SB + PITCHING_SO_BB + BATTING_BB_SO, data = transformed)
summary(step)
## 
## Call:
## lm(formula = TARGET_WINS ~ BsR + TB + BASERUN_CS + FIELDING_E + 
##     FIELDING_DP + BATTING_2B + BATTING_H + BATTING_SO + BATTING_BB_SO + 
##     BASERUN_SB + BATTING_3B + BASERUN_SB + PITCHING_SO_BB + BATTING_BB_SO, 
##     data = transformed)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -241.06  -43.18   -0.59   41.32  349.94 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     7.767e+04  1.035e+04   7.504 9.34e-14 ***
## BsR             9.258e+00  1.139e+00   8.128 7.64e-16 ***
## TB             -2.296e-01  5.510e-02  -4.166 3.23e-05 ***
## BASERUN_CS      2.439e+01  6.073e+00   4.016 6.13e-05 ***
## FIELDING_E     -1.587e+04  1.125e+03 -14.106  < 2e-16 ***
## FIELDING_DP    -6.188e-01  7.695e-02  -8.041 1.52e-15 ***
## BATTING_2B     -7.176e+00  8.600e-01  -8.344  < 2e-16 ***
## BATTING_H      -3.775e+04  6.273e+03  -6.018 2.10e-09 ***
## BATTING_SO     -2.843e-01  4.060e-02  -7.002 3.45e-12 ***
## BATTING_BB_SO  -6.454e+03  2.400e+03  -2.689 0.007230 ** 
## BASERUN_SB      1.428e+01  3.973e+00   3.596 0.000332 ***
## BATTING_3B      2.261e+01  5.444e+00   4.154 3.41e-05 ***
## PITCHING_SO_BB -6.342e+03  2.399e+03  -2.643 0.008281 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 65.46 on 1969 degrees of freedom
## Multiple R-squared:  0.3303, Adjusted R-squared:  0.3262 
## F-statistic: 80.92 on 12 and 1969 DF,  p-value: < 2.2e-16
plot(step)