KREIS HW1

Attempt

library(psych)
library(dplyr)
library(corrplot)
library(knitr)


train <- read.csv("https://raw.githubusercontent.com/bkreis84/Business-Analytics/master/HW1/moneyball-training-data.csv")

#remove the leading text "TEAM_" on the variable names to make our plots look less cluttered
colnames(train) = gsub("TEAM_", "", colnames(train))


kable(describe(train))

	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
INDEX	1	2276	1268.46353	736.34904	1270.5	1268.56970	952.5705	1	2535	2534	0.0042149	-1.2167564	15.4346788
TARGET_WINS	2	2276	80.79086	15.75215	82.0	81.31229	14.8260	0	146	146	-0.3987232	1.0274757	0.3301823
BATTING_H	3	2276	1469.26977	144.59120	1454.0	1459.04116	114.1602	891	2554	1663	1.5713335	7.2785261	3.0307891
BATTING_2B	4	2276	241.24692	46.80141	238.0	240.39627	47.4432	69	458	389	0.2151018	0.0061609	0.9810087
BATTING_3B	5	2276	55.25000	27.93856	47.0	52.17563	23.7216	0	223	223	1.1094652	1.5032418	0.5856226
BATTING_HR	6	2276	99.61204	60.54687	102.0	97.38529	78.5778	0	264	264	0.1860421	-0.9631189	1.2691285
BATTING_BB	7	2276	501.55888	122.67086	512.0	512.18331	94.8864	0	878	878	-1.0257599	2.1828544	2.5713150
BATTING_SO	8	2174	735.60534	248.52642	750.0	742.31322	284.6592	0	1399	1399	-0.2978001	-0.3207992	5.3301912
BASERUN_SB	9	2145	124.76177	87.79117	101.0	110.81188	60.7866	0	697	697	1.9724140	5.4896754	1.8955584
BASERUN_CS	10	1504	52.80386	22.95634	49.0	50.35963	17.7912	0	201	201	1.9762180	7.6203818	0.5919414
BATTING_HBP	11	191	59.35602	12.96712	58.0	58.86275	11.8608	29	95	66	0.3185754	-0.1119828	0.9382681
PITCHING_H	12	2276	1779.21046	1406.84293	1518.0	1555.89517	174.9468	1137	30132	28995	10.3295111	141.8396985	29.4889618
PITCHING_HR	13	2276	105.69859	61.29875	107.0	103.15697	74.1300	0	343	343	0.2877877	-0.6046311	1.2848886
PITCHING_BB	14	2276	553.00791	166.35736	536.5	542.62459	98.5929	0	3645	3645	6.7438995	96.9676398	3.4870317
PITCHING_SO	15	2174	817.73045	553.08503	813.5	796.93391	257.2311	0	19278	19278	22.1745535	671.1891292	11.8621151
FIELDING_E	16	2276	246.48067	227.77097	159.0	193.43798	62.2692	65	1898	1833	2.9904656	10.9702717	4.7743279
FIELDING_DP	17	1990	146.38794	26.22639	149.0	147.57789	23.7216	52	228	176	-0.3889390	0.1817397	0.5879114

#There are some extreme outliers that should be addressed. Consulting the baseball almanac (http://www.baseball-almanac.com/rb_menu.shtml), 
#we were able to find historical teams minimum and maximums values for a number of historical team stats. These were then adjusted to be 
#comparable to a 162 game season. Because of the unlikelihood of these values, we question the vailidity of the entire observation and remove it

train <- train %>% 
  filter(TARGET_WINS >=22 & TARGET_WINS <= 124 & BATTING_H <= 1876 & BATTING_2B >= 116 & BATTING_2B <= 376 & BATTING_BB >= 292 &
           BATTING_BB <= 879 & BATTING_SO >= 326 & BATTING_SO <= 1535 & PITCHING_HR <= 258 & PITCHING_SO <= 1450)

describe(train)

##             vars    n    mean     sd median trimmed    mad  min  max range
## INDEX          1 1982 1281.51 732.36 1287.5 1284.57 936.26    2 2534  2532
## TARGET_WINS    2 1982   80.94  13.90   82.0   81.28  14.83   27  123    96
## BATTING_H      3 1982 1461.22 112.85 1452.0 1456.92 105.26 1137 1876   739
## BATTING_2B     4 1982  244.25  43.29  241.0  243.34  44.48  118  373   255
## BATTING_3B     5 1982   51.98  25.59   44.0   48.93  22.24   11  165   154
## BATTING_HR     6 1982  109.48  56.11  112.0  108.49  65.23    4  257   253
## BATTING_BB     7 1982  527.58  88.18  524.0  526.51  87.47  294  878   584
## BATTING_SO     8 1982  762.89 222.29  783.5  762.80 272.06  326 1399  1073
## BASERUN_SB     9 1966  118.05  83.88   97.0  104.18  56.34   18  697   679
## BASERUN_CS    10 1466   53.01  22.91   50.0   50.48  17.79   11  201   190
## BATTING_HBP   11  190   59.42  12.97   58.0   58.93  11.86   29   95    66
## PITCHING_H    12 1982 1548.83 198.68 1505.0 1523.04 151.97 1137 2656  1519
## PITCHING_HR   13 1982  113.63  56.39  115.0  112.55  63.75    4  257   253
## PITCHING_BB   14 1982  557.29 100.20  545.5  550.19  91.18  325 1123   798
## PITCHING_SO   15 1982  799.20 216.89  811.0  796.57 247.59  345 1434  1089
## FIELDING_E    16 1982  192.81 124.43  149.0  163.82  47.44   65  965   900
## FIELDING_DP   17 1811  150.43  22.69  151.0  150.76  22.24   72  228   156
##              skew kurtosis    se
## INDEX       -0.02    -1.19 16.45
## TARGET_WINS -0.23    -0.03  0.31
## BATTING_H    0.38     0.26  2.53
## BATTING_2B   0.21    -0.35  0.97
## BATTING_3B   1.06     0.75  0.57
## BATTING_HR   0.08    -0.84  1.26
## BATTING_BB   0.19     0.24  1.98
## BATTING_SO   0.00    -1.00  4.99
## BASERUN_SB   2.24     7.25  1.89
## BASERUN_CS   2.03     7.80  0.60
## BATTING_HBP  0.31    -0.11  0.94
## PITCHING_H   1.44     2.70  4.46
## PITCHING_HR  0.10    -0.79  1.27
## PITCHING_BB  0.84     1.43  2.25
## PITCHING_SO  0.12    -0.63  4.87
## FIELDING_E   2.43     5.91  2.79
## FIELDING_DP -0.12     0.22  0.53

#There are a number of valus with a minimum of 0. A 0 value is equivalent to an NA value for each variable in the data set. 

train[train == 0] <- NA


#https://stackoverflow.com/questions/4787332/how-to-remove-outliers-from-a-dataset


kable(describe(train))

	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
INDEX	1	1982	1281.51413	732.35847	1287.5	1284.56620	936.2619	2	2534	2532	-0.0222668	-1.1935109	16.4502265
TARGET_WINS	2	1982	80.94147	13.90015	82.0	81.27995	14.8260	27	123	96	-0.2318537	-0.0317418	0.3122250
BATTING_H	3	1982	1461.22200	112.85252	1452.0	1456.91677	105.2646	1137	1876	739	0.3833650	0.2584343	2.5348918
BATTING_2B	4	1982	244.25429	43.28734	241.0	243.33733	44.4780	118	373	255	0.2061959	-0.3473562	0.9723197
BATTING_3B	5	1982	51.97830	25.59300	44.0	48.92749	22.2390	11	165	154	1.0646465	0.7519322	0.5748696
BATTING_HR	6	1982	109.47931	56.10561	112.0	108.48802	65.2344	4	257	253	0.0766060	-0.8395430	1.2602436
BATTING_BB	7	1982	527.58426	88.18040	524.0	526.51324	87.4734	294	878	584	0.1929183	0.2403791	1.9807070
BATTING_SO	8	1982	762.89455	222.29120	783.5	762.79887	272.0571	326	1399	1073	-0.0003035	-1.0015071	4.9931021
BASERUN_SB	9	1966	118.05137	83.88476	97.0	104.17662	56.3388	18	697	679	2.2434765	7.2507184	1.8918700
BASERUN_CS	10	1466	53.00955	22.91444	50.0	50.48041	17.7912	11	201	190	2.0259505	7.7988460	0.5984698
BATTING_HBP	11	190	59.41579	12.97498	58.0	58.93421	11.8608	29	95	66	0.3094528	-0.1127227	0.9413037
PITCHING_H	12	1982	1548.82846	198.68054	1505.0	1523.03594	151.9665	1137	2656	1519	1.4410110	2.7010412	4.4627598
PITCHING_HR	13	1982	113.62563	56.38617	115.0	112.55359	63.7518	4	257	253	0.0956767	-0.7914657	1.2665455
PITCHING_BB	14	1982	557.29162	100.19764	545.5	550.18726	91.1799	325	1123	798	0.8418509	1.4287787	2.2506382
PITCHING_SO	15	1982	799.19728	216.88816	811.0	796.56620	247.5942	345	1434	1089	0.1180771	-0.6339600	4.8717391
FIELDING_E	16	1982	192.81282	124.42942	149.0	163.82346	47.4432	65	965	900	2.4348990	5.9141829	2.7949321
FIELDING_DP	17	1811	150.43457	22.68564	151.0	150.75500	22.2390	72	228	156	-0.1228728	0.2216871	0.5330792

trainA <- train %>% 
  select_if(colSums(is.na(train))>0)

library(ggplot2)

plot_Missing <- function(data_in, title = NULL){
  temp_df <- as.data.frame(ifelse(is.na(data_in), 0, 1))
  temp_df <- temp_df[,order(colSums(temp_df))]
  data_temp <- expand.grid(list(x = 1:nrow(temp_df), y = colnames(temp_df)))
  data_temp$m <- as.vector(as.matrix(temp_df))
  data_temp <- data.frame(x = unlist(data_temp$x), y = unlist(data_temp$y), m = unlist(data_temp$m))
  ggplot(data_temp) + geom_tile(aes(x=x, y=y, fill=factor(m))) + scale_fill_manual(values=c("red", "white"), name="Missing\n(0=Yes, 1=No)") + theme_light() + ylab("") + xlab("") + ggtitle(title)
}
#https://www.kaggle.com/notaapple/detailed-exploratory-data-analysis-using-r

plot_Missing(trainA)

#Percentage of missing values per variable
kable(colSums(is.na(train))/colSums(count(train))*100)

	x
INDEX	0.0000000
TARGET_WINS	0.0000000
BATTING_H	0.0000000
BATTING_2B	0.0000000
BATTING_3B	0.0000000
BATTING_HR	0.0000000
BATTING_BB	0.0000000
BATTING_SO	0.0000000
BASERUN_SB	0.8072654
BASERUN_CS	26.0343088
BATTING_HBP	90.4137235
PITCHING_H	0.0000000
PITCHING_HR	0.0000000
PITCHING_BB	0.0000000
PITCHING_SO	0.0000000
FIELDING_E	0.0000000
FIELDING_DP	8.6276488
Less than 1 pe	rcent of the HR and BB data is missing, while 91.6% of the hit by pitch data is missing. The extent to which this data is missing leads me to suggest removing the variable altogether

For the other data, we can impute values using k nearest neighbors, which takes the average of the 10 nearest values weighted by their euclidean distance

library(DMwR)

newData <- train %>% 
  select(-BATTING_HBP)
  
newData <- knnImputation(newData, k=10)
anyNA(newData)

## [1] FALSE

ggplot(stack(newData[2:16]), aes(values))+
  facet_wrap(~ind, scales = "free") + 
  geom_histogram() +
  theme(legend.position="none")

ggplot(stack(newData[2:16]), aes(x = ind, y = values, fill=ind))+
  facet_wrap(~ind, scales = "free") + 
  geom_boxplot() +
  theme(legend.position="none")

One of the questions that comes to mind is whether our test set contains years that are more current and our training set contains all of the earlier data. If that is the case, the game of baseball was much different in the past, with less power-hitters just being one example.

#correlation plot
par(mfrow=c(1,1)) 
corr <- cor(newData[2:16])
corrplot(corr, order = "hclust")

Home runs for hitters and home runs allowed by pitchers are highly correlated (0.97). In baseball there are what are referred to as pitchers parks and hitters parks. Because teams play half their games at home, if they are in a hitters park, you would expect the team to hit more home runs and for their pitchers to give up more home runs. The variables with a positive correlation to wins appear to be hits, doubles and walks to a lesser degree

Total bases is an additional stat that we can add. It seems most other additional statistics can not be achieved without knowing the number of at bats or innings pitched. Total bases takes into account the production underlying each hit giving a heavier weighting to doubles, triples and home runs

Baseball reference has a list of the average at bats per game since 1871. We average these values and multiply by 162 to get an approximation of the average at bats.

newData <- newData %>% 
  mutate(TB = (BATTING_H - BATTING_2B - BATTING_3B - BATTING_HR + BATTING_2B * 2 + BATTING_3B * 3 + BATTING_HR * 4)) %>% 
  mutate(WHGP = (PITCHING_H + PITCHING_BB)/162) %>% 
  mutate(PITCHING_SO_BB = PITCHING_SO/PITCHING_BB) %>% 
  mutate(BATTING_BB_SO = BATTING_BB/BATTING_SO)


#Baseball Reference data
AVG <- read.csv("https://raw.githubusercontent.com/bkreis84/Business-Analytics/master/HW1/Baseball%20Reference%20AVG.csv")
avgAB <- mean(AVG$AB, na.rm = TRUE)*162


newData <- newData %>% 
  mutate(BsR = (((BATTING_H + BATTING_BB - BATTING_HR) * ((1.4*TB - .6*BATTING_H -3*BATTING_HR +0.1*BATTING_BB)*1.02)) /
                  (((1.4*TB - .6*BATTING_H -3*BATTING_HR +.1*BATTING_BB)*1.02) + avgAB - BATTING_H)) + BATTING_HR)

#correlation plot
par(mfrow=c(1,1)) 
corr <- cor(newData[2:18])
corrplot(corr, order = "hclust")

The new variables have some collinearity which we can investigate further

Next we can check to see which variables should be removed based on having realtively few unique values compared to the sample size and the large ratio of the most common value to the second most common value. There were none to remove in this case

library(caret)
nearZeroVar(newData)

## integer(0)

BoxCox transformation applied

#BoxCox transformation applied
set.seed(4234)
trans <- preProcess(newData, method = "BoxCox")

transformed <- predict(trans, newData)


ggplot(stack(transformed[2:21]), aes(values))+
  facet_wrap(~ind, scales = "free") + 
  geom_histogram() +
  theme(legend.position="none")

ggplot(stack(transformed[2:21]), aes(x = ind, y = values, fill=ind))+
  facet_wrap(~ind, scales = "free") + 
  geom_boxplot() +
  theme(legend.position="none")

The following creates a model from all of the candidate variables and then a stepwise method is used to find out which variables improve the model

#All variables
model <- lm(TARGET_WINS ~ . -INDEX, data = transformed)
summary(model)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX, data = transformed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -230.128  -43.745   -1.317   42.153  310.821 
## 
## Coefficients: (1 not defined because of singularities)
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5.505e+04  1.975e+04   2.787 0.005363 ** 
## BATTING_H      -2.595e+04  1.308e+04  -1.984 0.047441 *  
## BATTING_2B     -6.173e+00  1.338e+00  -4.614 4.20e-06 ***
## BATTING_3B      2.952e+01  8.555e+00   3.451 0.000571 ***
## BATTING_HR      8.997e+00  2.683e+00   3.353 0.000816 ***
## BATTING_BB      1.323e+00  1.034e+00   1.279 0.200899    
## BATTING_SO     -5.770e-01  9.734e-02  -5.928 3.61e-09 ***
## BASERUN_SB      1.485e+01  4.155e+00   3.574 0.000360 ***
## BASERUN_CS      2.258e+01  6.242e+00   3.617 0.000306 ***
## PITCHING_H             NA         NA      NA       NA    
## PITCHING_HR    -7.975e+00  2.315e+00  -3.445 0.000584 ***
## PITCHING_BB    -1.382e+03  9.998e+02  -1.382 0.167099    
## PITCHING_SO     2.396e+00  9.406e-01   2.548 0.010917 *  
## FIELDING_E     -1.633e+04  1.202e+03 -13.591  < 2e-16 ***
## FIELDING_DP    -6.199e-01  7.777e-02  -7.971 2.65e-15 ***
## TB             -2.177e-01  1.435e-01  -1.516 0.129605    
## WHGP            1.345e+04  1.531e+04   0.879 0.379716    
## PITCHING_SO_BB -6.295e+03  2.391e+03  -2.633 0.008526 ** 
## BATTING_BB_SO  -6.379e+03  2.392e+03  -2.667 0.007712 ** 
## BsR             7.516e+00  2.390e+00   3.144 0.001689 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 65.08 on 1963 degrees of freedom
## Multiple R-squared:   0.34,  Adjusted R-squared:  0.334 
## F-statistic: 56.18 on 18 and 1963 DF,  p-value: < 2.2e-16

#stepwise approach
step <- lm(TARGET_WINS ~ BsR + TB + BASERUN_CS + FIELDING_E + FIELDING_DP + BATTING_2B + BATTING_H + BATTING_SO + BATTING_BB_SO + BASERUN_SB + BATTING_3B +BASERUN_SB + PITCHING_SO_BB + BATTING_BB_SO, data = transformed)
summary(step)

## 
## Call:
## lm(formula = TARGET_WINS ~ BsR + TB + BASERUN_CS + FIELDING_E + 
##     FIELDING_DP + BATTING_2B + BATTING_H + BATTING_SO + BATTING_BB_SO + 
##     BASERUN_SB + BATTING_3B + BASERUN_SB + PITCHING_SO_BB + BATTING_BB_SO, 
##     data = transformed)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -241.06  -43.18   -0.59   41.32  349.94 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     7.767e+04  1.035e+04   7.504 9.34e-14 ***
## BsR             9.258e+00  1.139e+00   8.128 7.64e-16 ***
## TB             -2.296e-01  5.510e-02  -4.166 3.23e-05 ***
## BASERUN_CS      2.439e+01  6.073e+00   4.016 6.13e-05 ***
## FIELDING_E     -1.587e+04  1.125e+03 -14.106  < 2e-16 ***
## FIELDING_DP    -6.188e-01  7.695e-02  -8.041 1.52e-15 ***
## BATTING_2B     -7.176e+00  8.600e-01  -8.344  < 2e-16 ***
## BATTING_H      -3.775e+04  6.273e+03  -6.018 2.10e-09 ***
## BATTING_SO     -2.843e-01  4.060e-02  -7.002 3.45e-12 ***
## BATTING_BB_SO  -6.454e+03  2.400e+03  -2.689 0.007230 ** 
## BASERUN_SB      1.428e+01  3.973e+00   3.596 0.000332 ***
## BATTING_3B      2.261e+01  5.444e+00   4.154 3.41e-05 ***
## PITCHING_SO_BB -6.342e+03  2.399e+03  -2.643 0.008281 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 65.46 on 1969 degrees of freedom
## Multiple R-squared:  0.3303, Adjusted R-squared:  0.3262 
## F-statistic: 80.92 on 12 and 1969 DF,  p-value: < 2.2e-16

plot(step)

KREIS HW1

Brian Kreis

February 24, 2018

Attempt