##            installed_and_loaded.packages.
## prettydoc                            TRUE
## psych                                TRUE
## knitr                                TRUE
## tidyverse                            TRUE
## ggthemes                             TRUE
## corrplot                             TRUE
## Hmisc                                TRUE
## data.table                           TRUE
## missForest                           TRUE
## mltools                              TRUE
## htmlTable                            TRUE
## broom                                TRUE
## MLmetrics                            TRUE

Prompt and Data Overview

In this homework assignment, you will explore, analyze and model a data set containing approximately 2200 records.

Each record represents a professional baseball team from the years 1871 to 2006 inclusive. Each record has the performance of the team for the given year, with all of the statistics adjusted to match the performance of a 162 game season.

Your objective is to build a multiple linear regression model on the training data to predict the number of wins for the team. You can only use the variables given to you (or variables that you derive from the variables provided). Below is a short description of the variables of interest in the data set:

1. DATA EXPLORATION

Non-visual exploration

The data set contains 15 explanatory variables, and they can be categorized into four groups:

7 batting metrics
2 baserunning metrics
2 fielding metrics
4 pitching metrics

Data Inspection

Let’s take a look at the data’s non-statistical aspects.

In the table below, we that the data set contains all integers.
The data set has less than 10% complete cases.
Six variables are missing values with TEAM_BATTING_HBP, TEAM_BASERUN_CS & TEAM_FIELDING_DP being the most sparse with 92%, 34% & 13% NAs, respectively.
We will have to drop them or try to impute the missing values.

vars	class_type	n_rows	complete_cases	NA_ct	NA_pct	unique_value_ct	most_common_values
TARGET_WINS	integer	2276	191	0	0.0	108	83; 79; 85; 86; 87
TEAM_BATTING_H	integer	2276	191	0	0.0	569	1458; 1410; 1475; 1400
TEAM_BATTING_2B	integer	2276	191	0	0.0	240	227; 235; 259; 220
TEAM_BATTING_3B	integer	2276	191	0	0.0	144	35; 29; 38; 40
TEAM_BATTING_HR	integer	2276	191	0	0.0	243	21; 108; 17; 29
TEAM_BATTING_BB	integer	2276	191	0	0.0	533	502; 492; 497; 503
TEAM_BATTING_SO	integer	2276	191	102	4.5	823	NA’s; 0; 952; 504
TEAM_BASERUN_SB	integer	2276	191	131	5.8	349	NA’s; 65; 69; 73
TEAM_BASERUN_CS	integer	2276	191	772	33.9	129	NA’s; 52; 46; 50; 37
TEAM_BATTING_HBP	integer	2276	191	2085	91.6	56	NA’s; 54; 56; 72; 50
TEAM_PITCHING_H	integer	2276	191	0	0.0	843	1494; 1400; 1398; 1404
TEAM_PITCHING_HR	integer	2276	191	0	0.0	256	114; 22; 46; 138
TEAM_PITCHING_BB	integer	2276	191	0	0.0	535	536; 467; 495; 529
TEAM_PITCHING_SO	integer	2276	191	102	4.5	824	NA’s; 0; 797; 825
TEAM_FIELDING_E	integer	2276	191	0	0.0	549	122; 134; 139; 116
TEAM_FIELDING_DP	integer	2276	191	286	12.6	145	NA’s; 148; 156; 144

Statistical Summary

	n	min	Q.1st	median	mean	Q.3rd	max	range	sd	se	skew	kurtosis
TARGET_WINS	2,276	0	71	82	80.8	92	146	146	15.8	0.3	-0.4	1
TEAM_BATTING_H	2,276	891	1,383	1,454	1,469.3	1,537.2	2,554	1,663	144.6	3	1.6	7.3
TEAM_BATTING_2B	2,276	69	208	238	241.2	273	458	389	46.8	1	0.2	0
TEAM_BATTING_3B	2,276	0	34	47	55.2	72	223	223	27.9	0.6	1.1	1.5
TEAM_BATTING_HR	2,276	0	42	102	99.6	147	264	264	60.5	1.3	0.2	-1
TEAM_BATTING_BB	2,276	0	451	512	501.6	580	878	878	122.7	2.6	-1	2.2
TEAM_BATTING_SO	2,174	0	548	750	735.6	930	1,399	1,399	248.5	5.3	-0.3	-0.3
TEAM_BASERUN_SB	2,145	0	66	101	124.8	156	697	697	87.8	1.9	2	5.5
TEAM_BASERUN_CS	1,504	0	38	49	52.8	62	201	201	23	0.6	2	7.6
TEAM_BATTING_HBP	191	29	50.5	58	59.4	67	95	66	13	0.9	0.3	-0.1
TEAM_PITCHING_H	2,276	1,137	1,419	1,518	1,779.2	1,682.5	30,132	28,995	1,406.8	29.5	10.3	141.8
TEAM_PITCHING_HR	2,276	0	50	107	105.7	150	343	343	61.3	1.3	0.3	-0.6
TEAM_PITCHING_BB	2,276	0	476	536.5	553	611	3,645	3,645	166.4	3.5	6.7	97
TEAM_PITCHING_SO	2,174	0	615	813.5	817.7	968	19,278	19,278	553.1	11.9	22.2	671.2
TEAM_FIELDING_E	2,276	65	127	159	246.5	249.2	1,898	1,833	227.8	4.8	3	11
TEAM_FIELDING_DP	1,990	52	131	149	146.4	164	228	176	26.2	0.6	-0.4	0.2

Visual Exploration

Boxplots

In the boxplots below, we see that the variance of some of the explanatory variables greatly exceeds the variance of the response games-won variable.
According to the summary statistics table above, the TARGET_WIN’s standard deviation is the 2nd smallest of all the variables.
A few of them have so many outliers that we may be dealing with non-unimodal distributions.
The data set has 193 observations that are more extreme than the 1.5 * IQR of the boxplot whiskers.

Histograms

The histograms below confirm that both the batting & pitching home-run variables are bimodal as well as the batting strike-out variable.
Many variables have significant right skews.
TEAM_BATTING_BB & TEAM_FIELDING_DP are left-skewed.
The distributions for TEAM_BATTING_HBP, TEAM_BASERUN_CS & TEAM_FIELDING_DP are centered away from zero, so it seems unlikely that all the NAs are mislabelled zero values.

Correlations

In the correlation table and plot below, we see 4 pairs of highly correlated variables. They are corresponding metrics for batting and pitching: home runs, walks, strike outs & hits (rows 1-4). These may present multicollinearity issues in our modeling.
Rows 8 to 11 show batting & pitching hits and walks are moderately correlated to our response variable. Let’s consider using them for one our models.

Top 12 Correlated Variable Pairs
	Var1	Var2	Correlation
1	TEAM_PITCHING_HR	TEAM_BATTING_HR	0.9999
2	TEAM_PITCHING_BB	TEAM_BATTING_BB	0.9999
3	TEAM_PITCHING_SO	TEAM_BATTING_SO	0.9998
4	TEAM_PITCHING_H	TEAM_BATTING_H	0.9992
5	TEAM_BASERUN_CS	TEAM_BASERUN_SB	0.6247
6	TEAM_BATTING_2B	TEAM_BATTING_H	0.5618
7	TEAM_PITCHING_H	TEAM_BATTING_2B	0.5605
8	TEAM_PITCHING_H	TARGET_WINS	0.4712
9	TEAM_BATTING_H	TARGET_WINS	0.4699
10	TEAM_BATTING_BB	TARGET_WINS	0.4687
11	TEAM_PITCHING_BB	TARGET_WINS	0.4684
12	TEAM_PITCHING_HR	TEAM_BATTING_BB	0.4566

2. DATA PREPARATION

Creating Bins

From the histograms and summary statitics, TEAM_PITCHING_H seems to have the extreme outliers, so let’s put these values into quintile bins to mitigate their effect.

train_raw$TEAM_PITCHING_H <- bin_data(train_raw$TEAM_PITCHING_H, bins = 5, binType = "quantile") 

levels(train_raw$TEAM_PITCHING_H) <- c("One","Two","Three","Four","Five")

Missing Value Imputation

Let’s use the missForest package to do nonparametric missing-value imputation using Random Forest.

First, let’s try leaving in TEAM_BATTING_HPB, even though it has all of those NAs.

When we check the out-of-the-bag error, we would like to the normalized root mean-squares error (NRMSE) near zero, which would indicate a well-fitted imputation. However, as suspected, the normalized root mean-squares error (NRMSE) for TEAM_BATTING_HBP is nearly twice as large as the next highest NRMSE. Let’s remove it and impute the missing values again.

Source: Accuracy and Errors for Models

##   missForest iteration 1 in progress...done!
##   missForest iteration 2 in progress...done!
##   missForest iteration 3 in progress...done!
##   missForest iteration 4 in progress...done!
##   missForest iteration 5 in progress...done!

variable	NA_ct	NA_pct	MSE	RMSE	NRMSE
TEAM_BATTING_HBP	2085	91.61	194.34	13.94	0.21
TEAM_FIELDING_DP	286	12.57	341.17	18.47	0.10
TEAM_BASERUN_CS	772	33.92	178.53	13.36	0.07
TEAM_BASERUN_SB	131	5.76	1637.94	40.47	0.06
TEAM_BATTING_SO	102	4.48	1827.35	42.75	0.03
TEAM_PITCHING_SO	102	4.48	130365.14	361.06	0.02

The remaining 5 imputed variables have NRMSE values have better fits and range from .02 to .11.

##   missForest iteration 1 in progress...done!
##   missForest iteration 2 in progress...done!
##   missForest iteration 3 in progress...done!
##   missForest iteration 4 in progress...done!
##   missForest iteration 5 in progress...done!

variable	NA_ct	NA_pct	MSE	RMSE	NRMSE
TEAM_FIELDING_DP	286	12.57	346.87	18.62	0.11
TEAM_BASERUN_CS	772	33.92	185.08	13.60	0.07
TEAM_BASERUN_SB	131	5.76	1629.75	40.37	0.06
TEAM_BATTING_SO	102	4.48	2467.60	49.67	0.04
TEAM_PITCHING_SO	102	4.48	172102.31	414.85	0.02

3. BUILD MODELS

Model 1: All Variables

First, let’s build a model with all variables.

## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = train_imputed)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.389  -8.298   0.195   8.167  54.913 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          28.6409850  5.7749125   4.960 7.59e-07 ***
## TEAM_BATTING_H        0.0467974  0.0036512  12.817  < 2e-16 ***
## TEAM_BATTING_2B      -0.0080999  0.0091590  -0.884  0.37659    
## TEAM_BATTING_3B       0.0248277  0.0166190   1.494  0.13533    
## TEAM_BATTING_HR       0.0655984  0.0277124   2.367  0.01801 *  
## TEAM_BATTING_BB       0.0123925  0.0052297   2.370  0.01789 *  
## TEAM_BATTING_SO      -0.0195892  0.0026891  -7.285 4.43e-13 ***
## TEAM_BASERUN_SB       0.0470115  0.0054356   8.649  < 2e-16 ***
## TEAM_BASERUN_CS       0.0262674  0.0156964   1.673  0.09437 .  
## TEAM_PITCHING_HTwo   -0.9872101  0.8882665  -1.111  0.26652    
## TEAM_PITCHING_HThree -1.0974513  0.9532332  -1.151  0.24973    
## TEAM_PITCHING_HFour  -4.3120963  1.1048503  -3.903 9.78e-05 ***
## TEAM_PITCHING_HFive  -3.1433220  1.4102628  -2.229  0.02592 *  
## TEAM_PITCHING_HR      0.0291162  0.0247491   1.176  0.23954    
## TEAM_PITCHING_BB     -0.0031523  0.0035931  -0.877  0.38041    
## TEAM_PITCHING_SO      0.0024559  0.0009026   2.721  0.00656 ** 
## TEAM_FIELDING_E      -0.0322373  0.0023362 -13.799  < 2e-16 ***
## TEAM_FIELDING_DP     -0.1123643  0.0140611  -7.991 2.11e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.65 on 2258 degrees of freedom
## Multiple R-squared:  0.3602, Adjusted R-squared:  0.3554 
## F-statistic: 74.77 on 17 and 2258 DF,  p-value: < 2.2e-16

In the model output above, we notice the following:

Of the 14 continuous & intercept variables, 9 have statistically significant p-values at the 5% significance level. In the 3rd model, we will explore which of the variables are actually necessary to a parsimonious model.
Only 2 out of the 4 TEAM_PITCHING_H quintile dummy variables are statistically significant, so let’s check an ANOVA summary to make sure that the group means are significantly different. The F-statistic’s p-value is near zero, so we would reject the null hypothesis that the means are equal and keep the dummy variables in the model.

##                   Df Sum Sq Mean Sq F value Pr(>F)    
## TEAM_PITCHING_H    4  22687    5672   23.77 <2e-16 ***
## Residuals       2271 541810     239                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

While we would expect negative coefficients for Fielding Errors, Walks Allowed & Batting Strikeouts, we wouldn’t expect negative values for Double Plays & Batting Doubles. Additionally, it’s surprising that Caught Stealing & Home Runs Allowed do not have negative coefficients. While none of the variables are perfectly collinear and were automatically removed from the model, these unexpected signs may indicate that we have some.
At 0.3602, the adjusted \(R^2\) is not as high as we would prefer, and it indicates that only 36% of the variance in the response variable can be explained by the predictor variables.
At 75, the F-statistic is large, and the model’s p-value is near zero. If the model’s diagnostics are sufficient, these values indicate that we would reject the null hypothesis that there is no relationship between the explanatory & response variables.

Model 2: Only Highly Correlated Variables

From our Correlated Variable Pairs table, let’s use only the 4 explanatory variables that were most correlated with the response variable.

##               Var1        Var2 Correlation
## 1  TEAM_PITCHING_H TARGET_WINS   0.4712343
## 2   TEAM_BATTING_H TARGET_WINS   0.4699467
## 3  TEAM_BATTING_BB TARGET_WINS   0.4686879
## 4 TEAM_PITCHING_BB TARGET_WINS   0.4683988

## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_H + TEAM_BATTING_H + 
##     TEAM_BATTING_BB + TEAM_PITCHING_BB, data = train_imputed)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.728  -8.963   0.394   9.085  65.012 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -6.052105   3.788542  -1.597  0.11030    
## TEAM_PITCHING_HTwo    0.092667   0.955466   0.097  0.92275    
## TEAM_PITCHING_HThree -0.034295   1.005300  -0.034  0.97279    
## TEAM_PITCHING_HFour  -3.315098   1.095721  -3.025  0.00251 ** 
## TEAM_PITCHING_HFive  -1.341073   1.301417  -1.030  0.30290    
## TEAM_BATTING_H        0.049052   0.002722  18.018  < 2e-16 ***
## TEAM_BATTING_BB       0.036671   0.003172  11.563  < 2e-16 ***
## TEAM_PITCHING_BB     -0.004882   0.002201  -2.218  0.02667 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.87 on 2268 degrees of freedom
## Multiple R-squared:  0.2275, Adjusted R-squared:  0.2251 
## F-statistic: 95.41 on 7 and 2268 DF,  p-value: < 2.2e-16

In the Model #2 output above, we notice the following:

All 3 continuous variables are statistically significant, but this time the intercept is not.
Only 1 of the 4 TEAM_PITCHING_H dummy variables is statistically significant.
We would expect a negative coefficient for Walks Allowed and positive ones for Batting Walks & Batting Hits. However, 1 of the 4 Hits Allowed dummy variables does not have a negative coefficient, and this unexpected coefficient sign may indicate that we still have some collinearity.
At 0.2251, the adjusted \(R^2\) indicates that this model explains less of variance in the response variable than Model #1.
At 95, the F-statistic is larger than model #1, and the model’s p-value is near zero. If the model’s diagnostics are sufficient, these values indicate that we would reject the null hypothesis that there is no relationship between the explanatory & response variables.

Model 3: Backwards Elimination

Let’s start with Model #1, but let’s remove the 4 variables that appeared to have multicollinearity issues because of their unexpected coefficient signs.
Then we will remove one variable at a time by greatest coefficient p-value until we have a parsimonious model with only statistically significant variables.
Along the way, we will take note if the removal of a variable causes a significant changes in adjusted R-squared and coefficient estimates.
Lastly, TEAM_PITCHING_SO was removed because it didn’t have the expected sign.

## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HR + 
##     TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_FIELDING_E, data = train_imputed)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.551  -8.678   0.128   8.514  53.122 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     25.629114   4.237642   6.048 1.71e-09 ***
## TEAM_BATTING_H   0.039267   0.002512  15.631  < 2e-16 ***
## TEAM_BATTING_HR  0.071958   0.008755   8.219 3.41e-16 ***
## TEAM_BATTING_SO -0.014925   0.002190  -6.814 1.21e-11 ***
## TEAM_BASERUN_SB  0.066361   0.003767  17.616  < 2e-16 ***
## TEAM_FIELDING_E -0.030775   0.001665 -18.482  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.93 on 2270 degrees of freedom
## Multiple R-squared:  0.3273, Adjusted R-squared:  0.3258 
## F-statistic: 220.9 on 5 and 2270 DF,  p-value: < 2.2e-16

In the Model #3 output above, we notice the following:

All 5 continuous variables & and the intercept are statistically significant. Their p-values are near zero.
Unlike the previous models, we do not have any unexpected coefficient signs, which means that we have removed at least some of the multilinearity.
At 0.3258, the adjusted \(R^2\) indicates that this model explains 33% of the variance in the response variable.
At 221, the F-statistic is larger than the other 2 models, and the model’s p-value is near zero. If the model’s diagnostics are sufficient, these values indicate that we would reject the null hypothesis that there is no relationship between the explanatory & response variables.

4. SELECT MODEL

Evaluation

The table below summarizes the 3 models.

While Model #1 has the highest adjusted R-squared, it also had 5 variables that were not statistically significant, and 4 variables with unexpected signs, which indicated multicollinearity issues.
Model #2 used fewer variables, but the intercept coefficient was not statistically significant. We also saw some evidence of collinearity.
Of the 3, Model #3 is the best. Removing the non-significant and collinear variables makes it more parsimonious with an adjusted R-squared that is less than Model #1 but much greater than Model #2. This model’s F-statistic is also significantly larger than the other 2. Additionally, the difference between Model #3’s predicted R-squared and R-squared is smaller than the other models, giving us more confidence that the model is not overfit.

model_name	n_predictors	pred.r.squared	r.squared	adj.r.squared	sigma	statistic	df
mod1	14	0.3326	0.3602	0.3554	12.6474	74.7676	18
mod2	4	0.2146	0.2275	0.2251	13.8663	95.4145	8
mod3	5	0.3225	0.3273	0.3258	12.9336	220.9222	6

Residual Plots

In the Q-Q plot, we see some deviations for the normal distribution at the ends. Some of the extreme cases have been labeled. However, as the LMR text suggests, perhaps we can relax the normality assumption since we have more than 2200 observations.
In the residuals-fitted and standardzied residuals-fitted plots, the best fitted line has a curve in the main cluster, which indicates that we do not have constant variance.
Ideally, prior to making any inferences, we would want to attempt to transform the data in order to better meet the regression assumptions.

Test Model

Explore

Let’s load the data, and look at our NA rates.

vars	class_type	n_rows	complete_cases	NA_ct	NA_pct	unique_value_ct	most_common_values
TEAM_BATTING_H	integer	259	19	0	0.0	199	1420; 1450; 1381; 1385
TEAM_BATTING_2B	integer	259	19	0	0.0	137	218; 223; 225; 239
TEAM_BATTING_3B	integer	259	19	0	0.0	91	36; 39; 46; 52; 53
TEAM_BATTING_HR	integer	259	19	0	0.0	141	130; 29; 98; 18
TEAM_BATTING_BB	integer	259	19	0	0.0	185	506; 511; 435; 463
TEAM_BATTING_SO	integer	259	19	18	6.9	207	NA’s; 629; 677; 1008
TEAM_BASERUN_SB	integer	259	19	13	5.0	154	NA’s; 72; 54; 59
TEAM_BASERUN_CS	integer	259	19	87	33.6	69	NA’s; 56; 32; 39; 40
TEAM_BATTING_HBP	integer	259	19	240	92.7	18	NA’s; 62; 63; 42; 46
TEAM_PITCHING_H	integer	259	19	0	0.0	210	1450; 1453; 1457; 1489
TEAM_PITCHING_HR	integer	259	19	0	0.0	144	62; 112; 25; 43
TEAM_PITCHING_BB	integer	259	19	0	0.0	199	521; 438; 471; 474
TEAM_PITCHING_SO	integer	259	19	18	6.9	206	NA’s; 527; 554; 715
TEAM_FIELDING_E	integer	259	19	0	0.0	168	130; 137; 126; 135
TEAM_FIELDING_DP	integer	259	19	31	12.0	91	NA’s; 146; 174; 156; 157

Transform & Impute Missing Values

We see similar patterns of missing values, so let’s do the following:

bin TEAM_PITCHING_H
drop the sparsely populated TEAM_BATTING_HBP
and impute the missing values

Since we have fewer cases, the NRMSE values are a little higher with the test data.

##   missForest iteration 1 in progress...done!
##   missForest iteration 2 in progress...done!
##   missForest iteration 3 in progress...done!
##   missForest iteration 4 in progress...done!

variable	NA_ct	NA_pct	MSE	RMSE	NRMSE
TEAM_FIELDING_DP	31	11.97	345.79	18.60	0.14
TEAM_BASERUN_CS	87	33.59	298.59	17.28	0.11
TEAM_BASERUN_SB	13	5.02	2924.34	54.08	0.09
TEAM_BATTING_SO	18	6.95	7168.02	84.66	0.07
TEAM_PITCHING_SO	18	6.95	377291.26	614.24	0.06
### Predict

test_results <- predict(mod3, newdata = test_imputed)

Code Appendix

knitr::opts_chunk$set(
                      error = FALSE
                      ,message = FALSE
                      #,tidy = TRUE
                      ,cache = TRUE
                      )
#required packages
packages <- c("prettydoc", "psych", "knitr", "tidyverse", "ggthemes", "corrplot", "Hmisc", "data.table", "missForest", "mltools", "htmlTable", "broom", "MLmetrics") 
#, "pastecs"

#see if we need to install any of them
installed_and_loaded <- function(pkg){
  #CODE SOURCE: https://gist.github.com/stevenworthington/3178163
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg)) install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE, quietly = TRUE, warn.conflicts = FALSE)
}

#excute function and display the loaded packages
data.frame(installed_and_loaded(packages))
train_raw <- read.csv("https://raw.githubusercontent.com/kylegilde/D621-Data-Mining/master/HW1%20Moneyball/moneyball-training-data.csv")

train_raw <- train_raw %>% select(-INDEX)

test_raw <- dplyr::select(read.csv("https://raw.githubusercontent.com/kylegilde/D621-Data-Mining/master/HW1%20Moneyball/moneyball-evaluation-data.csv"), - INDEX)

set.seed(5) 


metadata <- function(df){
  #Takes a data frame & Checks NAs, class types, inspects the unique values
  df_len <- nrow(df)
  NA_ct = as.vector(rapply(df, function(x) sum(is.na(x))))

  #create dataframe  
  df_metadata <- data.frame(
    vars = names(df),
    class_type = rapply(lapply(df, class), function(x) x[[1]]),
    n_rows = rapply(df, length),
    complete_cases = sum(complete.cases(df)),
    NA_ct = NA_ct,
    NA_pct = NA_ct / df_len * 100,
    unique_value_ct = rapply(df, function(x) length(unique(x))),
    most_common_values = rapply(df, function(x) str_replace(paste(names(sort(summary(as.factor(x)), decreasing=T))[1:5], collapse = '; '), "\\(Other\\); ", ""))
  )
 rownames(df_metadata) <- NULL
 return(df_metadata)
}

meta_df <- metadata(train_raw)


kable(meta_df, digits = 1) 
metrics <- function(df){
  ###Creates summary metrics table
  metrics_only <- df[, which(rapply(lapply(df, class), function(x) x[[1]]) %in% c("numeric", "integer"))]
  
  df_metrics <- psych::describe(metrics_only, quant = c(.25,.75))
  
  df_metrics <- 
    dplyr::select(df_metrics, n, min, Q.1st = Q0.25, median, mean, Q.3rd = Q0.75, 
    max, range, sd, se, skew, kurtosis
  )
  
  return(df_metrics)
}

metrics_df <- metrics(train_raw)

kable(metrics_df, digits = 1, format.args = list(big.mark = ',', scientific = F, drop0trailing = T))
#calculate some parameters to deal with the outliers
train_stacked <- na.omit(stack(train_raw))
bpstats <- boxplot(values ~ ind, data = train_stacked, plot = F)$stats
ylimits <- c(0, ceiling(max(bpstats) / 200)) * 200
ybreaks <- seq(ylimits[1], ylimits[2], by = 200)
outliers_not_shown <- paste(sum(train_stacked$values > max(ylimits)), "outlier(s) not displayed")

ggplot(data = train_stacked, mapping = aes(x = ind, y = values)) + 
  geom_boxplot(outlier.size = 1) +
  labs(caption = paste("Red dot = mean", outliers_not_shown, sep = "\n")) +
  scale_x_discrete(limits = rev(levels(train_stacked$ind))) +
  scale_y_continuous(breaks = ybreaks) +
  stat_summary(fun.y=mean, geom="point", size=2, color = "red") +
  coord_flip(ylim = ylimits) +
  theme_fivethirtyeight()


hist.data.frame(train_raw)
cormatrix <- cor(drop_na(train_raw))

#find the top correlations
cor_df <- data.frame(Var1=rownames(cormatrix)[row(cormatrix)],
                     Var2=colnames(cormatrix)[col(cormatrix)],
                     Correlation=c(cormatrix))


corr_list <- 
  cor_df %>% 
  filter(Var1 != Var2) %>% 
  arrange(-Correlation)

#dedupe the rows
sort_rows <- t(apply(corr_list, 1, sort, decreasing = T))
fin_list <- corr_list[!duplicated(sort_rows), ]
rownames(fin_list) <- 1:nrow(fin_list)
#print table
kable(head(fin_list, 12), digits=4, row.names = T, caption = "Top 12 Correlated Variable Pairs")

#https://stackoverflow.com/questions/28035001/transform-correlation-matrix-into-dataframe-with-records-for-each-row-column-pai

#plot
corrplot(cormatrix, method = "square", type = "upper")

train_raw$TEAM_PITCHING_H <- bin_data(train_raw$TEAM_PITCHING_H, bins = 5, binType = "quantile") 

levels(train_raw$TEAM_PITCHING_H) <- c("One","Two","Three","Four","Five")

impute_missing <- missForest(train_raw, variablewise = T)
 
# check imputation error
impute_df <- cbind(meta_df, 
                   range = metrics_df$range,
                   MSE = as.numeric(impute_missing$OOBerror),
                   variable = names(impute_missing$ximp)) %>% 
  select(variable, NA_ct, NA_pct, MSE) %>% 
  mutate(RMSE = sqrt(as.numeric(impute_missing$OOBerror)),
         NRMSE = sqrt(as.numeric(impute_missing$OOBerror))/metrics_df$range) %>% 
  filter(MSE > 0) %>% 
  arrange(-NRMSE)

kable(impute_df, digits = 2) 
train_raw_less_one <- select(train_raw, -TEAM_BATTING_HBP)
impute_missing <- missForest(train_raw_less_one, variablewise = T)
 
# check imputation error
impute_df <- cbind(metadata(train_raw_less_one), 
                   range = rapply(impute_missing$ximp, max) - rapply(impute_missing$ximp, min),
                   MSE = as.numeric(impute_missing$OOBerror),
                   variable = names(impute_missing$ximp)) %>% 
  select(variable, NA_ct, NA_pct, MSE) %>% 
  mutate(RMSE = sqrt(as.numeric(impute_missing$OOBerror)),
         NRMSE = sqrt(as.numeric(impute_missing$OOBerror))/(rapply(impute_missing$ximp, max) - rapply(impute_missing$ximp, min))) %>% 
  filter(MSE > 0) %>% 
  arrange(-NRMSE)

kable(impute_df, digits = 2)

train_imputed <- impute_missing$ximp 
class(train_imputed$TEAM_PITCHING_H) <- "factor"

mod1 <- lm(TARGET_WINS ~ ., data = train_imputed)

summary(mod1) 
summary(aov(TARGET_WINS~TEAM_PITCHING_H, data = train_imputed))
filter(fin_list, Var2 == "TARGET_WINS")[1:4,]

mod2 <- lm(TARGET_WINS ~ TEAM_PITCHING_H + TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_PITCHING_BB, data = train_imputed)

summary(mod2)

mod3 <- update(mod1, .~. -TEAM_FIELDING_DP -TEAM_BATTING_2B -TEAM_BASERUN_CS -TEAM_PITCHING_HR, data = train_imputed)
#summary(mod3) #R-sq = 0.334 

#remove TEAM_PITCHING_BB at 0.598258 
mod3 <- update(mod3, .~. -TEAM_PITCHING_BB, data = train_imputed)
#summary(mod3) #new R-sq = 0.3342

#remove TEAM_BATTING_BB at 0.169551
mod3 <- update(mod3, .~. -TEAM_BATTING_BB, data = train_imputed)
#summary(mod3) #new R-sq = 0.3341

#remove TEAM_PITCHING_H  
mod3 <- update(mod3, .~. -TEAM_PITCHING_H, data = train_imputed)
#summary(mod3) #new R-sq = 0.3284

#remove TEAM_PITCHING_SO, it appears collinear  
mod3 <- update(mod3, .~. -TEAM_PITCHING_SO, data = train_imputed)
#summary(mod3) #new R-sq = 0.3273

#remove TEAM_BATTING_3B,  
mod3 <- update(mod3, .~. -TEAM_BATTING_3B, data = train_imputed)
summary(mod3) #new R-sq = 0.3273


PRESS <- function(linear.model) {
  #source:  https://gist.github.com/tomhopper/8c204d978c4a0cbcb8c0#file-press-r
  #' calculate the predictive residuals
  pr <- residuals(linear.model)/(1-lm.influence(linear.model)$hat)
  #' calculate the PRESS
  PRESS <- sum(pr^2)
  
  return(PRESS)
}
pred_r_squared <- function(linear.model) {
  #source: https://gist.github.com/tomhopper/8c204d978c4a0cbcb8c0#file-pred_r_squared-r
  #' Use anova() to get the sum of squares for the linear model
  lm.anova <- anova(linear.model)
  #' Calculate the total sum of squares
  tss <- sum(lm.anova$'Sum Sq')
  # Calculate the predictive R^2
  pred.r.squared <- 1-PRESS(linear.model)/(tss)
  
  return(pred.r.squared)
}


model_summary <- function(model, y_var) {
    ### Summarizes the model's key statistics in one row
    df_summary <- glance(summary(model))
    model_name <- deparse(substitute(model))
    n_predictors <- ncol(model$model) - 1
    pred.r.squared <- pred_r_squared(model)
    df_summary <- cbind(model_name, n_predictors, pred.r.squared, df_summary)
    return(df_summary)
}
mod_sum_df1 <- model_summary(mod1, "TARGET_WINS")
mod_sum_df2 <- model_summary(mod2, "TARGET_WINS")
mod_sum_df3 <- model_summary(mod3, "TARGET_WINS")

kable(all_results <- rbind(mod_sum_df1, mod_sum_df2, mod_sum_df3), digits = 4)
par(mfrow=c(2,2))
plot(mod3)
kable(metadata(test_raw), digits = 1)

test_raw$TEAM_PITCHING_H <- bin_data(test_raw$TEAM_PITCHING_H, bins = 5, binType = "quantile") 

levels(test_raw$TEAM_PITCHING_H) <- c("One","Two","Three","Four","Five")

test_raw_less_one <- select(test_raw, -TEAM_BATTING_HBP)

impute_missing_test <- missForest(test_raw_less_one, variablewise = T)
 
# check imputation error
impute_df <- cbind(metadata(test_raw_less_one), 
                   range = rapply(impute_missing_test$ximp, max) - rapply(impute_missing_test$ximp, min),
                   MSE = as.numeric(impute_missing_test$OOBerror),
                   variable = names(impute_missing_test$ximp)
                   ) %>% 
  select(variable, NA_ct, NA_pct, MSE) %>% 
  mutate(RMSE = sqrt(as.numeric(impute_missing_test$OOBerror)),
         NRMSE = sqrt(as.numeric(impute_missing_test$OOBerror))/(rapply(impute_missing_test$ximp, max) - rapply(impute_missing_test$ximp, min))) %>% 
  filter(MSE > 0) %>% 
  arrange(-NRMSE)

kable(impute_df, digits = 2)

test_imputed <- impute_missing$ximp 
class(test_imputed$TEAM_PITCHING_H) <- "factor"
test_results <- predict(mod3, newdata = test_imputed)

DATA 621 Business Analytics & Data Mining

Homework #1 - Moneyball

Kyle Gilde

2/18/2018