We are given records that pertain to professional baseball teams from 1871 to 2006. Each record (row) describes a team's performance, aggregated annually. The goal is to build linear regression models on the training data that account for the variability between variables and to predict against the evaluation data set.

The data is hosted at its own GitHub URL. This improves reproducibility by removing the dependency on files stored on a local machine.

Read in the training data:

moneyball_training <- read.csv(url("https://raw.githubusercontent.com/vindication09/DATA-621-Week-1-/master/moneyball-training-data.csv"), header =TRUE)

head(moneyball_training, 10)
##    INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1      1          39           1445             194              39
## 2      2          70           1339             219              22
## 3      3          86           1377             232              35
## 4      4          70           1387             209              38
## 5      5          82           1297             186              27
## 6      6          75           1279             200              36
## 7      7          80           1244             179              54
## 8      8          85           1273             171              37
## 9     11          86           1391             197              40
## 10    12          76           1271             213              18
##    TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1               13             143             842              NA
## 2              190             685            1075              37
## 3              137             602             917              46
## 4               96             451             922              43
## 5              102             472             920              49
## 6               92             443             973             107
## 7              122             525            1062              80
## 8              115             456            1027              40
## 9              114             447             922              69
## 10              96             441             827              72
##    TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1               NA               NA            9364               84
## 2               28               NA            1347              191
## 3               27               NA            1377              137
## 4               30               NA            1396               97
## 5               39               NA            1297              102
## 6               59               NA            1279               92
## 7               54               NA            1244              122
## 8               36               NA            1281              116
## 9               27               NA            1391              114
## 10              34               NA            1271               96
##    TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1               927             5456            1011               NA
## 2               689             1082             193              155
## 3               602              917             175              153
## 4               454              928             164              156
## 5               472              920             138              168
## 6               443              973             123              149
## 7               525             1062             136              186
## 8               459             1033             112              136
## 9               447              922             127              169
## 10              441              827             131              159

The evaluation data set will be used to test the predictive power of the models; any data preparation, such as column removals or missing-value treatment, will also be applied to the evaluation data set. Read in the evaluation (test) data:

moneyball_evaluation <- read.csv(url("https://raw.githubusercontent.com/vindication09/DATA-621-Week-1-/master/moneyball-evaluation-data.csv"), header =TRUE)

head(moneyball_evaluation, 10)
##    INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 1      9           1209             170              33              83
## 2     10           1221             151              29              88
## 3     14           1395             183              29              93
## 4     47           1539             309              29             159
## 5     60           1445             203              68               5
## 6     63           1431             236              53              10
## 7     74           1430             219              55              37
## 8     83           1385             158              42              33
## 9     98           1259             177              78              23
## 10   120           1397             212              42              58
##    TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## 1              447            1080              62              50
## 2              516             929              54              39
## 3              509             816              59              47
## 4              486             914             148              57
## 5               95             416              NA              NA
## 6              215             377              NA              NA
## 7              568             527             365              NA
## 8              356             609             185              NA
## 9              466             689             150              NA
## 10             452             584              52              NA
##    TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## 1                NA            1209               83              447
## 2                NA            1221               88              516
## 3                NA            1395               93              509
## 4                42            1539              159              486
## 5                NA            3902               14              257
## 6                NA            2793               20              420
## 7                NA            1544               40              613
## 8                NA            1626               39              418
## 9                NA            1342               25              497
## 10               NA            1489               62              482
##    TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              1080             140              156
## 2               929             135              164
## 3               816             156              153
## 4               914             124              154
## 5              1123             616              130
## 6               736             572              105
## 7               569             490               NA
## 8               715             328              104
## 9               734             226              132
## 10              622             184              145
  1. Data Exploration
## [1] 2276
## [1] 17

The moneyball training data has 2276 rows and 17 columns (variables). The response variable is TARGET_WINS, which is simply the number of wins for a team. The other variables of interest describe baseball statistics in the form of doubles, triples, errors, etc.

The variable INDEX will be left out of the analysis. After removing INDEX, it is possible to examine the distribution of the other variables using faceted plots.
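The removal and the faceted histograms are done as follows (the same code appears in the appendix); gather() stacks every column into key/value pairs so that a single facet_wrap() call draws one histogram per variable:

library(ggplot2)
library(tidyr)

# drop the INDEX identifier before plotting
moneyball_training1 <- subset(moneyball_training, select = -c(INDEX))

ggplot(gather(moneyball_training1), aes(value)) + 
    geom_histogram(bins = 10) + 
    facet_wrap(~key, scales = 'free_x')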

The distributions of the remaining 16 variables are as follows:

## Warning: Removed 3478 rows containing non-finite values (stat_bin).

From the collection of plots, TARGET_WINS appears to have a close to normal distribution. A few variables, such as TEAM_BATTING_3B and TEAM_BASERUN_SB, have a right skew, while TEAM_BATTING_BB has the most prominent left skew. The collection of histograms can also help identify whether outliers should be explored for certain variables: the pitching columns and TEAM_FIELDING_E in particular seem to exhibit extreme values. Those variables will be closely examined during the outlier investigation of the EDA.

The spread of the response variable should be closely examined. The response variable can be overlaid with a normal curve on its histogram in order to visually check for a large deviation from normality.
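The overlay is drawn by scaling a normal density up to the count scale of the histogram (this is the same code as in the appendix):

x <- moneyball_training1$TARGET_WINS
h <- hist(x, breaks = 10, col = "red", xlab = "Target Wins",
    main = "Histogram with Normal Curve", ylim = range(0:1200))
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
# rescale the density so its area matches the histogram counts
yfit <- yfit * diff(h$mids[1:2]) * length(x)
lines(xfit, yfit, col = "blue", lwd = 2)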

The histograms provide a good top-level look at the spread of each variable. The response variable is close to normal, centered near 81 wins, with most teams winning between 70 and 100 games. The plots are further complemented by descriptive statistics, which also highlight where there might be some irregularities within the data.

##   TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B 
##  Min.   :  0.00   Min.   : 891   Min.   : 69.0   Min.   :  0.00  
##  1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0   1st Qu.: 34.00  
##  Median : 82.00   Median :1454   Median :238.0   Median : 47.00  
##  Mean   : 80.79   Mean   :1469   Mean   :241.2   Mean   : 55.25  
##  3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0   3rd Qu.: 72.00  
##  Max.   :146.00   Max.   :2554   Max.   :458.0   Max.   :223.00  
##                                                                  
##  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO  TEAM_BASERUN_SB
##  Min.   :  0.00   Min.   :  0.0   Min.   :   0.0   Min.   :  0.0  
##  1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0   1st Qu.: 66.0  
##  Median :102.00   Median :512.0   Median : 750.0   Median :101.0  
##  Mean   : 99.61   Mean   :501.6   Mean   : 735.6   Mean   :124.8  
##  3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0   3rd Qu.:156.0  
##  Max.   :264.00   Max.   :878.0   Max.   :1399.0   Max.   :697.0  
##                                   NA's   :102      NA's   :131    
##  TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
##  Min.   :  0.0   Min.   :29.00    Min.   : 1137   Min.   :  0.0   
##  1st Qu.: 38.0   1st Qu.:50.50    1st Qu.: 1419   1st Qu.: 50.0   
##  Median : 49.0   Median :58.00    Median : 1518   Median :107.0   
##  Mean   : 52.8   Mean   :59.36    Mean   : 1779   Mean   :105.7   
##  3rd Qu.: 62.0   3rd Qu.:67.00    3rd Qu.: 1682   3rd Qu.:150.0   
##  Max.   :201.0   Max.   :95.00    Max.   :30132   Max.   :343.0   
##  NA's   :772     NA's   :2085                                     
##  TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E  TEAM_FIELDING_DP
##  Min.   :   0.0   Min.   :    0.0   Min.   :  65.0   Min.   : 52.0   
##  1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0   1st Qu.:131.0   
##  Median : 536.5   Median :  813.5   Median : 159.0   Median :149.0   
##  Mean   : 553.0   Mean   :  817.7   Mean   : 246.5   Mean   :146.4   
##  3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2   3rd Qu.:164.0   
##  Max.   :3645.0   Max.   :19278.0   Max.   :1898.0   Max.   :228.0   
##                   NA's   :102                        NA's   :286

From the summary statistics, several variables stand out due to extreme values, such as TEAM_PITCHING_H (max of 30,132) and TEAM_PITCHING_SO (max of 19,278). A deep dive into outlier analysis will be more informative regarding whether these extreme values are correct or whether they are generated by data collection errors.

While the descriptives give us a look at the mean, median, and range for each variable, a correlation analysis can tell the story of how each variable relates to the others and to the response variable.

How does each of the 16 variables correlate with the response variable?
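One way to compute these correlations, as done in the appendix; note that cor() returns NA for any column containing missing values unless a use argument is supplied, which explains the NA entries below:

apply(moneyball_training1, 2,
      function(col) cor(col, moneyball_training1$TARGET_WINS))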

##      TARGET_WINS   TEAM_BATTING_H  TEAM_BATTING_2B  TEAM_BATTING_3B 
##        1.0000000        0.3887675        0.2891036        0.1426084 
##  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB 
##        0.1761532        0.2325599               NA               NA 
##  TEAM_BASERUN_CS TEAM_BATTING_HBP  TEAM_PITCHING_H TEAM_PITCHING_HR 
##               NA               NA       -0.1099371        0.1890137 
## TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP 
##        0.1241745               NA       -0.1764848               NA

The strongest negative correlation with the response belongs to TEAM_FIELDING_E, followed by TEAM_PITCHING_H. One can infer that if a team commits more errors, or its pitchers allow more hits, then surely the opposing team has more opportunities to score points and win. The variable that has the strongest positive correlation with TARGET_WINS is TEAM_BATTING_H. A similar argument applies: if a team is making more base hits, then theoretically the team is generating more opportunities to score points and win.

Correlation can also be examined under a more inclusive scope. Let's consider all variables and test how each correlates with the others. A heat map of the correlations is a better visualization than a raw correlation matrix, and with some additional R techniques the redundancy found within a standard correlation matrix (each pair appearing twice) can be eliminated by keeping only the upper triangle.
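A condensed sketch of the heat map construction (the fully annotated version is in the appendix): blank out the lower triangle, melt the matrix to long form, and draw tiles.

library(reshape2)
library(ggplot2)

correlation_matrix <- round(cor(moneyball_training1), 2)
# keep only the upper triangle so each pair appears once
correlation_matrix[lower.tri(correlation_matrix)] <- NA
melted_correlation_matrix <- melt(correlation_matrix, na.rm = TRUE)

ggplot(melted_correlation_matrix, aes(Var2, Var1, fill = value)) +
    geom_tile(color = "white") +
    scale_fill_gradient2(low = "blue", high = "red", mid = "white",
        midpoint = 0, limits = c(-1, 1), name = "Pearson\nCorrelation") +
    coord_fixed()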

From the correlation heatmap, right off the bat, certain variables show no correlation with the response variable or with any other variable; examples are TEAM_BATTING_SO and TEAM_BASERUN_CS.

The next element of the data that should be checked is missing values. R provides an easy idiom that sums up the number of missing values in each column.
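The per-column counts come from combining is.na() with colSums() (as in the appendix):

colSums(is.na(moneyball_training1))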

##      TARGET_WINS   TEAM_BATTING_H  TEAM_BATTING_2B  TEAM_BATTING_3B 
##                0                0                0                0 
##  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB 
##                0                0              102              131 
##  TEAM_BASERUN_CS TEAM_BATTING_HBP  TEAM_PITCHING_H TEAM_PITCHING_HR 
##              772             2085                0                0 
## TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP 
##                0              102                0              286

The information on missing values can help decide whether an additional column should be removed from the data frame altogether. Recall there are 2,276 rows. The variable TEAM_BATTING_HBP has 2,085 missing values; in percentage terms, over 90% of the data is missing for this specific variable. It makes sense to remove it from the data frame. A few more variables have over 100 missing data points; the data preparation section will explore the proper way to treat these missing entries.

TEAM_BASERUN_CS has over 30% of its data missing, so this variable should be dropped as well. We can implement a 5% rule: keep only variables with under 5% of their data missing, since it is difficult to say how variables with a higher percentage of missingness would behave if we were to impute them. This rule also drops TEAM_FIELDING_DP (about 13% missing) and TEAM_BASERUN_SB (a little under 6% missing) from the data frame.
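The four high-missingness columns are dropped from both data frames (as in the appendix):

moneyball_training2 <- subset(moneyball_training1,
    select = -c(TEAM_BATTING_HBP, TEAM_BASERUN_CS, TEAM_FIELDING_DP, TEAM_BASERUN_SB))
moneyball_evaluation2 <- subset(moneyball_evaluation1,
    select = -c(TEAM_BATTING_HBP, TEAM_BASERUN_CS, TEAM_FIELDING_DP, TEAM_BASERUN_SB))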

A major element of EDA is identifying whether any outliers exist. Once outliers are identified, an argument should be made in favor of keeping or removing them. From the EDA in the earlier sections, the variables related to pitching and one of the fielding variables demonstrated elements within their histograms and descriptive statistics that make them prime candidates for outlier analysis.

Detect and compare the spread of each flagged variable with and without outliers. The variables of interest are TEAM_FIELDING_E, TEAM_PITCHING_BB, TEAM_PITCHING_H, TEAM_PITCHING_HR, and TEAM_PITCHING_SO; the calls are sketched below.
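Each variable is passed through the outlierKD helper (defined in full in the appendix), which flags values outside the boxplot whiskers, plots the variable with and without them, and reports the affected means:

outlierKD(moneyball_training2, TEAM_FIELDING_E)
outlierKD(moneyball_training2, TEAM_PITCHING_BB)
outlierKD(moneyball_training2, TEAM_PITCHING_H)
outlierKD(moneyball_training2, TEAM_PITCHING_HR)
outlierKD(moneyball_training2, TEAM_PITCHING_SO)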

Team Fielding E

## Outliers identified: 303 
## Proportion (%) of outliers: 15.4 
## Mean of the outliers: 737.13 
## Mean without removing outliers: 246.48 
## Mean if we remove outliers: 171.13 
## Do you want to remove outliers and to replace with NA? [yes/no]: 
## Nothing changed

For team fielding E, there are 303 outliers. Removing the outliers does not have a major effect on the proportion of data above the 75th percentile, and the spread of this variable still has a right skew.

Team Pitching BB

## Outliers identified: 90 
## Proportion (%) of outliers: 4.1 
## Mean of the outliers: 806.99 
## Mean without removing outliers: 553.01 
## Mean if we remove outliers: 542.55 
## Do you want to remove outliers and to replace with NA? [yes/no]: 
## Nothing changed

For team pitching BB, there are 90 outliers. There is a considerable change when outliers are removed, the biggest being the spread: once outliers are removed, the distribution is nearly normal. The box plot also shows changes in the amount of data below the 25th percentile and above the 75th percentile.

Team Pitching H

## Outliers identified: 213 
## Proportion (%) of outliers: 10.3 
## Mean of the outliers: 4174.57 
## Mean without removing outliers: 1779.21 
## Mean if we remove outliers: 1531.9 
## Do you want to remove outliers and to replace with NA? [yes/no]: 
## Nothing changed

Team Pitching H also shows a big difference with and without outliers; there are 213 of them. Once outliers are removed, the spread is nearly normal with minimal skew, and the extreme range of this variable shrinks, as reflected in the box plot.

Team Pitching HR

## Outliers identified: 4 
## Proportion (%) of outliers: 0.2 
## Mean of the outliers: 321 
## Mean without removing outliers: 105.7 
## Mean if we remove outliers: 105.32 
## Do you want to remove outliers and to replace with NA? [yes/no]: 
## Nothing changed

Team Pitching HR is virtually unaffected when outliers are removed; the only difference is the reduction of data points above the 75th percentile. There were only 4 outliers.

Team Pitching SO

## Outliers identified: 38 
## Proportion (%) of outliers: 1.8 
## Mean of the outliers: 1818.24 
## Mean without removing outliers: 817.73 
## Mean if we remove outliers: 799.93 
## Do you want to remove outliers and to replace with NA? [yes/no]: 
## Nothing changed

For team pitching SO, several changes occur when outliers are removed. The most noticeable is the spread of the data: at a glance the data appears bimodal, and the box plot shows a reduction in data points above the 75th percentile. There are 38 outliers in total.

Testing for outliers shows several big changes in the behavior of the analyzed variables; however, outliers should be removed only after an initial model is built. Once a model is built on the unchanged data, certain diagnostics can determine whether outliers should be removed altogether or whether predictor variables should be dropped. This is still only EDA.

We should also make an alternative data set where the outliers are removed entirely during the data prep phase.

  2. Data Preparation

From the previous section, we discovered that TEAM_BATTING_HBP had too many missing entries. It would be a fruitless effort to try to find an accurate replacement for 90% missing data, so that variable is dropped altogether. The outlier analysis demonstrated several instances of extreme values.

As for the outliers, I will wait until we build some models first. Tools such as Cook's distance give us insight into the effect of outliers on our model, and several other packages allow us to test the effect of outliers on the model and to remove them if needed.

Given the current variables present, there is no variable for single hits. Singles can be derived from the variables already in the data set: subtract doubles, triples, and home runs from total hits, and append the result to both the training and evaluation data frames.
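The derivation, applied to both data frames (as in the appendix):

moneyball_training2$TEAM_BATTING_1B <- moneyball_training2$TEAM_BATTING_H -
    moneyball_training2$TEAM_BATTING_2B - moneyball_training2$TEAM_BATTING_3B -
    moneyball_training2$TEAM_BATTING_HR
moneyball_evaluation2$TEAM_BATTING_1B <- moneyball_evaluation2$TEAM_BATTING_H -
    moneyball_evaluation2$TEAM_BATTING_2B - moneyball_evaluation2$TEAM_BATTING_3B -
    moneyball_evaluation2$TEAM_BATTING_HR

head(moneyball_training2, 2)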

##   TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1          39           1445             194              39
## 2          70           1339             219              22
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_PITCHING_H
## 1              13             143             842            9364
## 2             190             685            1075            1347
##   TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## 1               84              927             5456            1011
## 2              191              689             1082             193
##   TEAM_BATTING_1B
## 1            1199
## 2             908

What does the distribution of our new variable look like?

outlierKD(moneyball_training2, TEAM_BATTING_1B)

## Outliers identified: 77 
## Proportion (%) of outliers: 3.5 
## Mean of the outliers: 1495.79 
## Mean without removing outliers: 1073.16 
## Mean if we remove outliers: 1058.36 
## Do you want to remove outliers and to replace with NA? [yes/no]: 
## Nothing changed

The new variable does improve when outliers are removed; the data becomes close to normal. The new variable yields 77 outliers.

Let's refine the missing value approach.

There is now the problem of missing values for the variables under the 5% missing-value threshold: TEAM_BATTING_SO and TEAM_PITCHING_SO. Common data science literature states that imputation should only be attempted below roughly a 5% missingness threshold; variables with more than 5% of their data missing should be removed from the data frame altogether. (https://www.r-bloggers.com/imputing-missing-data-with-r-mice-package/)

Recall where the missing data is located:

##      TARGET_WINS   TEAM_BATTING_H  TEAM_BATTING_2B  TEAM_BATTING_3B 
##                0                0                0                0 
##  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_PITCHING_H 
##                0                0              102                0 
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##                0                0              102                0 
##  TEAM_BATTING_1B 
##                0

We will impute the missing values with the median. For comparison, we will also make additional data sets imputed with the mean and with the mode, a subset with zero values and missing values removed, and an outlier-free copy of the data. These variants are sketched below; the full code is in the appendix.
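A sketch of the preparation, using Hmisc::impute for the median-imputed copy (the mean- and mode-imputed copies are built the same way); note that impute() results must be assigned back for the replacement to persist:

library(Hmisc)

# median-imputed copy of the training data
moneyball_training3 <- moneyball_training2
moneyball_training3$TEAM_BATTING_SO  <- impute(moneyball_training3$TEAM_BATTING_SO, median)
moneyball_training3$TEAM_PITCHING_SO <- impute(moneyball_training3$TEAM_PITCHING_SO, median)

# subset with missing values removed, then rows containing any zero removed
moneyball_training6 <- na.omit(moneyball_training2)
moneyball_training6 <- moneyball_training6[apply(moneyball_training6, 1,
    function(row) all(row != 0)), ]

# the outlier-free copies come from rerunning outlierKD on fresh copies of the
# data and answering "yes" at its prompt (see the appendix)
moneyball_training_outlierfree <- moneyball_training2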

  3. Build Models

This portion of the analysis focuses on building models on the prepared training data. The first model will be a standard lm with all the predictor variables, including the derived statistic. Each model summary will be followed up with plots that check the assumptions of multiple linear regression: normality of residuals, constant variance, and independence of predictors.
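A sketch of the first fit on the median-imputed data; the object name mod1 is assumed here, while the formula and data come from the Call in the output below:

mod1 <- lm(TARGET_WINS ~ ., data = moneyball_training3)
summary(mod1)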

## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = moneyball_training3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.013  -8.579   0.237   8.876  54.770 
## 
## Coefficients: (1 not defined because of singularities)
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.4550178  5.1031807   0.481  0.63051    
## TEAM_BATTING_H    0.0513996  0.0037542  13.691  < 2e-16 ***
## TEAM_BATTING_2B  -0.0298252  0.0094031  -3.172  0.00154 ** 
## TEAM_BATTING_3B   0.0934385  0.0167486   5.579 2.72e-08 ***
## TEAM_BATTING_HR   0.0537165  0.0277346   1.937  0.05290 .  
## TEAM_BATTING_BB   0.0053172  0.0058828   0.904  0.36617    
## TEAM_BATTING_SO  -0.0030174  0.0024977  -1.208  0.22714    
## TEAM_PITCHING_H  -0.0014991  0.0003670  -4.084 4.58e-05 ***
## TEAM_PITCHING_HR -0.0030060  0.0245488  -0.122  0.90255    
## TEAM_PITCHING_BB  0.0058479  0.0041510   1.409  0.15904    
## TEAM_PITCHING_SO  0.0021830  0.0009268   2.355  0.01859 *  
## TEAM_FIELDING_E  -0.0137111  0.0023636  -5.801 7.56e-09 ***
## TEAM_BATTING_1B          NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.19 on 2162 degrees of freedom
##   (102 observations deleted due to missingness)
## Multiple R-squared:  0.2863, Adjusted R-squared:  0.2827 
## F-statistic: 78.85 on 11 and 2162 DF,  p-value: < 2.2e-16

## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                  Data                   
##  ---------------------------------------
##  Response : TARGET_WINS 
##  Variables: fitted values of TARGET_WINS 
## 
##          Test Summary           
##  -------------------------------
##  DF            =    1 
##  Chi2          =    14.84983 
##  Prob > Chi2   =    0.0001164184

The residuals are nearly normal. For the constant-variance check (heteroscedasticity), we can use the Breusch-Pagan test, with Ho: the variance is constant, and Ha: the variance is not constant.

Using the Breusch-Pagan test for heteroskedasticity, we can statistically determine whether we have non-constant variance. The low p-value indicates strong evidence against the null hypothesis, i.e., strong evidence that the variance is not constant.

Let's remove the predictors with high p-values, along with TEAM_BATTING_1B, the predictor that could not be estimated: since it was derived by subtraction, it is an exact linear combination of the other batting variables (hence the singularity).
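The reduced fit, named mod2 per the discussion below (formula taken from the Call in the output):

mod2 <- lm(TARGET_WINS ~ . - TEAM_BATTING_HR - TEAM_BATTING_BB -
    TEAM_BATTING_SO - TEAM_PITCHING_HR - TEAM_PITCHING_BB - TEAM_BATTING_1B,
    data = moneyball_training3)
summary(mod2)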

## 
## Call:
## lm(formula = TARGET_WINS ~ . - TEAM_BATTING_HR - TEAM_BATTING_BB - 
##     TEAM_BATTING_SO - TEAM_PITCHING_HR - TEAM_PITCHING_BB - TEAM_BATTING_1B, 
##     data = moneyball_training3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -53.018  -8.718   0.254   8.813  50.100 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.3765356  3.5755764   0.665   0.5063    
## TEAM_BATTING_H    0.0582964  0.0033617  17.341  < 2e-16 ***
## TEAM_BATTING_2B  -0.0235095  0.0092237  -2.549   0.0109 *  
## TEAM_BATTING_3B   0.0547593  0.0140045   3.910 9.51e-05 ***
## TEAM_PITCHING_H  -0.0012274  0.0003138  -3.912 9.44e-05 ***
## TEAM_PITCHING_SO  0.0031086  0.0006203   5.012 5.84e-07 ***
## TEAM_FIELDING_E  -0.0211828  0.0020850 -10.159  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.36 on 2167 degrees of freedom
##   (102 observations deleted due to missingness)
## Multiple R-squared:  0.2659, Adjusted R-squared:  0.2639 
## F-statistic: 130.9 on 6 and 2167 DF,  p-value: < 2.2e-16

## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                  Data                   
##  ---------------------------------------
##  Response : TARGET_WINS 
##  Variables: fitted values of TARGET_WINS 
## 
##          Test Summary          
##  ------------------------------
##  DF            =    1 
##  Chi2          =    9.587605 
##  Prob > Chi2   =    0.001958953

This model has some elements that would be considered counterintuitive, the most glaring being that TEAM_BATTING_2B has a negative coefficient. The other negative coefficients make sense, since hits allowed and errors would have a negative effect on target wins. In this model, the intercept has a high p-value. All predictors now have low p-values, but the adjusted R-squared remains virtually unchanged.

Given that we have two models built, we can use ANOVA to test whether the difference in residual sum of squares is significant.
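The comparison, assuming the model objects are named mod1 and mod2 as above:

anova(mod1, mod2)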

## Analysis of Variance Table
## 
## Model 1: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H + 
##     TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_BATTING_1B
## Model 2: TARGET_WINS ~ (TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H + 
##     TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_BATTING_1B) - TEAM_BATTING_HR - TEAM_BATTING_BB - 
##     TEAM_BATTING_SO - TEAM_PITCHING_HR - TEAM_PITCHING_BB - TEAM_BATTING_1B
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   2162 376123                                  
## 2   2167 386856 -5    -10733 12.338 8.031e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

A low p-value indicates that there is indeed a significant difference between the two models. We can base any further transformations on mod2, which is the simpler of the two models.

It is also worth checking the model against the data with the mean imputed.
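The same reduced formula refit on the mean-imputed data; the object name mod3 is assumed:

mod3 <- lm(TARGET_WINS ~ . - TEAM_BATTING_HR - TEAM_BATTING_BB -
    TEAM_BATTING_SO - TEAM_PITCHING_HR - TEAM_PITCHING_BB - TEAM_BATTING_1B,
    data = moneyball_training4)
summary(mod3)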

## 
## Call:
## lm(formula = TARGET_WINS ~ . - TEAM_BATTING_HR - TEAM_BATTING_BB - 
##     TEAM_BATTING_SO - TEAM_PITCHING_HR - TEAM_PITCHING_BB - TEAM_BATTING_1B, 
##     data = moneyball_training4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -53.018  -8.718   0.254   8.813  50.100 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.3765356  3.5755764   0.665   0.5063    
## TEAM_BATTING_H    0.0582964  0.0033617  17.341  < 2e-16 ***
## TEAM_BATTING_2B  -0.0235095  0.0092237  -2.549   0.0109 *  
## TEAM_BATTING_3B   0.0547593  0.0140045   3.910 9.51e-05 ***
## TEAM_PITCHING_H  -0.0012274  0.0003138  -3.912 9.44e-05 ***
## TEAM_PITCHING_SO  0.0031086  0.0006203   5.012 5.84e-07 ***
## TEAM_FIELDING_E  -0.0211828  0.0020850 -10.159  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.36 on 2167 degrees of freedom
##   (102 observations deleted due to missingness)
## Multiple R-squared:  0.2659, Adjusted R-squared:  0.2639 
## F-statistic: 130.9 on 6 and 2167 DF,  p-value: < 2.2e-16

The current models do not show any major changes between data variations, but they still have several pitfalls that make them difficult to select. Additional models, not included in this report, were built on the several variations of the data; there was no major change.

The final model I propose uses the derived single base hits in place of doubles, triples, and home runs, plus the other predictors:

mod4<-lm(TARGET_WINS~TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H + 
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_BATTING_1B, data=moneyball_training4)
summary(mod4)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_BATTING_1B, data = moneyball_training4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.736  -8.723   0.431   8.955  53.654 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.4471739  5.2131964   1.045 0.296194    
## TEAM_BATTING_BB   0.0149380  0.0053348   2.800 0.005154 ** 
## TEAM_BATTING_SO  -0.0065459  0.0025152  -2.603 0.009316 ** 
## TEAM_PITCHING_H  -0.0020678  0.0003571  -5.790 8.05e-09 ***
## TEAM_PITCHING_HR  0.0808783  0.0073886  10.946  < 2e-16 ***
## TEAM_PITCHING_BB  0.0024371  0.0036459   0.668 0.503918    
## TEAM_PITCHING_SO  0.0030451  0.0009123   3.338 0.000859 ***
## TEAM_FIELDING_E  -0.0106073  0.0023035  -4.605 4.37e-06 ***
## TEAM_BATTING_1B   0.0616688  0.0035326  17.457  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.5 on 2165 degrees of freedom
##   (102 observations deleted due to missingness)
## Multiple R-squared:  0.2511, Adjusted R-squared:  0.2483 
## F-statistic: 90.73 on 8 and 2165 DF,  p-value: < 2.2e-16

The coefficients for this model make much more sense: the negative coefficients belong to variables that hurt a team's chances of winning. We are now on the right track. We can take this model a step further by removing the variables with large p-values.
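Dropping TEAM_PITCHING_BB, the predictor with the largest p-value above, gives mod5, the model checked with vif() below:

mod5 <- lm(TARGET_WINS ~ TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H +
    TEAM_PITCHING_HR + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_BATTING_1B,
    data = moneyball_training3)
summary(mod5)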

## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_BATTING_1B, data = moneyball_training3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.751  -8.749   0.394   8.916  53.559 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.0275275  5.1745970   0.972  0.33137    
## TEAM_BATTING_BB   0.0177761  0.0032295   5.504 4.15e-08 ***
## TEAM_BATTING_SO  -0.0070467  0.0024007  -2.935  0.00337 ** 
## TEAM_PITCHING_H  -0.0019698  0.0003256  -6.050 1.70e-09 ***
## TEAM_PITCHING_HR  0.0812672  0.0073647  11.035  < 2e-16 ***
## TEAM_PITCHING_SO  0.0034464  0.0006868   5.018 5.65e-07 ***
## TEAM_FIELDING_E  -0.0103163  0.0022617  -4.561 5.37e-06 ***
## TEAM_BATTING_1B   0.0617577  0.0035296  17.497  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.5 on 2166 degrees of freedom
##   (102 observations deleted due to missingness)
## Multiple R-squared:  0.2509, Adjusted R-squared:  0.2485 
## F-statistic: 103.6 on 7 and 2166 DF,  p-value: < 2.2e-16

Are there any additional predictors that show signs of multicollinearity? Let's check the variance inflation factors.

library(car)
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.4.4
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
vif(mod5)
##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_PITCHING_H TEAM_PITCHING_HR 
##         1.891619         4.244223         2.611178         2.315811 
## TEAM_PITCHING_SO  TEAM_FIELDING_E  TEAM_BATTING_1B 
##         1.720504         3.296878         2.541484

The VIF numbers reveal no predictor with severe multicollinearity.

So far, we have run models on the data sets that were cleaned, left unchanged, and imputed with the mean, median, or mode. The best-performing model so far is mod5. The results were virtually the same when run on the different variations of the data, so the selected predictors seem to be the driving factor in this model's performance. We can run the same model on the data free from any outliers.

mod6<-lm(TARGET_WINS~TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H + 
    TEAM_PITCHING_HR + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_BATTING_1B, data=moneyball_training_outlierfree)
summary(mod6);
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_BATTING_1B, data = moneyball_training_outlierfree)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.751  -8.749   0.394   8.916  53.559 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.0275275  5.1745970   0.972  0.33137    
## TEAM_BATTING_BB   0.0177761  0.0032295   5.504 4.15e-08 ***
## TEAM_BATTING_SO  -0.0070467  0.0024007  -2.935  0.00337 ** 
## TEAM_PITCHING_H  -0.0019698  0.0003256  -6.050 1.70e-09 ***
## TEAM_PITCHING_HR  0.0812672  0.0073647  11.035  < 2e-16 ***
## TEAM_PITCHING_SO  0.0034464  0.0006868   5.018 5.65e-07 ***
## TEAM_FIELDING_E  -0.0103163  0.0022617  -4.561 5.37e-06 ***
## TEAM_BATTING_1B   0.0617577  0.0035296  17.497  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.5 on 2166 degrees of freedom
##   (102 observations deleted due to missingness)
## Multiple R-squared:  0.2509, Adjusted R-squared:  0.2485 
## F-statistic: 103.6 on 7 and 2166 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(mod6)

hist(resid(mod6), main="Histogram of Residuals");
ols_test_breusch_pagan(mod6)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                  Data                   
##  ---------------------------------------
##  Response : TARGET_WINS 
##  Variables: fitted values of TARGET_WINS 
## 
##          Test Summary           
##  -------------------------------
##  DF            =    1 
##  Chi2          =    36.15552 
##  Prob > Chi2   =    1.821818e-09

The model run on the outlier-free data is the closest to satisfying the conditions of regression. We still run into some of the same problems with coefficients that do not make sense, especially for single hits. What if we run the model on the same data set using all the significant variables except the derived one (dropped due to singularity)?

mod7<-lm(TARGET_WINS~ TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H + 
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E, data=moneyball_training_outlierfree)
summary(mod7);
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_3B + TEAM_BATTING_HR + 
##     TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
##     TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E, data = moneyball_training_outlierfree)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.091  -8.315   0.296   8.746  70.126 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      68.9562920  2.8441524  24.245  < 2e-16 ***
## TEAM_BATTING_3B   0.2047428  0.0159726  12.818  < 2e-16 ***
## TEAM_BATTING_HR   0.0821605  0.0291634   2.817  0.00489 ** 
## TEAM_BATTING_BB   0.0054342  0.0061980   0.877  0.38071    
## TEAM_BATTING_SO  -0.0185318  0.0023963  -7.734 1.59e-14 ***
## TEAM_PITCHING_H  -0.0005940  0.0003807  -1.560  0.11883    
## TEAM_PITCHING_HR  0.0425940  0.0256980   1.657  0.09757 .  
## TEAM_PITCHING_BB  0.0031735  0.0043718   0.726  0.46798    
## TEAM_PITCHING_SO  0.0012028  0.0009714   1.238  0.21578    
## TEAM_FIELDING_E  -0.0135413  0.0024475  -5.533 3.54e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.91 on 2164 degrees of freedom
##   (102 observations deleted due to missingness)
## Multiple R-squared:  0.2059, Adjusted R-squared:  0.2026 
## F-statistic: 62.36 on 9 and 2164 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(mod7)

hist(resid(mod7), main="Histogram of Residuals");
ols_test_breusch_pagan(mod7)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                  Data                   
##  ---------------------------------------
##  Response : TARGET_WINS 
##  Variables: fitted values of TARGET_WINS 
## 
##          Test Summary           
##  -------------------------------
##  DF            =    1 
##  Chi2          =    83.98595 
##  Prob > Chi2   =    4.983046e-20

##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO 
##         2.256790        33.666087         6.566523         3.985319 
##  TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO 
##         3.364704        26.574633         6.057531         3.243416 
##  TEAM_FIELDING_E 
##         3.638868

The VIF numbers suggest removing variables with VIF > 4:

mod8<-lm(TARGET_WINS~ TEAM_BATTING_3B +  TEAM_PITCHING_H  + TEAM_PITCHING_BB +  
    TEAM_FIELDING_E, data=moneyball_training_outlierfree)
summary(mod8);
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_3B + TEAM_PITCHING_H + 
##     TEAM_PITCHING_BB + TEAM_FIELDING_E, data = moneyball_training_outlierfree)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.817  -9.412   0.515   9.611  75.791 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      70.7518852  1.2507857  56.566  < 2e-16 ***
## TEAM_BATTING_3B   0.1801970  0.0133737  13.474  < 2e-16 ***
## TEAM_PITCHING_H   0.0003821  0.0003440   1.111    0.267    
## TEAM_PITCHING_BB  0.0100111  0.0021093   4.746  2.2e-06 ***
## TEAM_FIELDING_E  -0.0248819  0.0022894 -10.868  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.8 on 2271 degrees of freedom
## Multiple R-squared:  0.1185, Adjusted R-squared:  0.1169 
## F-statistic: 76.29 on 4 and 2271 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(mod8)

hist(resid(mod8), main="Histogram of Residuals");
ols_test_breusch_pagan(mod8)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                  Data                   
##  ---------------------------------------
##  Response : TARGET_WINS 
##  Variables: fitted values of TARGET_WINS 
## 
##          Test Summary           
##  -------------------------------
##  DF            =    1 
##  Chi2          =    168.1347 
##  Prob > Chi2   =    1.890505e-38

How well does this model predict?

test_results <- predict(mod6, newdata = moneyball_evaluation_outlierfree2)
head(test_results);
##        1        2        3        4        5        6 
## 69.00692 73.06412 81.65427 83.33833 66.94667 68.86185
test_results2 <- predict(mod6, newdata = moneyball_evaluation_outlierfree2, type = "response")
predict(mod6,newdata=moneyball_evaluation_outlierfree2, interval='prediction')
##           fit       lwr       upr
## 1    69.00692 42.488339  95.52550
## 2    73.06412 46.568522  99.55971
## 3    81.65427 55.165106 108.14343
## 4    83.33833 56.848176 109.82848
## 5    66.94667 40.387369  93.50597
## 6    68.86185 42.334045  95.38965
## 7    77.63296 51.116836 104.14908
## 8    77.25622 50.756628 103.75582
## 9    68.62672 42.118403  95.13503
## 10   78.30516 51.811945 104.79838
## 11   78.48641 51.995741 104.97707
## 12   84.46434 57.949848 110.97882
## 13   82.03073 55.507079 108.55437
## 14   85.97106 59.454661 112.48745
## 15   83.77852 57.280820 110.27623
## 16   83.55736 57.057849 110.05688
## 17   74.83753 48.345365 101.32970
## 18   80.91740 54.425943 107.40886
## 19         NA        NA        NA
## 20   91.48884 64.976930 118.00075
## 21   82.39162 55.881054 108.90219
## 22   90.06592 63.567576 116.56427
## 23   82.02082 55.521401 108.52024
## 24   80.06926 53.578155 106.56037
## 25   83.37320 56.873756 109.87265
## 26   86.44826 59.952083 112.94443
## 27   63.86919 37.230400  90.50799
## 28   76.47461 49.967991 102.98123
## 29   82.60924 56.094368 109.12410
## 30   77.99079 51.492901 104.48867
## 31   88.93423 62.419148 115.44931
## 32   87.92535 61.425920 114.42478
## 33   86.52055 60.012033 113.02907
## 34   90.44214 63.918902 116.96538
## 35   79.95406 53.468311 106.43980
## 36   84.25969 57.752599 110.76677
## 37   76.69674 50.209037 103.18444
## 38   87.88548 61.379913 114.39106
## 39   96.01158 69.494312 122.52884
## 40   89.70686 63.208021 116.20570
## 41   83.00029 56.506899 109.49369
## 42   87.19457 60.704283 113.68486
## 43   42.85586 15.845194  69.86653
## 44   90.55282 63.958573 117.14708
## 45   87.49780 60.986698 114.00891
## 46   87.01992 60.491742 113.54810
## 47   94.61006 68.082816 121.13731
## 48   69.07920 42.577279  95.58111
## 49   68.42697 41.919308  94.93463
## 50   75.84798 49.342541 102.35342
## 51   79.96387 53.457597 106.47014
## 52   89.64642 63.145522 116.14731
## 53   76.62455 50.131492 103.11762
## 54   74.30642 47.803763 100.80908
## 55   77.10713 50.617695 103.59656
## 56   79.07357 52.576768 105.57037
## 57   83.91808 57.412914 110.42325
## 58   67.09825 40.582052  93.61444
## 59         NA        NA        NA
## 60         NA        NA        NA
## 61   83.31572 56.810665 109.82077
## 62   82.01087 55.475295 108.54645
## 63   86.02479 59.536845 112.51274
## 64   83.71249 57.188726 110.23626
## 65   81.73216 55.220945 108.24338
## 66   85.86840 59.361111 112.37569
## 67   74.65277 48.157915 101.14763
## 68   82.51063 55.994195 109.02707
## 69         NA        NA        NA
## 70   79.51071 52.991010 106.03040
## 71   84.44081 57.930372 110.95125
## 72   78.98346 52.477277 105.48964
## 73   83.96361 57.460110 110.46711
## 74   89.57916 63.039440 116.11888
## 75   83.43925 56.910824 109.96768
## 76   88.02691 61.510272 114.54354
## 77   80.89986 54.406198 107.39351
## 78   79.72256 53.230403 106.21472
## 79         NA        NA        NA
## 80         NA        NA        NA
## 81   86.18352 59.679426 112.68761
## 82   85.31286 58.824410 111.80131
## 83   93.21080 66.705286 119.71631
## 84   83.20132 56.713465 109.68917
## 85   86.99480 60.498181 113.49142
## 86   86.50900 60.005737 113.01226
## 87   81.00935 54.520625 107.49807
## 88   84.95889 58.472459 111.44532
## 89   86.50625 60.010381 113.00212
## 90   84.13036 57.630802 110.62991
## 91   79.46163 52.961283 105.96197
## 92   91.54929 64.510035 118.58855
## 93   77.13449 50.629082 103.63989
## 94         NA        NA        NA
## 95         NA        NA        NA
## 96         NA        NA        NA
## 97   80.16339 53.620894 106.70589
## 98   98.42825 71.888622 124.96789
## 99   86.43225 59.918306 112.94619
## 100  85.42591 58.919980 111.93185
## 101  80.92308 54.431792 107.41438
## 102  78.55118 52.047417 105.05495
## 103  86.83507 60.346410 113.32374
## 104  85.70204 59.204695 112.19939
## 105  80.06704 53.548923 106.58516
## 106  71.17562 44.655540  97.69570
## 107  58.98284 32.383883  85.58180
## 108  76.86213 50.357094 103.36716
## 109  81.31581 54.822177 107.80943
## 110  62.27430 35.734486  88.81411
## 111  79.53837 53.043954 106.03279
## 112  77.27456 50.780665 103.76845
## 113  89.85657 63.359518 116.35362
## 114  85.46018 58.964679 111.95567
## 115  79.17474 52.676514 105.67296
## 116  77.30412 50.814711 103.79352
## 117  86.77209 60.277805 113.26637
## 118  76.82168 50.334074 103.30928
## 119  76.54996 50.048467 103.05145
## 120  69.42273 42.902408  95.94305
## 121  85.74711 59.239063 112.25516
## 122        NA        NA        NA
## 123        NA        NA        NA
## 124        NA        NA        NA
## 125  66.81037 40.299350  93.32138
## 126  87.70733 61.177809 114.23685
## 127  91.79341 65.281691 118.30513
## 128  74.14519 47.640457 100.64992
## 129  87.31899 60.821387 113.81660
## 130  92.70447 66.202789 119.20615
## 131  88.45258 61.958966 114.94620
## 132  82.46999 55.968682 108.97130
## 133  77.63265 51.143142 104.12217
## 134  85.79874 59.281380 112.31609
## 135  84.71385 58.212597 111.21510
## 136  71.23197 44.703922  97.76002
## 137  74.38387 47.893756 100.87397
## 138  81.25773 54.764558 107.75090
## 139  81.33524 54.839988 107.83049
## 140  81.51682 55.028242 108.00539
## 141  64.31946 37.787409  90.85152
## 142        NA        NA        NA
## 143  90.56522 64.056742 117.07370
## 144  77.61080 51.122617 104.09898
## 145  76.58713 50.082747 103.09152
## 146  77.16540 50.674682 103.65611
## 147  77.53721 51.047924 104.02649
## 148  82.50523 56.013800 108.99666
## 149  86.34713 59.860022 112.83425
## 150  79.58348 53.092917 106.07405
## 151  81.19546 54.695663 107.69525
## 152  79.72165 53.218305 106.22500
## 153  38.14230  9.671444  66.61316
## 154  76.31577 49.804180 102.82736
## 155  79.34119 52.841722 105.84067
## 156  76.02661 49.522424 102.53079
## 157  79.07280 52.572370 105.57322
## 158  69.05638 42.531509  95.58125
## 159  83.38621 56.884717 109.88771
## 160        NA        NA        NA
## 161  95.67392 69.155948 122.19188
## 162 101.39959 74.845162 127.95402
## 163  89.33578 62.828070 115.84350
## 164  98.79403 72.251586 125.33647
## 165  93.13667 66.598749 119.67458
## 166  88.89552 62.370199 115.42085
## 167  87.43015 60.939838 113.92046
## 168  84.54582 58.034688 111.05696
## 169  75.50910 49.008601 102.00959
## 170  82.27345 55.786672 108.76023
## 171        NA        NA        NA
## 172  81.02350 54.523210 107.52379
## 173  79.13629 52.632790 105.63978
## 174  91.41320 64.899782 117.92663
## 175  82.15181 55.659576 108.64405
## 176  79.02006 52.515129 105.52500
## 177  82.38910 55.869911 108.90829
## 178  79.64150 53.157449 106.12555
## 179  79.15134 52.666391 105.63629
## 180  83.04312 56.552403 109.53384
## 181  77.86320 51.374559 104.35184
## 182  81.52034 55.020824 108.01986
## 183  85.23160 58.743644 111.71955
## 184  82.90413 56.404596 109.40366
## 185  87.27361 60.448857 114.09836
## 186  80.74543 54.231490 107.25937
## 187  84.89423 58.338710 111.44976
## 188  57.55175 30.914293  84.18920
## 189  64.34096 37.803683  90.87823
## 190 108.30933 81.703593 134.91506
## 191        NA        NA        NA
## 192        NA        NA        NA
## 193  76.57875 50.061037 103.09646
## 194  83.07243 56.570850 109.57402
## 195  84.56683 58.069421 111.06424
## 196  76.17460 49.678098 102.67110
## 197  79.64148 53.153393 106.12956
## 198  81.36580 54.852465 107.87914
## 199  80.77785 54.281595 107.27411
## 200  84.64168 58.150637 111.13273
## 201  78.24587 51.745434 104.74631
## 202  81.43457 54.936907 107.93224
## 203  72.02278 45.516040  98.52952
## 204  90.20950 63.709234 116.70977
## 205  80.01207 53.526951 106.49720
## 206  79.15978 52.665504 105.65406
## 207  80.25199 53.754708 106.74926
## 208  74.75166 48.254125 101.24920
## 209  76.41295 49.912877 102.91302
## 210  73.34455 46.823451  99.86565
## 211  92.84737 66.327035 119.36771
## 212  88.62258 62.109220 115.13594
## 213  75.63051 49.135527 102.12550
## 214  73.51613 47.023074 100.00918
## 215  73.67810 47.178008 100.17820
## 216  87.86262 61.371686 114.35356
## 217  83.06827 56.573994 109.56254
## 218  83.37368 56.883498 109.86385
## 219  78.12301 51.629545 104.61648
## 220  74.99128 48.498837 101.48373
## 221  80.39271 53.894096 106.89132
## 222  73.34675 46.831383  99.86212
## 223  81.61724 55.125575 108.10891
## 224  77.29332 50.785272 103.80136
## 225  98.60249 71.827422 125.37756
## 226  76.15014 49.662300 102.63799
## 227  81.50042 55.011996 107.98885
## 228  78.06449 51.553645 104.57533
## 229  79.22578 52.733909 105.71764
## 230  74.54467 48.000409 101.08892
## 231        NA        NA        NA
## 232  90.34274 63.838150 116.84733
## 233  84.62977 58.124956 111.13458
## 234  88.32381 61.820564 114.82706
## 235  79.54459 53.059515 106.02966
## 236  74.76081 48.269671 101.25194
## 237  85.20050 58.685197 111.71580
## 238  75.70614 49.213004 102.19927
## 239  85.35252 58.833598 111.87145
## 240  72.35656 45.858244  98.85488
## 241  86.85867 60.368664 113.34867
## 242  87.71673 61.215383 114.21808
## 243  87.59452 61.095264 114.09377
## 244  81.52432 55.031658 108.01698
## 245  67.34857 40.832827  93.86431
## 246  91.64752 65.145683 118.14936
## 247  82.10924 55.614914 108.60357
## 248  80.32087 53.825183 106.81656
## 249  77.16581 50.666609 103.66501
## 250  81.32372 54.818798 107.82864
## 251  78.66766 52.166697 105.16862
## 252  64.45281 37.861734  91.04389
## 253  89.89188 63.357908 116.42586
## 254  64.64781 36.077918  93.21770
## 255  74.81031 48.321425 101.29919
## 256  82.60197 56.099772 109.10416
## 257  74.49453 47.996356 100.99270
## 258  77.70889 51.224369 104.19341
## 259  72.63756 46.125359  99.14976
  4. Model Selection

After constructing the models above, model 6 is the best based on adjusted R-squared among the final candidates. Its coefficients were the most interpretable and actually made sense within the context of the game: the variables one would expect to hurt the number of wins had negative coefficients, whereas the variables one would expect to improve the chances of winning had positive ones.

There are several recommended next steps for modeling this data. The biggest change I would consider is the use of a GLM family: the data has too many irregularities, and the evaluation data does not even contain TARGET_WINS as a column, even though it is supposed to be a subset of the entire data set.
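As an illustration only, a minimal sketch of that recommendation, assuming a Poisson family because wins are non-negative counts; neither the family choice nor this exact predictor set is established by the analysis above:

# hypothetical GLM alternative to mod6: model wins as a count
mod_glm <- glm(TARGET_WINS ~ TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H +
    TEAM_PITCHING_HR + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_BATTING_1B,
    family = poisson(link = "log"), data = moneyball_training_outlierfree)
summary(mod_glm)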

Additional sources:

http://www.r-tutor.com/elementary-statistics/simple-linear-regression/prediction-interval-linear-regression

https://www.r-bloggers.com/identify-describe-plot-and-remove-the-outliers-from-the-dataset/

Appendix

moneyball_training <- read.csv(url("https://raw.githubusercontent.com/vindication09/DATA-621-Week-1-/master/moneyball-training-data.csv"), header =TRUE)

head(moneyball_training, 10)
moneyball_evaluation <- read.csv(url("https://raw.githubusercontent.com/vindication09/DATA-621-Week-1-/master/moneyball-evaluation-data.csv"), header =TRUE)

head(moneyball_evaluation, 10)
nrow(moneyball_training);ncol(moneyball_training)
library(dplyr)

#moneyball_training1 <- moneyball_training %>% select(-INDEX)
moneyball_training1 <- subset(moneyball_training, select = -c(INDEX))
names(moneyball_training1);

#moneyball_evaluation1 <- moneyball_evaluation %>% select(-INDEX)
moneyball_evaluation1 <- subset(moneyball_evaluation, select = -c(INDEX))
names(moneyball_evaluation1)
library(ggplot2)
library(tidyr)

ggplot(gather(moneyball_training1), aes(value)) + 
    geom_histogram(bins = 10) + 
    facet_wrap(~key, scales = 'free_x')
x <- moneyball_training1$TARGET_WINS
h<-hist(x, breaks=10, col="red", xlab="Target Wins", 
    main="Histogram with Normal Curve", ylim=range(0:1200)) 
xfit<-seq(min(x),max(x),length=40) 
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x)) 
yfit <- yfit*diff(h$mids[1:2])*length(x) 
lines(xfit, yfit, col="blue", lwd=2)
#library(dplyr)
#library(qwraps2)
summary(moneyball_training1)
apply(moneyball_training1,2, function(col)cor(col, moneyball_training1$TARGET_WINS))
correlation_matrix <- round(cor(moneyball_training1),2)

# Get lower triangle of the correlation matrix
  get_lower_tri<-function(correlation_matrix){
    correlation_matrix[upper.tri(correlation_matrix)] <- NA
    return(correlation_matrix)
  }
  # Get upper triangle of the correlation matrix
  get_upper_tri <- function(correlation_matrix){
    correlation_matrix[lower.tri(correlation_matrix)]<- NA
    return(correlation_matrix)
  }
  
  upper_tri <- get_upper_tri(correlation_matrix)
library(reshape2)

# Melt the correlation matrix
melted_correlation_matrix <- melt(upper_tri, na.rm = TRUE)

# Heatmap
library(ggplot2)

ggheatmap <- ggplot(data = melted_correlation_matrix, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Pearson\nCorrelation") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
 coord_fixed()
#add nice labels 
ggheatmap + 
geom_text(aes(Var2, Var1, label = value), color = "black", size = 2) +
theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  axis.text.x=element_text(size=rel(0.8), angle=90),
  axis.text.y=element_text(size=rel(0.8)),
  panel.grid.major = element_blank(),
  panel.border = element_blank(),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  legend.justification = c(1, 0),
  legend.position = c(0.6, 0.7),
  legend.direction = "horizontal")+
 guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                title.position = "top", title.hjust = 0.5))
colSums(is.na(moneyball_training1))

#moneyball_training2 <- moneyball_training1 %>% select(-TEAM_BATTING_HBP, -TEAM_BASERUN_CS, -TEAM_FIELDING_DP, -TEAM_BASERUN_SB)
moneyball_training2 <- subset(moneyball_training1, select = -c(TEAM_BATTING_HBP, TEAM_BASERUN_CS, TEAM_FIELDING_DP, TEAM_BASERUN_SB))
names(moneyball_training2);

#moneyball_evaluation2 <- moneyball_evaluation1 %>% select(-TEAM_BATTING_HBP, -TEAM_BASERUN_CS,-TEAM_FIELDING_DP, -TEAM_BASERUN_SB)
moneyball_evaluation2 <- subset(moneyball_evaluation1, select = -c(TEAM_BATTING_HBP, TEAM_BASERUN_CS, TEAM_FIELDING_DP, TEAM_BASERUN_SB))
names(moneyball_evaluation2)
#Outlier helper: plots a variable with and without boxplot-rule outliers and,
#after a yes at the prompt, replaces the outliers with NA in the calling data frame.
#The data argument is named dt so the function works with any of the data frames below.
outlierKD <- function(dt, var) {
     var_name <- eval(substitute(var), eval(dt))
     na1 <- sum(is.na(var_name))
     m1 <- mean(var_name, na.rm = TRUE)
     par(mfrow=c(2, 2), oma=c(0,0,3,0))
     boxplot(var_name, main="With outliers")
     hist(var_name, main="With outliers", xlab=NA, ylab=NA)
     outlier <- boxplot.stats(var_name)$out
     mo <- mean(outlier)
     var_name <- ifelse(var_name %in% outlier, NA, var_name)
     boxplot(var_name, main="Without outliers")
     hist(var_name, main="Without outliers", xlab=NA, ylab=NA)
     title("Outlier Check", outer=TRUE)
     na2 <- sum(is.na(var_name))
     cat("Outliers identified:", na2 - na1, "\n")
     cat("Proportion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name))*100, 1), "\n")
     cat("Mean of the outliers:", round(mo, 2), "\n")
     m2 <- mean(var_name, na.rm = TRUE)
     cat("Mean without removing outliers:", round(m1, 2), "\n")
     cat("Mean if we remove outliers:", round(m2, 2), "\n")
     response <- readline(prompt="Do you want to remove outliers and to replace with NA? [yes/no]: ")
     if(response == "y" | response == "yes"){
          dt[as.character(substitute(var))] <- invisible(var_name)
          assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
          cat("Outliers successfully removed", "\n")
          return(invisible(dt))
     } else{
          cat("Nothing changed", "\n")
          return(invisible(var_name))
     }
}
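One caveat: outlierKD blocks on readline(), so it only works in an interactive session and will stall a non-interactive knit. A minimal non-interactive sketch of the same idea (replace_outliers_na is our own hypothetical helper, not part of the snippet above) simply sets the flagged values to NA:

#Hypothetical helper: set boxplot-rule outliers of one column to NA, no prompt
replace_outliers_na <- function(dt, var) {
  out <- boxplot.stats(dt[[var]])$out
  dt[[var]][dt[[var]] %in% out] <- NA
  dt
}
#Example: moneyball_training2 <- replace_outliers_na(moneyball_training2, "TEAM_FIELDING_E")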
outlierKD(moneyball_training2, TEAM_FIELDING_E)
outlierKD(moneyball_training2, TEAM_PITCHING_BB)
outlierKD(moneyball_training2, TEAM_PITCHING_H)
outlierKD(moneyball_training2, TEAM_PITCHING_HR)
outlierKD(moneyball_training2, TEAM_PITCHING_SO)
#Derive singles: total hits minus doubles, triples, and home runs
moneyball_training2$TEAM_BATTING_1B <- moneyball_training2$TEAM_BATTING_H - moneyball_training2$TEAM_BATTING_2B - moneyball_training2$TEAM_BATTING_3B - moneyball_training2$TEAM_BATTING_HR
head(moneyball_training2, 2)
outlierKD(moneyball_training2, TEAM_BATTING_1B)
#Same derivation on the evaluation set
moneyball_evaluation2$TEAM_BATTING_1B <- moneyball_evaluation2$TEAM_BATTING_H - moneyball_evaluation2$TEAM_BATTING_2B - moneyball_evaluation2$TEAM_BATTING_3B - moneyball_evaluation2$TEAM_BATTING_HR
colSums(is.na(moneyball_training2))
library(Hmisc)
#Median imputation of the strikeout columns; impute() returns the imputed vector, so assign it back
moneyball_training3 <- moneyball_training2
moneyball_training3$TEAM_BATTING_SO <- impute(moneyball_training3$TEAM_BATTING_SO, median)
moneyball_training3$TEAM_PITCHING_SO <- impute(moneyball_training3$TEAM_PITCHING_SO, median)

#Mean imputation
moneyball_training4 <- moneyball_training2
moneyball_training4$TEAM_BATTING_SO <- impute(moneyball_training4$TEAM_BATTING_SO, mean)
moneyball_training4$TEAM_PITCHING_SO <- impute(moneyball_training4$TEAM_PITCHING_SO, mean)
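The third candidate is mode imputation, but base R's mode() reports an object's storage mode ("numeric"), not its most frequent value, so a small helper is needed. A minimal sketch (stat_mode is our own name, not a Hmisc function):

#Hypothetical helper: the most frequent value of a vector, ignoring NAs
stat_mode <- function(x) {
  x <- x[!is.na(x)]
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}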
#Mode imputation, passing the computed mode as a constant filler
moneyball_training5 <- moneyball_training2
moneyball_training5$TEAM_BATTING_SO <- impute(moneyball_training5$TEAM_BATTING_SO, stat_mode(moneyball_training5$TEAM_BATTING_SO))
moneyball_training5$TEAM_PITCHING_SO <- impute(moneyball_training5$TEAM_PITCHING_SO, stat_mode(moneyball_training5$TEAM_PITCHING_SO))
#Listwise deletion as a fourth candidate, followed by removal of rows containing zeros
moneyball_training6 <- na.omit(moneyball_training2)
##Flag rows in which no value is zero
row_sub <- apply(moneyball_training6, 1, function(row) all(row != 0))
##Keep only those rows (the result must be assigned back to persist)
moneyball_training6 <- moneyball_training6[row_sub, ]
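Before dropping rows wholesale, it is worth seeing where the zeros actually live; a quick per-column count of the pre-filter data (a diagnostic added here, not part of the original workflow) shows which variables drive the removals:

#Count zero entries per column of the NA-free data
colSums(na.omit(moneyball_training2) == 0)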
moneyball_training_outlierfree<-moneyball_training2
moneyball_evaluation_outlierfree<-moneyball_evaluation2

outlierKD(moneyball_training_outlierfree, TEAM_FIELDING_E)
outlierKD(moneyball_training_outlierfree, TEAM_PITCHING_BB)
outlierKD(moneyball_training_outlierfree, TEAM_PITCHING_H)
outlierKD(moneyball_training_outlierfree, TEAM_PITCHING_HR)
outlierKD(moneyball_training_outlierfree, TEAM_PITCHING_SO)
outlierKD(moneyball_training_outlierfree, TEAM_BATTING_1B)

outlierKD(moneyball_evaluation_outlierfree, TEAM_FIELDING_E)
outlierKD(moneyball_evaluation_outlierfree, TEAM_PITCHING_BB)
outlierKD(moneyball_evaluation_outlierfree, TEAM_PITCHING_H)
outlierKD(moneyball_evaluation_outlierfree, TEAM_PITCHING_HR)
outlierKD(moneyball_evaluation_outlierfree, TEAM_PITCHING_SO)
outlierKD(moneyball_evaluation_outlierfree, TEAM_BATTING_1B)

colSums(is.na(moneyball_evaluation_outlierfree))
#impute missing data (assign each imputed vector back so the changes persist)
moneyball_evaluation_outlierfree2 <- moneyball_evaluation_outlierfree
moneyball_evaluation_outlierfree2$TEAM_BATTING_SO <- impute(moneyball_evaluation_outlierfree2$TEAM_BATTING_SO, median)
moneyball_evaluation_outlierfree2$TEAM_PITCHING_H <- impute(moneyball_evaluation_outlierfree2$TEAM_PITCHING_H, median)
moneyball_evaluation_outlierfree2$TEAM_PITCHING_HR <- impute(moneyball_evaluation_outlierfree2$TEAM_PITCHING_HR, median)
moneyball_evaluation_outlierfree2$TEAM_PITCHING_BB <- impute(moneyball_evaluation_outlierfree2$TEAM_PITCHING_BB, median)
moneyball_evaluation_outlierfree2$TEAM_FIELDING_E <- impute(moneyball_evaluation_outlierfree2$TEAM_FIELDING_E, median)
moneyball_evaluation_outlierfree2$TEAM_BATTING_1B <- impute(moneyball_evaluation_outlierfree2$TEAM_BATTING_1B, median)
#Model 1: all remaining predictors on the median-imputed training data
mod1 <- lm(TARGET_WINS~., data=moneyball_training3)
summary(mod1)
par(mfrow=c(2,2))
plot(mod1)
hist(resid(mod1), main="Histogram of Residuals")
library(olsrr)
ols_test_breusch_pagan(mod1)
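The Breusch-Pagan test checks whether the residual variance depends on the fitted values; a small p-value points to heteroscedasticity. If olsrr is unavailable, the lmtest package (assumed installed) provides the same test:

library(lmtest)
#Equivalent Breusch-Pagan test; p < 0.05 suggests non-constant residual variance
bptest(mod1)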

#Model 2: drop the home run, walk, strikeout, and singles terms
mod2 <- lm(TARGET_WINS~.-TEAM_BATTING_HR-TEAM_BATTING_BB-TEAM_BATTING_SO-TEAM_PITCHING_HR-TEAM_PITCHING_BB-TEAM_BATTING_1B, data=moneyball_training3)
summary(mod2)
par(mfrow=c(2,2))
plot(mod2)
hist(resid(mod2), main="Histogram of Residuals")
ols_test_breusch_pagan(mod2)

anova(mod1, mod2)
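The nested F-test above asks whether the terms dropped from Model 1 mattered; AIC and BIC offer a complementary penalized-fit comparison (lower is better):

#Penalized goodness-of-fit comparison of the two candidates; lower values are preferred
AIC(mod1, mod2)
BIC(mod1, mod2)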
#Model 3: the Model 2 formula refit on the mean-imputed training data
mod3 <- lm(TARGET_WINS~.-TEAM_BATTING_HR-TEAM_BATTING_BB-TEAM_BATTING_SO-TEAM_PITCHING_HR-TEAM_PITCHING_BB-TEAM_BATTING_1B, data=moneyball_training4)
summary(mod3)
par(mfrow=c(2,2))
plot(mod3)
hist(resid(mod3), main="Histogram of Residuals")
#Model 4: a reduced predictor set on the mean-imputed training data
mod4 <- lm(TARGET_WINS~TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H + 
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_BATTING_1B, data=moneyball_training4)
summary(mod4)
#Model 5: Model 4's predictors minus TEAM_PITCHING_BB, on the median-imputed data
mod5 <- lm(TARGET_WINS~TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H + 
    TEAM_PITCHING_HR + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_BATTING_1B, data=moneyball_training3)
summary(mod5)
par(mfrow=c(2,2))
plot(mod5)
hist(resid(mod5), main="Histogram of Residuals")
library(car)
vif(mod5)
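A variance inflation factor measures how much collinearity inflates a coefficient's variance; values above roughly 5-10 are commonly treated as problematic. To list only the offending terms:

#Flag predictors whose VIF exceeds the common rule-of-thumb threshold of 10
vif(mod5)[vif(mod5) > 10]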
#Model 6: Model 5's predictors refit on the outlier-free training copy
mod6 <- lm(TARGET_WINS~TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H + 
    TEAM_PITCHING_HR + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_BATTING_1B, data=moneyball_training_outlierfree)
summary(mod6)
par(mfrow=c(2,2))
plot(mod6)
hist(resid(mod6), main="Histogram of Residuals")
ols_test_breusch_pagan(mod6)
#Model 7: a broader predictor set, including triples, on the outlier-free data
mod7 <- lm(TARGET_WINS~ TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H + 
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E, data=moneyball_training_outlierfree)
summary(mod7)
par(mfrow=c(2,2))
plot(mod7)
hist(resid(mod7), main="Histogram of Residuals")
ols_test_breusch_pagan(mod7)
vif(mod7)
#Model 8: a compact four-predictor model on the outlier-free data
mod8 <- lm(TARGET_WINS~ TEAM_BATTING_3B + TEAM_PITCHING_H + TEAM_PITCHING_BB + 
    TEAM_FIELDING_E, data=moneyball_training_outlierfree)
summary(mod8)
par(mfrow=c(2,2))
plot(mod8)
hist(resid(mod8), main="Histogram of Residuals")
ols_test_breusch_pagan(mod8)
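Before scoring the evaluation set, a quick side-by-side view of adjusted R-squared across the eight candidates (a convenience summary added here) makes the model choice easier to see:

#Adjusted R-squared for each candidate model, side by side
sapply(list(mod1=mod1, mod2=mod2, mod3=mod3, mod4=mod4,
            mod5=mod5, mod6=mod6, mod7=mod7, mod8=mod8),
       function(m) summary(m)$adj.r.squared)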
#Score the prepared evaluation set with Model 6
test_results <- predict(mod6, newdata = moneyball_evaluation_outlierfree2)
head(test_results)
#Note: the argument is newdata=, not data=; with data= predict() silently returns the training fits.
#For lm objects type="response" is the default, so test_results2 matches test_results.
test_results2 <- predict(mod6, newdata = moneyball_evaluation_outlierfree2, type = "response")
predict(mod6, newdata=moneyball_evaluation_outlierfree2, interval='prediction')
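To keep the scored evaluation set, the predictions can be written out alongside the original INDEX column (the file name below is our choice, not mandated by the assignment):

#Attach the predictions to the evaluation INDEX and export (hypothetical file name)
predicted_wins <- data.frame(INDEX = moneyball_evaluation$INDEX,
                             TARGET_WINS = round(test_results, 0))
write.csv(predicted_wins, "moneyball_predictions.csv", row.names = FALSE)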

#Appendix: consolidated R code for the analysis above

#read in the data
moneyball_training <- read.csv(url("https://raw.githubusercontent.com/vindication09/DATA-621-Week-1-/master/moneyball-training-data.csv"), header =TRUE)

head(moneyball_training, 10)

moneyball_evaluation <- read.csv(url("https://raw.githubusercontent.com/vindication09/DATA-621-Week-1-/master/moneyball-evaluation-data.csv"), header =TRUE)

head(moneyball_evaluation, 10)

#EDA
nrow(moneyball_training);ncol(moneyball_training)

library(dplyr)

#moneyball_training1 <- moneyball_training %>% select(-INDEX)
moneyball_training1 <- subset(moneyball_training, select = -c(INDEX))
names(moneyball_training1)

#moneyball_evaluation1 <- moneyball_evaluation %>% select(-INDEX)
moneyball_evaluation1 <- subset(moneyball_evaluation, select = -c(INDEX))
names(moneyball_evaluation1)

library(ggplot2)
library(tidyr)

ggplot(gather(moneyball_training1), aes(value)) + 
    geom_histogram(bins = 10) + 
    facet_wrap(~key, scales = 'free_x')


x <- moneyball_training1$TARGET_WINS
h <- hist(x, breaks=10, col="red", xlab="Target Wins",
    main="Histogram with Normal Curve", ylim=c(0, 1200))
xfit <- seq(min(x), max(x), length=40)
yfit <- dnorm(xfit, mean=mean(x), sd=sd(x))
yfit <- yfit * diff(h$mids[1:2]) * length(x)
lines(xfit, yfit, col="blue", lwd=2)


summary(moneyball_training1)


apply(moneyball_training1, 2, function(col) cor(col, moneyball_training1$TARGET_WINS, use = "complete.obs"))

#correlation matrix and visualization 
correlation_matrix <- round(cor(moneyball_training1, use = "pairwise.complete.obs"), 2)

# Get lower triangle of the correlation matrix
  get_lower_tri<-function(correlation_matrix){
    correlation_matrix[upper.tri(correlation_matrix)] <- NA
    return(correlation_matrix)
  }
  # Get upper triangle of the correlation matrix
  get_upper_tri <- function(correlation_matrix){
    correlation_matrix[lower.tri(correlation_matrix)]<- NA
    return(correlation_matrix)
  }
  
  upper_tri <- get_upper_tri(correlation_matrix)



library(reshape2)

# Melt the correlation matrix
melted_correlation_matrix <- melt(upper_tri, na.rm = TRUE)

# Heatmap
library(ggplot2)

ggheatmap <- ggplot(data = melted_correlation_matrix, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Pearson\nCorrelation") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
 coord_fixed()


#add nice labels 
ggheatmap + 
geom_text(aes(Var2, Var1, label = value), color = "black", size = 2) +
theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  axis.text.x=element_text(size=rel(0.8), angle=90),
  axis.text.y=element_text(size=rel(0.8)),
  panel.grid.major = element_blank(),
  panel.border = element_blank(),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  legend.justification = c(1, 0),
  legend.position = c(0.6, 0.7),
  legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                title.position = "top", title.hjust = 0.5))


#missing data 
colSums(is.na(moneyball_training1))

#moneyball_training2 <- moneyball_training1 %>% select(-TEAM_BATTING_HBP, -TEAM_BASERUN_CS, -TEAM_FIELDING_DP, -TEAM_BASERUN_SB)
moneyball_training2 <- subset(moneyball_training1, select = -c(TEAM_BATTING_HBP, TEAM_BASERUN_CS, TEAM_FIELDING_DP, TEAM_BASERUN_SB))
names(moneyball_training2)

#moneyball_evaluation2 <- moneyball_evaluation1 %>% select(-TEAM_BATTING_HBP, -TEAM_BASERUN_CS,-TEAM_FIELDING_DP, -TEAM_BASERUN_SB)
moneyball_evaluation2 <- subset(moneyball_evaluation1, select = -c(TEAM_BATTING_HBP, TEAM_BASERUN_CS, TEAM_FIELDING_DP, TEAM_BASERUN_SB))
names(moneyball_evaluation2)


#outlier detection 
#outlier helper: the data argument is named dt so it works with any data frame
outlierKD <- function(dt, var) {
     var_name <- eval(substitute(var), eval(dt))
     na1 <- sum(is.na(var_name))
     m1 <- mean(var_name, na.rm = TRUE)
     par(mfrow=c(2, 2), oma=c(0,0,3,0))
     boxplot(var_name, main="With outliers")
     hist(var_name, main="With outliers", xlab=NA, ylab=NA)
     outlier <- boxplot.stats(var_name)$out
     mo <- mean(outlier)
     var_name <- ifelse(var_name %in% outlier, NA, var_name)
     boxplot(var_name, main="Without outliers")
     hist(var_name, main="Without outliers", xlab=NA, ylab=NA)
     title("Outlier Check", outer=TRUE)
     na2 <- sum(is.na(var_name))
     cat("Outliers identified:", na2 - na1, "\n")
     cat("Proportion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name))*100, 1), "\n")
     cat("Mean of the outliers:", round(mo, 2), "\n")
     m2 <- mean(var_name, na.rm = TRUE)
     cat("Mean without removing outliers:", round(m1, 2), "\n")
     cat("Mean if we remove outliers:", round(m2, 2), "\n")
     response <- readline(prompt="Do you want to remove outliers and to replace with NA? [yes/no]: ")
     if(response == "y" | response == "yes"){
          dt[as.character(substitute(var))] <- invisible(var_name)
          assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
          cat("Outliers successfully removed", "\n")
          return(invisible(dt))
     } else{
          cat("Nothing changed", "\n")
          return(invisible(var_name))
     }
}

outlierKD(moneyball_training2, TEAM_FIELDING_E)
outlierKD(moneyball_training2, TEAM_PITCHING_BB)
outlierKD(moneyball_training2, TEAM_PITCHING_H)
outlierKD(moneyball_training2, TEAM_PITCHING_HR)
outlierKD(moneyball_training2, TEAM_PITCHING_SO)


#data prep
moneyball_training2$TEAM_BATTING_1B <- moneyball_training2$TEAM_BATTING_H - moneyball_training2$TEAM_BATTING_2B - moneyball_training2$TEAM_BATTING_3B - moneyball_training2$TEAM_BATTING_HR
head(moneyball_training2, 2)

outlierKD(moneyball_training2, TEAM_BATTING_1B)

#same derivation on the evaluation set
moneyball_evaluation2$TEAM_BATTING_1B <- moneyball_evaluation2$TEAM_BATTING_H - moneyball_evaluation2$TEAM_BATTING_2B - moneyball_evaluation2$TEAM_BATTING_3B - moneyball_evaluation2$TEAM_BATTING_HR

colSums(is.na(moneyball_training2))

library(Hmisc)
#median imputation (impute() results must be assigned back)
moneyball_training3 <- moneyball_training2
moneyball_training3$TEAM_BATTING_SO <- impute(moneyball_training3$TEAM_BATTING_SO, median)
moneyball_training3$TEAM_PITCHING_SO <- impute(moneyball_training3$TEAM_PITCHING_SO, median)

#mean imputation
moneyball_training4 <- moneyball_training2
moneyball_training4$TEAM_BATTING_SO <- impute(moneyball_training4$TEAM_BATTING_SO, mean)
moneyball_training4$TEAM_PITCHING_SO <- impute(moneyball_training4$TEAM_PITCHING_SO, mean)

#mode imputation via the stat_mode helper defined earlier
moneyball_training5 <- moneyball_training2
moneyball_training5$TEAM_BATTING_SO <- impute(moneyball_training5$TEAM_BATTING_SO, stat_mode(moneyball_training5$TEAM_BATTING_SO))
moneyball_training5$TEAM_PITCHING_SO <- impute(moneyball_training5$TEAM_PITCHING_SO, stat_mode(moneyball_training5$TEAM_PITCHING_SO))

moneyball_training6 <- na.omit(moneyball_training2)
##Flag rows in which no value is zero
row_sub <- apply(moneyball_training6, 1, function(row) all(row != 0))
##Keep only those rows
moneyball_training6 <- moneyball_training6[row_sub, ]


#build models
mod1<-lm(TARGET_WINS~., data=moneyball_training3)
summary(mod1)
par(mfrow=c(2,2))
plot(mod1)
hist(resid(mod1), main="Histogram of Residuals")
library(olsrr)
ols_test_breusch_pagan(mod1)


mod2<-lm(TARGET_WINS~.-TEAM_BATTING_HR-TEAM_BATTING_BB-TEAM_BATTING_SO-TEAM_PITCHING_HR-TEAM_PITCHING_BB-TEAM_BATTING_1B, data=moneyball_training3)
summary(mod2)
par(mfrow=c(2,2))
plot(mod2)
hist(resid(mod2), main="Histogram of Residuals")
library(olsrr)
ols_test_breusch_pagan(mod2)

anova(mod1, mod2)

mod3<-lm(TARGET_WINS~.-TEAM_BATTING_HR-TEAM_BATTING_BB-TEAM_BATTING_SO-TEAM_PITCHING_HR-TEAM_PITCHING_BB-TEAM_BATTING_1B, data=moneyball_training4)
summary(mod3)
par(mfrow=c(2,2))
plot(mod3)
hist(resid(mod3), main="Histogram of Residuals")


mod4<-lm(TARGET_WINS~TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H + 
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_BATTING_1B, data=moneyball_training4)
summary(mod4)



mod5<-lm(TARGET_WINS~TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H + 
    TEAM_PITCHING_HR + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_BATTING_1B, data=moneyball_training3)
summary(mod5)

library(car)
vif(mod5)

#Model 6: Model 5's predictors on outlier-free copies of the data
#(outlierKD and the evaluation-set median imputation are applied interactively in the body above)
moneyball_training_outlierfree <- moneyball_training2
moneyball_evaluation_outlierfree2 <- moneyball_evaluation2
mod6 <- lm(TARGET_WINS~TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_PITCHING_H + 
    TEAM_PITCHING_HR + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_BATTING_1B, data=moneyball_training_outlierfree)
summary(mod6)
par(mfrow=c(2,2))
plot(mod6)
hist(resid(mod6), main="Histogram of Residuals")

#prediction 
test_results <- predict(mod6, newdata = moneyball_evaluation_outlierfree2)
head(test_results)