library(corrplot)
library(psych)
library(ggplot2)
require(gridExtra)
library(car)
library(mice)
library(VIM)
library(caret)
library(dplyr)
library(MASS)

Data Exploration

Read Data

Here, we read the dataset and shorten the feature names for better readibility in visualizations.

#setwd("/Users/elinaazrilyan/Documents/Data621/")
df <- read.csv("https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Moneyball%20Regression/moneyball-training-data.csv")[-1]
names(df) <- sub("TEAM_", "", names(df))
names(df) <- sub("BATTING_", "bt_", names(df))
names(df) <- sub("BASERUN_", "br_", names(df))
names(df) <- sub("FIELDING_", "fd_", names(df))
names(df) <- sub("PITCHING_", "ph_", names(df))
names(df) <- sub("TARGET_", "", names(df))
head(df)

##   WINS bt_H bt_2B bt_3B bt_HR bt_BB bt_SO br_SB br_CS bt_HBP ph_H ph_HR
## 1   39 1445   194    39    13   143   842    NA    NA     NA 9364    84
## 2   70 1339   219    22   190   685  1075    37    28     NA 1347   191
## 3   86 1377   232    35   137   602   917    46    27     NA 1377   137
## 4   70 1387   209    38    96   451   922    43    30     NA 1396    97
## 5   82 1297   186    27   102   472   920    49    39     NA 1297   102
## 6   75 1279   200    36    92   443   973   107    59     NA 1279    92
##   ph_BB ph_SO fd_E fd_DP
## 1   927  5456 1011    NA
## 2   689  1082  193   155
## 3   602   917  175   153
## 4   454   928  164   156
## 5   472   920  138   168
## 6   443   973  123   149

Summary

First, we take a look at a summary of the data. A few things of interest are revealed:

bt_SO, br_SB, br_CS, bt_HBP, ph_SO, and fd_DP have missing values
The max values of ph_H, ph_BB, ph_SO, and fd_E seem abnormally high

summary(df)

##       WINS             bt_H          bt_2B           bt_3B       
##  Min.   :  0.00   Min.   : 891   Min.   : 69.0   Min.   :  0.00  
##  1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0   1st Qu.: 34.00  
##  Median : 82.00   Median :1454   Median :238.0   Median : 47.00  
##  Mean   : 80.79   Mean   :1469   Mean   :241.2   Mean   : 55.25  
##  3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0   3rd Qu.: 72.00  
##  Max.   :146.00   Max.   :2554   Max.   :458.0   Max.   :223.00  
##                                                                  
##      bt_HR            bt_BB           bt_SO            br_SB      
##  Min.   :  0.00   Min.   :  0.0   Min.   :   0.0   Min.   :  0.0  
##  1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0   1st Qu.: 66.0  
##  Median :102.00   Median :512.0   Median : 750.0   Median :101.0  
##  Mean   : 99.61   Mean   :501.6   Mean   : 735.6   Mean   :124.8  
##  3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0   3rd Qu.:156.0  
##  Max.   :264.00   Max.   :878.0   Max.   :1399.0   Max.   :697.0  
##                                   NA's   :102      NA's   :131    
##      br_CS           bt_HBP           ph_H           ph_HR      
##  Min.   :  0.0   Min.   :29.00   Min.   : 1137   Min.   :  0.0  
##  1st Qu.: 38.0   1st Qu.:50.50   1st Qu.: 1419   1st Qu.: 50.0  
##  Median : 49.0   Median :58.00   Median : 1518   Median :107.0  
##  Mean   : 52.8   Mean   :59.36   Mean   : 1779   Mean   :105.7  
##  3rd Qu.: 62.0   3rd Qu.:67.00   3rd Qu.: 1682   3rd Qu.:150.0  
##  Max.   :201.0   Max.   :95.00   Max.   :30132   Max.   :343.0  
##  NA's   :772     NA's   :2085                                   
##      ph_BB            ph_SO              fd_E            fd_DP      
##  Min.   :   0.0   Min.   :    0.0   Min.   :  65.0   Min.   : 52.0  
##  1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0   1st Qu.:131.0  
##  Median : 536.5   Median :  813.5   Median : 159.0   Median :149.0  
##  Mean   : 553.0   Mean   :  817.7   Mean   : 246.5   Mean   :146.4  
##  3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2   3rd Qu.:164.0  
##  Max.   :3645.0   Max.   :19278.0   Max.   :1898.0   Max.   :228.0  
##                   NA's   :102                        NA's   :286

Histogram

Next, we create histograms of each of the features and target variable.

bt_H, bt_2B, bt_BB, br_CS, bt_HBP, fd_DP, WINS all have normal distributions
ph_H, ph_BB, ph_SO, and fd_E are highly right-skewed

grid.arrange(ggplot(df, aes(bt_H)) + geom_histogram(binwidth = 30),
             ggplot(df, aes(bt_2B)) + geom_histogram(binwidth = 10),
             ggplot(df, aes(bt_3B)) + geom_histogram(binwidth = 10),
             ggplot(df, aes(bt_HR)) + geom_histogram(binwidth = 10),
             ggplot(df, aes(bt_BB)) + geom_histogram(binwidth = 30),
             ggplot(df, aes(bt_SO)) + geom_histogram(binwidth = 50),
             ggplot(df, aes(br_SB)) + geom_histogram(binwidth = 30),
             ggplot(df, aes(br_CS)) + geom_histogram(binwidth = 10),
             ggplot(df, aes(bt_HBP)) + geom_histogram(binwidth = 3),
             ggplot(df, aes(ph_H)) + geom_histogram(binwidth = 100),
             ggplot(df, aes(ph_HR)) + geom_histogram(binwidth = 10),
             ggplot(df, aes(ph_BB)) + geom_histogram(binwidth = 100),
             ggplot(df, aes(ph_SO)) + geom_histogram(binwidth = 30),
             ggplot(df, aes(fd_E)) + geom_histogram(binwidth = 30),
             ggplot(df, aes(fd_DP)) + geom_histogram(binwidth = 10),
             ggplot(df, aes(WINS)) + geom_histogram(binwidth = 5),
             ncol=4)

QQ Plots

Most of the features are not lined up with the theoretical QQ plot, however this will be addressed by the models we build.

Boxplot

Most of the boxplots shown below reflect a long right tail with many outliers.

grid.arrange(ggplot(df, aes(x = "bt_H", y = bt_H))+geom_boxplot(),
             ggplot(df, aes(x = "bt_2B", y = bt_2B))+geom_boxplot(),
             ggplot(df, aes(x = "bt_3B", y = bt_3B))+geom_boxplot(),
             ggplot(df, aes(x = "bt_HR", y = bt_HR))+geom_boxplot(),
             ggplot(df, aes(x = "bt_BB", y = bt_BB))+geom_boxplot(),
             ggplot(df, aes(x = "bt_SO", y = bt_SO))+geom_boxplot(),
             ggplot(df, aes(x = "br_SB", y = br_SB))+geom_boxplot(),
             ggplot(df, aes(x = "br_CS", y = br_CS))+geom_boxplot(),
             ggplot(df, aes(x = "bt_HBP", y = bt_HBP))+geom_boxplot(),
             ggplot(df, aes(x = "ph_H", y = ph_H))+geom_boxplot(),
             ggplot(df, aes(x = "ph_HR", y = ph_HR))+geom_boxplot(),
             ggplot(df, aes(x = "ph_BB", y = ph_BB))+geom_boxplot(),
             ggplot(df, aes(x = "ph_SO", y = ph_SO))+geom_boxplot(),
             ggplot(df, aes(x = "fd_E", y = fd_E))+geom_boxplot(),
             ggplot(df, aes(x = "fd_DP", y = fd_DP))+geom_boxplot(),
             ggplot(df, aes(x = "WINS", y = WINS))+geom_boxplot(),
             ncol=4)

Correlation Plot

There is a strong positive correlation between ph_H and bt_H
There is a strong positive correlation between ph_HR and bt_HR
There is a strong positive correlation between ph_BB and bt_BB
There is a strong positive correlation between ph_SO and bt_SO
There seems to be a weak correlation between bt_HBP/br_SB and Wins

corrplot(cor(df, use = "complete.obs"), method="color", type="lower", tl.col = "black", tl.srt = 25)

Scatter Plots

Here, we see a scatter plot of each of the feature variables with the target variable.

grid.arrange(ggplot(df, aes(bt_H, WINS)) + geom_point(),
             ggplot(df, aes(bt_2B, WINS)) + geom_point(),
             ggplot(df, aes(bt_3B, WINS)) + geom_point(),
             ggplot(df, aes(bt_HR, WINS)) + geom_point(),
             ggplot(df, aes(bt_BB, WINS)) + geom_point(),
             ggplot(df, aes(bt_SO, WINS)) + geom_point(),
             ggplot(df, aes(br_SB, WINS)) + geom_point(),
             ggplot(df, aes(br_CS, WINS)) + geom_point(),
             ggplot(df, aes(bt_HBP, WINS)) + geom_point(),
             ggplot(df, aes(ph_H, WINS)) + geom_point(),
             ggplot(df, aes(ph_HR, WINS)) + geom_point(),
             ggplot(df, aes(ph_BB, WINS)) + geom_point(),
             ggplot(df, aes(ph_SO, WINS)) + geom_point(),
             ggplot(df, aes(fd_E, WINS)) + geom_point(),
             ggplot(df, aes(fd_DP, WINS)) + geom_point(),
             ncol=4)

Data Preparation

Outliers

Extreme Values

While exploring the data, we noticed that the max values of ph_H, ph_BB, ph_SO, and fd_E seem abnormally high.

We see that the record for most hits in a season by team (ph_H) was set at 1,724 in 1921. However, we also know that the datapoints were normalized for 162 games in a season. To take a moderate approach, we will remove the some of the most egggregious outliers that are seen in these variables.

grid.arrange(ggplot(df, aes(x = "ph_H", y = ph_H))+geom_boxplot(),
             ggplot(df, aes(x = "ph_BB", y = ph_BB))+geom_boxplot(),
             ggplot(df, aes(x = "ph_SO", y = ph_SO))+geom_boxplot(),
             ggplot(df, aes(x = "fd_E", y = fd_E))+geom_boxplot(),
             ncol=4)

df <- filter(df, ph_H < 15000 | ph_BB < 1500 | ph_SO < 3000 | fd_E < 1500)

Cooks Distance

We will also remove influencial outliers using Cooks distance.

mod <- lm(WINS ~ ., data=df)
cooksd <- cooks.distance(mod)
plot(cooksd, pch="*", cex=2, main="Influential Outliers by Cooks distance")
abline(h = 4*mean(cooksd, na.rm=T), col="red")  # add cutoff line
text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T),names(cooksd),""), col="red")  # add labels

We remove the influencial outliers.

influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=T))])
df <- df[-influential, ]

Fill Missing Values

The following features have missing values.

bt_SO - Strikeouts by batters
br_SB - Stolen bases
br_CS - Caught stealing
bt_HBP - Batters hit by pitch (get a free base)
ph_SO - Strikeouts by pitchers
fd_DP - Double Plays

Since most values in bt_HBP are missing (90%), we will drop this feature.

Multivariate Imputation by Chained Equations (mice)

We will use Multivariable Imputation by Chained Equations (mice) to fill the missing variables.

df <- subset(df, select = -c(bt_HBP))
aggr_plot <- aggr(df, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))

## 
##  Variables sorted by number of missings: 
##  Variable      Count
##     br_CS 0.33934066
##     fd_DP 0.12571429
##     br_SB 0.05758242
##     bt_SO 0.04483516
##     ph_SO 0.04483516
##      WINS 0.00000000
##      bt_H 0.00000000
##     bt_2B 0.00000000
##     bt_3B 0.00000000
##     bt_HR 0.00000000
##     bt_BB 0.00000000
##      ph_H 0.00000000
##     ph_HR 0.00000000
##     ph_BB 0.00000000
##      fd_E 0.00000000

Address Correlated Features

While exploring the data, we noticed several features had strong positive linear relationships.

Let’s run a Variance Inflation Factor test to detect multicollinearity. Features with a VIF score > 10 will be reviewed.

model1 <- lm(WINS ~., data = df)
car::vif(model1)

##      bt_H     bt_2B     bt_3B     bt_HR     bt_BB     bt_SO     br_SB 
##  3.820596  2.467157  2.989892 36.501400  6.787771  5.279911  3.862460 
##     br_CS      ph_H     ph_HR     ph_BB     ph_SO      fd_E     fd_DP 
##  3.793169  4.073762 29.596294  6.468847  3.369127  4.988328  1.902235

Let’s make another correlation plot with only these features.

bt_SO (strikeouts by batters) and bt_H (base hits by batters) have a strong positive correlation
bt_H (base hits by batters) and bt_BB (walks by batters) have a strong positive correlation
ph_BB (walks allowed) and bt_BB (walks by batters) have a strong negative correlation
ph_SO (strikeouts by pitchers) and bt_SO (strikeouts by batters) have a moderate negative correlation
ph_HR (homeruns allowed) and bt_HR (homeruns by batters) have a strong negative correlation
ph_SO (strikeouts by pitchers) and ph_BB (walks allowed) have a moderate negative correation

corrplot(cor(subset(df, select = c(WINS, bt_H, bt_HR, bt_BB, bt_SO, ph_H, ph_HR, ph_BB, ph_SO)), use = "complete.obs"), method="color", type="lower", tl.col = "black", tl.srt = 25)

To fix this, we can remove some correlated features and combine others.

Remove bt_HR. It has an extremely strong correlation with ph_HR.
Remove bt_SO. It has an extremely strong correlation with ph_SO.
Replace bt_H (total base hits by batters) with BT_1B = bt_H - BT_2B - BT_3B - BT_HR (1B base hits)
Replace ph_BB and bt_BB as a ratio of walks by batters to walks allowed

df$bt_1B <- df$bt_H - df$bt_2B - df$bt_3B - df$bt_HR
df$BB <- df$bt_BB / df$ph_BB
df2 <- subset(df, select = -c(bt_HR, bt_SO, bt_H, bt_BB, ph_BB))

These adjustments result in less multicollinearity.

model1 <- lm(WINS ~., data = df2)
car::vif(model1)

##    bt_2B    bt_3B    br_SB    br_CS     ph_H    ph_HR    ph_SO     fd_E 
## 1.553145 2.338689 3.650821 3.686438 3.628940 2.311793 1.832450 6.805560 
##    fd_DP    bt_1B       BB 
## 1.865776 2.664315 5.725045

Create Output

#write.csv(df, "C:\\Users\\mkive\\Documents\\GitHub\\Business-Analytics-Data-Mining\\Business-Analytics-Data-Mining\\Moneyball Regression\\baseball_output.csv")

Linear Model 1.

We will begin with all independent variables and use the back elimination method to eliminate the non-significant ones.

be_lm1 <- lm(WINS ~., data = df)
summary(be_lm1)

## 
## Call:
## lm(formula = WINS ~ ., data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.840  -8.261   0.255   8.063  58.803 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.126e+01  7.397e+00   9.634  < 2e-16 ***
## bt_H         4.312e-02  3.685e-03  11.700  < 2e-16 ***
## bt_2B       -2.292e-02  8.895e-03  -2.576 0.010051 *  
## bt_3B        6.010e-02  1.661e-02   3.619 0.000302 ***
## bt_HR        1.705e-01  3.091e-02   5.514 3.90e-08 ***
## bt_BB        2.336e-02  6.080e-03   3.843 0.000125 ***
## bt_SO       -1.179e-02  2.504e-03  -4.708 2.66e-06 ***
## br_SB        4.750e-02  5.313e-03   8.940  < 2e-16 ***
## br_CS       -8.480e-04  1.071e-02  -0.079 0.936878    
## ph_H         1.000e-03  4.370e-04   2.289 0.022175 *  
## ph_HR       -8.838e-02  2.837e-02  -3.116 0.001859 ** 
## ph_BB       -8.976e-03  4.329e-03  -2.073 0.038249 *  
## ph_SO        1.158e-03  8.935e-04   1.296 0.195121    
## fd_E        -5.340e-02  3.370e-03 -15.845  < 2e-16 ***
## fd_DP       -1.128e-01  1.233e-02  -9.147  < 2e-16 ***
## bt_1B               NA         NA      NA       NA    
## BB          -4.075e+01  5.829e+00  -6.991 3.58e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.54 on 2258 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3634, Adjusted R-squared:  0.3592 
## F-statistic: 85.94 on 15 and 2258 DF,  p-value: < 2.2e-16

We will start by eliminating the variables with high p-values and lowest significance from the model

Let’s take a look at the resulting model:

be_lm1_1 <- lm(WINS ~ bt_H + bt_SO + br_SB + ph_HR + fd_E + fd_DP + BB, data = df)
summary(be_lm1_1)

## 
## Call:
## lm(formula = WINS ~ bt_H + bt_SO + br_SB + ph_HR + fd_E + fd_DP + 
##     BB, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.406  -8.681   0.188   8.571  50.992 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  55.032320   6.046549   9.101  < 2e-16 ***
## bt_H          0.041900   0.002537  16.517  < 2e-16 ***
## bt_SO        -0.012711   0.002075  -6.126 1.06e-09 ***
## br_SB         0.049784   0.003897  12.774  < 2e-16 ***
## ph_HR         0.064094   0.008003   8.009 1.83e-15 ***
## fd_E         -0.045279   0.002865 -15.804  < 2e-16 ***
## fd_DP        -0.104410   0.011864  -8.800  < 2e-16 ***
## BB          -15.324908   3.851413  -3.979 7.14e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.71 on 2266 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3433, Adjusted R-squared:  0.3413 
## F-statistic: 169.2 on 7 and 2266 DF,  p-value: < 2.2e-16

We are seeing high significance indicators and p-values of 0 across all 10 remaining variables, however our R squared value is rather low - 36.

The next step is to check residuals plot and QQ plot to check the validity of our model.

plot(be_lm1_1$residuals)
abline(h = 0, lty = 3)

qqnorm(be_lm1_1$residuals)
qqline(be_lm1_1$residuals)

Both of these plots show that the model is a reasonable model. There is no pattern evident in the residuals and normality assumptions is close enough, even though there are some outliers.

We are going to use Box-Cox transformation to determine if a transformation is required.

boxcox(be_lm1_1, data=df, plotit=T)

Lambda is close to 1, so no transformation is needed.

Linear Model 2.

This Linear Model will be built using the variables we believe would have the highest corelation with WINs.

THe following variables will be used: - Base Hits by batters (1B,2B,3B,HR) - Walks by batters - Stolen bases - Strikeouts by batters - Errors - Strikeouts by pitchers - Double Plays - Hits allowed

be_lm2b <- lm(WINS ~ bt_H + bt_BB + br_SB + bt_SO + fd_E + ph_SO + fd_DP + ph_H, df)
summary(be_lm2b)

## 
## Call:
## lm(formula = WINS ~ bt_H + bt_BB + br_SB + bt_SO + fd_E + ph_SO + 
##     fd_DP + ph_H, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.371  -8.782   0.096   8.404  48.478 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14.7559399  4.0363642   3.656 0.000262 ***
## bt_H         0.0516053  0.0022156  23.292  < 2e-16 ***
## bt_BB        0.0148564  0.0030068   4.941 8.34e-07 ***
## br_SB        0.0402606  0.0040219  10.010  < 2e-16 ***
## bt_SO       -0.0023137  0.0016423  -1.409 0.159035    
## fd_E        -0.0368034  0.0026176 -14.060  < 2e-16 ***
## ph_SO        0.0007058  0.0006636   1.063 0.287674    
## fd_DP       -0.1018142  0.0124835  -8.156 5.68e-16 ***
## ph_H         0.0011261  0.0003389   3.323 0.000904 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.88 on 2266 degrees of freedom
## Multiple R-squared:  0.3335, Adjusted R-squared:  0.3312 
## F-statistic: 141.7 on 8 and 2266 DF,  p-value: < 2.2e-16

Let’s remove the two variables with low significance:

be_lm2b <- lm(WINS ~ I(bt_H^1/2) + I(bt_BB^1/2) +  I(br_SB^1/2) + I(fd_E^1/2) + I(fd_DP^1/2) + I(ph_H^1/2), df)
summary(be_lm2b)

## 
## Call:
## lm(formula = WINS ~ I(bt_H^1/2) + I(bt_BB^1/2) + I(br_SB^1/2) + 
##     I(fd_E^1/2) + I(fd_DP^1/2) + I(ph_H^1/2), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.801  -8.782   0.074   8.372  47.873 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  12.9481124  3.3773778   3.834  0.00013 ***
## I(bt_H^1/2)   0.1039919  0.0040143  25.906  < 2e-16 ***
## I(bt_BB^1/2)  0.0292321  0.0060052   4.868 1.21e-06 ***
## I(br_SB^1/2)  0.0814860  0.0077994  10.448  < 2e-16 ***
## I(fd_E^1/2)  -0.0726319  0.0048402 -15.006  < 2e-16 ***
## I(fd_DP^1/2) -0.2063788  0.0248440  -8.307  < 2e-16 ***
## I(ph_H^1/2)   0.0025359  0.0005943   4.267 2.06e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.88 on 2268 degrees of freedom
## Multiple R-squared:  0.3329, Adjusted R-squared:  0.3311 
## F-statistic: 188.6 on 6 and 2268 DF,  p-value: < 2.2e-16

plot(be_lm2b$residuals)
abline(h = 0, lty = 3)

qqnorm(be_lm2b$residuals)
qqline(be_lm2b$residuals)

dfeval <- read.csv("https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Moneyball%20Regression/moneyball-evaluation-data.csv")[-1]
names(dfeval) <- sub("TEAM_", "", names(dfeval))
names(dfeval) <- sub("BATTING_", "bt_", names(dfeval))
names(dfeval) <- sub("BASERUN_", "br_", names(dfeval))
names(dfeval) <- sub("FIELDING_", "fd_", names(dfeval))
names(dfeval) <- sub("PITCHING_", "ph_", names(dfeval))
names(dfeval) <- sub("TARGET_", "", names(dfeval))
head(dfeval)

##   bt_H bt_2B bt_3B bt_HR bt_BB bt_SO br_SB br_CS bt_HBP ph_H ph_HR ph_BB
## 1 1209   170    33    83   447  1080    62    50     NA 1209    83   447
## 2 1221   151    29    88   516   929    54    39     NA 1221    88   516
## 3 1395   183    29    93   509   816    59    47     NA 1395    93   509
## 4 1539   309    29   159   486   914   148    57     42 1539   159   486
## 5 1445   203    68     5    95   416    NA    NA     NA 3902    14   257
## 6 1431   236    53    10   215   377    NA    NA     NA 2793    20   420
##   ph_SO fd_E fd_DP
## 1  1080  140   156
## 2   929  135   164
## 3   816  156   153
## 4   914  124   154
## 5  1123  616   130
## 6   736  572   105

pred <- predict(be_lm2b, dfeval, type="response", se.fit=FALSE)
final <- data.frame(Win_Pred=pred)
Mean = mean(final[, 1], na.rm = TRUE)
final[,1][is.na(final[,1])] <- Mean
#write.csv(final,"Baseball_pred.csv", row.names = FALSE)

plot(be_lm2b)

Baseball Data - Data Exploration and Preparation

Mary Anna Kivenson, Elina Azrilyan

February 15, 2020