library(tidyverse)
library(ggplot2)
library(ggthemes)
library(reshape2)
library(knitr)
We first examine the relationships between each predictor and TARGET_WINS using scatterplots. We can also visualize the variables using histograms to check for non-normal distributions:
One of the key takeaways here is that strikeouts by batters has a bimodal
distribution. Several variables, such as strikeouts by pitchers and walks
allowed, are right-skewed. TARGET_WINS (i.e., our outcome of interest) is
approximately normally distributed.
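The histogram code itself is not shown above; the following is a minimal sketch of how such a grid could be produced, assuming df.training is the training data frame used throughout and treating INDEX as an identifier column to be dropped.
# Hedged sketch: faceted histograms of every numeric variable in df.training
# (assumes df.training and its INDEX identifier column exist as used below)
df.training %>%
  dplyr::select(-INDEX) %>%
  reshape2::melt(id.vars = NULL) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~ variable, scales = "free")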
lmod <- lm(TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
             TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
             TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_BATTING_HBP +
             TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
             TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
           data = df.training)
coef(lmod)
## (Intercept) TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 60.28826257 1.91347621 0.02638808 -0.10117554
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## -4.84370721 -4.45969136 0.34196258 0.03304398
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## -0.01104427 0.08247269 -1.89095685 4.93043182
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 4.51089069 -0.37364495 -0.17204198 -0.10819208
summary(lmod)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP, data = df.training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.8708 -5.6564 -0.0599 5.2545 22.9274
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.28826 19.67842 3.064 0.00253 **
## TEAM_BATTING_H 1.91348 2.76139 0.693 0.48927
## TEAM_BATTING_2B 0.02639 0.03029 0.871 0.38484
## TEAM_BATTING_3B -0.10118 0.07751 -1.305 0.19348
## TEAM_BATTING_HR -4.84371 10.50851 -0.461 0.64542
## TEAM_BATTING_BB -4.45969 3.63624 -1.226 0.22167
## TEAM_BATTING_SO 0.34196 2.59876 0.132 0.89546
## TEAM_BASERUN_SB 0.03304 0.02867 1.152 0.25071
## TEAM_BASERUN_CS -0.01104 0.07143 -0.155 0.87730
## TEAM_BATTING_HBP 0.08247 0.04960 1.663 0.09815 .
## TEAM_PITCHING_H -1.89096 2.76095 -0.685 0.49432
## TEAM_PITCHING_HR 4.93043 10.50664 0.469 0.63946
## TEAM_PITCHING_BB 4.51089 3.63372 1.241 0.21612
## TEAM_PITCHING_SO -0.37364 2.59705 -0.144 0.88577
## TEAM_FIELDING_E -0.17204 0.04140 -4.155 5.08e-05 ***
## TEAM_FIELDING_DP -0.10819 0.03654 -2.961 0.00349 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.467 on 175 degrees of freedom
## (2085 observations deleted due to missingness)
## Multiple R-squared: 0.5501, Adjusted R-squared: 0.5116
## F-statistic: 14.27 on 15 and 175 DF, p-value: < 2.2e-16
# Standard diagnostic plots for the baseline model
plot(lmod)
# Density and empirical CDF of the residuals
d <- density(lmod[['residuals']])
plot(d, main = 'Residual Plot', xlab = 'Residual value')
plot(ecdf(lmod[['residuals']]))
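A formal test can complement these residual plots; as a hedged addition that is not part of the original analysis, the Shapiro-Wilk test gives a single p-value for normality of the baseline model's residuals.
# Hedged addition: Shapiro-Wilk normality test on the baseline residuals
# (the 191 complete cases are well within the test's size limits)
shapiro.test(residuals(lmod))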
lmodel <- lm(log(TARGET_WINS) ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
               TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
               TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_BATTING_HBP +
               TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
               TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
             data = df.training)
library(moments)
skewness(lmodel$residuals)
## [1] -0.0625463
summary(lmodel)
##
## Call:
## lm(formula = log(TARGET_WINS) ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP, data = df.training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.262518 -0.068128 0.003095 0.068237 0.286990
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.1341043 0.2506709 16.492 < 2e-16 ***
## TEAM_BATTING_H 0.0251235 0.0351756 0.714 0.4760
## TEAM_BATTING_2B 0.0004703 0.0003858 1.219 0.2245
## TEAM_BATTING_3B -0.0012175 0.0009873 -1.233 0.2192
## TEAM_BATTING_HR -0.0587638 0.1338613 -0.439 0.6612
## TEAM_BATTING_BB -0.0579397 0.0463198 -1.251 0.2127
## TEAM_BATTING_SO 0.0039398 0.0331039 0.119 0.9054
## TEAM_BASERUN_SB 0.0004387 0.0003652 1.201 0.2313
## TEAM_BASERUN_CS -0.0001988 0.0009099 -0.218 0.8273
## TEAM_BATTING_HBP 0.0011143 0.0006318 1.764 0.0795 .
## TEAM_PITCHING_H -0.0248689 0.0351699 -0.707 0.4804
## TEAM_PITCHING_HR 0.0599100 0.1338375 0.448 0.6550
## TEAM_PITCHING_BB 0.0585912 0.0462877 1.266 0.2073
## TEAM_PITCHING_SO -0.0043423 0.0330822 -0.131 0.8957
## TEAM_FIELDING_E -0.0023193 0.0005274 -4.397 1.9e-05 ***
## TEAM_FIELDING_DP -0.0014222 0.0004655 -3.055 0.0026 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1079 on 175 degrees of freedom
## (2085 observations deleted due to missingness)
## Multiple R-squared: 0.5597, Adjusted R-squared: 0.5219
## F-statistic: 14.83 on 15 and 175 DF, p-value: < 2.2e-16
plot(lmodel)
# Density of the log-model residuals
ld <- density(lmodel[['residuals']])
plot(ld, main = 'Residual Plot', xlab = 'Residual value')
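Note that fitted values and predictions from lmodel are on the log scale; a minimal back-transformation sketch (an assumption about how the model would be used, not code from the original) is:
# Hedged sketch: exponentiate log-scale fitted values to return to wins
head(exp(predict(lmodel)))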
# sqrt-transformed response regressed on all remaining columns (note: this
# includes the INDEX identifier, which appears in the coefficient table below)
l2model <- lm(sqrt(TARGET_WINS) ~ ., data = df.training)
summary(l2model)
##
## Call:
## lm(formula = sqrt(TARGET_WINS) ~ ., data = df.training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.14135 -0.29908 0.01654 0.29094 1.27950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.858e+00 1.107e+00 7.097 3.10e-11 ***
## INDEX -1.548e-05 4.773e-05 -0.324 0.74612
## TEAM_BATTING_H 1.031e-01 1.566e-01 0.659 0.51094
## TEAM_BATTING_2B 1.788e-03 1.705e-03 1.048 0.29588
## TEAM_BATTING_3B -5.560e-03 4.361e-03 -1.275 0.20401
## TEAM_BATTING_HR -2.532e-01 5.927e-01 -0.427 0.66983
## TEAM_BATTING_BB -2.534e-01 2.045e-01 -1.239 0.21707
## TEAM_BATTING_SO 2.379e-02 1.472e-01 0.162 0.87174
## TEAM_BASERUN_SB 1.918e-03 1.616e-03 1.187 0.23704
## TEAM_BASERUN_CS -8.446e-04 4.036e-03 -0.209 0.83449
## TEAM_BATTING_HBP 4.845e-03 2.800e-03 1.731 0.08528 .
## TEAM_PITCHING_H -1.019e-01 1.565e-01 -0.651 0.51571
## TEAM_PITCHING_HR 2.581e-01 5.927e-01 0.435 0.66375
## TEAM_PITCHING_BB 2.563e-01 2.044e-01 1.254 0.21158
## TEAM_PITCHING_SO -2.557e-02 1.471e-01 -0.174 0.86217
## TEAM_FIELDING_E -9.986e-03 2.330e-03 -4.286 3.01e-05 ***
## TEAM_FIELDING_DP -6.062e-03 2.087e-03 -2.905 0.00415 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4762 on 174 degrees of freedom
## (2085 observations deleted due to missingness)
## Multiple R-squared: 0.5565, Adjusted R-squared: 0.5157
## F-statistic: 13.64 on 16 and 174 DF, p-value: < 2.2e-16
par(mfrow = c(2, 1))
# Histogram and normal Q-Q plot of the sqrt-model residuals
hist(l2model$residuals)
qqnorm(l2model$residuals)
qqline(l2model$residuals)
#plot(l2model)
library(moments)
skewness(l2model$residuals)
## [1] 0.05744128
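To put the three candidate transformations side by side, the following hedged sketch (not part of the original output) collects adjusted R-squared and residual skewness for each fit.
# Hedged comparison of the raw, log, and sqrt response models fit above
data.frame(
  response   = c("TARGET_WINS", "log(TARGET_WINS)", "sqrt(TARGET_WINS)"),
  adj_r2     = c(summary(lmod)$adj.r.squared,
                 summary(lmodel)$adj.r.squared,
                 summary(l2model)$adj.r.squared),
  resid_skew = c(skewness(lmod$residuals),
                 skewness(lmodel$residuals),
                 skewness(l2model$residuals))
)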
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Refit the baseline and sqrt-transformed models for the Q-Q comparison below
model0 <- lm(TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
               TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
               TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_BATTING_HBP +
               TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
               TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
             data = df.training)
model <- lm(sqrt(TARGET_WINS) ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
              TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
              TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_BATTING_HBP +
              TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
              TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
            data = df.training)
y <- df.training$TARGET_WINS
# Collapse all predictors into a single summed regressor for the Box-Cox search
x <- with(df.training,
          TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR +
            TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
            TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
            TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP)
#find optimal lambda for Box-Cox transformation
bc_model <- boxcox(y ~ x)
(lambda <- bc_model$x[which.max(bc_model$y)])
## [1] 1.313131
#fit new linear regression model using the Box-Cox transformation
new_model <- lm(((y^lambda-1)/lambda) ~ x)
summary(new_model)
##
## Call:
## lm(formula = ((y^lambda - 1)/lambda) ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -116.242 -37.523 4.899 33.215 128.707
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -91.339817 68.412101 -1.335 0.183
## x 0.046092 0.009384 4.912 1.95e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.01 on 189 degrees of freedom
## (2085 observations deleted due to missingness)
## Multiple R-squared: 0.1132, Adjusted R-squared: 0.1085
## F-statistic: 24.13 on 1 and 189 DF, p-value: 1.948e-06
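The Box-Cox search above collapses all predictors into a single summed regressor. Since boxcox() also accepts a fitted lm object, a hedged alternative (not part of the original analysis) is to estimate lambda directly from the full multivariate model model0 defined earlier.
# Hedged alternative: estimate the Box-Cox lambda from the full model
# rather than the summed regressor; boxcox() accepts an lm object directly
bc_full <- boxcox(model0, plotit = FALSE)
(lambda_full <- bc_full$x[which.max(bc_full$y)])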
################################################
#define plotting area
op <- par(pty = "s", mfrow = c(1, 3))
#Q-Q plot for original model
qqnorm(model0$residuals)
qqline(model0$residuals)
#Q-Q plot for sqrt model
qqnorm(model$residuals)
qqline(model$residuals)
#Q-Q plot for Box-Cox transformed model
qqnorm(new_model$residuals)
qqline(new_model$residuals)
#display all three Q-Q plots side by side
par(op)
library(MASS)
y1 <- df.training$TARGET_WINS
x1 <- df.training$TEAM_BATTING_H + df.training$TEAM_BATTING_2B
#find optimal lambda for Box-Cox transformation
# bc_model1 <- boxcox(y1 ~ x1)
#bc_model1 <- boxcox( df.training$TARGET_WINS ~ (df.training$TEAM_BATTING_H + df.training$TEAM_BATTING_2B) )
#(lambda1 <- bc_model1$x[which.max(bc_model1$y)])
#fit new linear regression model using the Box-Cox transformation
#new_model1 <- lm(((y1^lambda1 - 1)/lambda1) ~ x1)
#summary(new_model1)