library(rvest)
## Warning: package 'rvest' was built under R version 3.6.3
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.6.3
library(ggplot2)
# Read the moneyball training CSV straight from GitHub and print per-column
# summary statistics.
x <- "https://raw.githubusercontent.com/ChristopherBloome/621/main/moneyball-training-data.csv"
TrainingData <- read.csv(file = x)
summary(object = TrainingData)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## Min. : 1.0 Min. : 0.00 Min. : 891 Min. : 69.0
## 1st Qu.: 630.8 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0
## Median :1270.5 Median : 82.00 Median :1454 Median :238.0
## Mean :1268.5 Mean : 80.79 Mean :1469 Mean :241.2
## 3rd Qu.:1915.5 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0
## Max. :2535.0 Max. :146.00 Max. :2554 Max. :458.0
##
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0
## Median : 47.00 Median :102.00 Median :512.0 Median : 750.0
## Mean : 55.25 Mean : 99.61 Mean :501.6 Mean : 735.6
## 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0
## Max. :223.00 Max. :264.00 Max. :878.0 Max. :1399.0
## NA's :102
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## Min. : 0.0 Min. : 0.0 Min. :29.00 Min. : 1137
## 1st Qu.: 66.0 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419
## Median :101.0 Median : 49.0 Median :58.00 Median : 1518
## Mean :124.8 Mean : 52.8 Mean :59.36 Mean : 1779
## 3rd Qu.:156.0 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682
## Max. :697.0 Max. :201.0 Max. :95.00 Max. :30132
## NA's :131 NA's :772 NA's :2085
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 65.0
## 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0
## Median :107.0 Median : 536.5 Median : 813.5 Median : 159.0
## Mean :105.7 Mean : 553.0 Mean : 817.7 Mean : 246.5
## 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2
## Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0
## NA's :102
## TEAM_FIELDING_DP
## Min. : 52.0
## 1st Qu.:131.0
## Median :149.0
## Mean :146.4
## 3rd Qu.:164.0
## Max. :228.0
## NA's :286
# Read the moneyball evaluation CSV from the same repository (x is reused for
# the URL) and print per-column summary statistics.
x <- "https://raw.githubusercontent.com/ChristopherBloome/621/main/moneyball-evaluation-data.csv"
TestingData <- read.csv(file = x)
summary(object = TestingData)
## INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## Min. : 9 Min. : 819 Min. : 44.0 Min. : 14.00
## 1st Qu.: 708 1st Qu.:1387 1st Qu.:210.0 1st Qu.: 35.00
## Median :1249 Median :1455 Median :239.0 Median : 52.00
## Mean :1264 Mean :1469 Mean :241.3 Mean : 55.91
## 3rd Qu.:1832 3rd Qu.:1548 3rd Qu.:278.5 3rd Qu.: 72.00
## Max. :2525 Max. :2170 Max. :376.0 Max. :155.00
##
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## Min. : 0.00 Min. : 15.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 44.50 1st Qu.:436.5 1st Qu.: 545.0 1st Qu.: 59.0
## Median :101.00 Median :509.0 Median : 686.0 Median : 92.0
## Mean : 95.63 Mean :499.0 Mean : 709.3 Mean :123.7
## 3rd Qu.:135.50 3rd Qu.:565.5 3rd Qu.: 912.0 3rd Qu.:151.8
## Max. :242.00 Max. :792.0 Max. :1268.0 Max. :580.0
## NA's :18 NA's :13
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## Min. : 0.00 Min. :42.00 Min. : 1155 Min. : 0.0
## 1st Qu.: 38.00 1st Qu.:53.50 1st Qu.: 1426 1st Qu.: 52.0
## Median : 49.50 Median :62.00 Median : 1515 Median :104.0
## Mean : 52.32 Mean :62.37 Mean : 1813 Mean :102.1
## 3rd Qu.: 63.00 3rd Qu.:67.50 3rd Qu.: 1681 3rd Qu.:142.5
## Max. :154.00 Max. :96.00 Max. :22768 Max. :336.0
## NA's :87 NA's :240
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## Min. : 136.0 Min. : 0.0 Min. : 73.0 Min. : 69.0
## 1st Qu.: 471.0 1st Qu.: 613.0 1st Qu.: 131.0 1st Qu.:131.0
## Median : 526.0 Median : 745.0 Median : 163.0 Median :148.0
## Mean : 552.4 Mean : 799.7 Mean : 249.7 Mean :146.1
## 3rd Qu.: 606.5 3rd Qu.: 938.0 3rd Qu.: 252.0 3rd Qu.:164.0
## Max. :2008.0 Max. :9963.0 Max. :1568.0 Max. :204.0
## NA's :18 NA's :31
# Working copy of the training data without INDEX and without the six
# statistics for which the training summary reports NA's.
TrainingData1 <- TrainingData[
  !(names(TrainingData) %in% c(
    "TEAM_BATTING_HBP", "TEAM_BASERUN_CS", "TEAM_FIELDING_DP",
    "TEAM_BATTING_SO", "TEAM_PITCHING_SO", "TEAM_BASERUN_SB", "INDEX"
  ))
]
summary(TrainingData1)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## Min. : 0.00 Min. : 891 Min. : 69.0 Min. : 0.00
## 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0 1st Qu.: 34.00
## Median : 82.00 Median :1454 Median :238.0 Median : 47.00
## Mean : 80.79 Mean :1469 Mean :241.2 Mean : 55.25
## 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0 3rd Qu.: 72.00
## Max. :146.00 Max. :2554 Max. :458.0 Max. :223.00
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_PITCHING_H TEAM_PITCHING_HR
## Min. : 0.00 Min. : 0.0 Min. : 1137 Min. : 0.0
## 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 1419 1st Qu.: 50.0
## Median :102.00 Median :512.0 Median : 1518 Median :107.0
## Mean : 99.61 Mean :501.6 Mean : 1779 Mean :105.7
## 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 1682 3rd Qu.:150.0
## Max. :264.00 Max. :878.0 Max. :30132 Max. :343.0
## TEAM_PITCHING_BB TEAM_FIELDING_E
## Min. : 0.0 Min. : 65.0
## 1st Qu.: 476.0 1st Qu.: 127.0
## Median : 536.5 Median : 159.0
## Mean : 553.0 Mean : 246.5
## 3rd Qu.: 611.0 3rd Qu.: 249.2
## Max. :3645.0 Max. :1898.0
# Engineered features, added to TrainingData1 in this column order:
#   SLG2     = H + 2B + 2 * 3B + 3 * HR (a weighted hit total)
#   SLG1     = SLG2 / H
#   HitDiff  = batting hits / pitching hits
#   WalkDiff = batting walks / pitching walks
#   HRDiff   = batting home runs / pitching home runs
# SLG1 is assigned first so the column order stays SLG1, SLG2.
TrainingData1$SLG1 <- (TrainingData1$TEAM_BATTING_H +
  TrainingData1$TEAM_BATTING_2B +
  2 * TrainingData1$TEAM_BATTING_3B +
  3 * TrainingData1$TEAM_BATTING_HR) / TrainingData1$TEAM_BATTING_H
TrainingData1$SLG2 <- TrainingData1$TEAM_BATTING_H +
  TrainingData1$TEAM_BATTING_2B +
  2 * TrainingData1$TEAM_BATTING_3B +
  3 * TrainingData1$TEAM_BATTING_HR
TrainingData1$HitDiff <- TrainingData1$TEAM_BATTING_H /
  TrainingData1$TEAM_PITCHING_H
TrainingData1$WalkDiff <- TrainingData1$TEAM_BATTING_BB /
  TrainingData1$TEAM_PITCHING_BB
TrainingData1$HRDiff <- TrainingData1$TEAM_BATTING_HR /
  TrainingData1$TEAM_PITCHING_HR

# TrainingData2 is the modelling copy. TrainingData1 keeps the raw ratios
# (including NaN from 0 / 0) for the exploratory plots.
# Every undefined ratio is set to 1, i.e. "batting equals pitching".
# !is.finite() is used instead of is.na() so that Inf from x / 0 is caught as
# well as NaN from 0 / 0; lm() stops with an error if a predictor holds Inf.
TrainingData2 <- TrainingData1
TrainingData2[c("HitDiff", "WalkDiff", "HRDiff")] <- lapply(
  TrainingData1[c("HitDiff", "WalkDiff", "HRDiff")],
  function(ratio) replace(ratio, !is.finite(ratio), 1)
)
names(TrainingData1)
## [1] "TARGET_WINS" "TEAM_BATTING_H" "TEAM_BATTING_2B" "TEAM_BATTING_3B"
## [5] "TEAM_BATTING_HR" "TEAM_BATTING_BB" "TEAM_PITCHING_H" "TEAM_PITCHING_HR"
## [9] "TEAM_PITCHING_BB" "TEAM_FIELDING_E" "SLG1" "SLG2"
## [13] "HitDiff" "WalkDiff" "HRDiff"
# Full linear model on the imputed data (TrainingData2).
# SLG2 is left out of the formula on purpose: it is an exact linear
# combination of TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_3B and
# TEAM_BATTING_HR, so lm() can only report it as NA ("not defined because of
# singularities"). Removing the aliased term leaves every other estimate and
# all fit statistics unchanged.
LM1 <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_PITCHING_H + TEAM_PITCHING_HR +
    TEAM_PITCHING_BB + TEAM_FIELDING_E + SLG1 + HitDiff + WalkDiff + HRDiff,
  data = TrainingData2
)
summary(LM1)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_PITCHING_H +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_FIELDING_E + SLG1 +
## HitDiff + WalkDiff + HRDiff, data = TrainingData2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.884 -8.652 0.330 8.763 59.970
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.733e+01 3.956e+01 -0.691 0.489790
## TEAM_BATTING_H 6.050e-02 7.600e-03 7.960 2.69e-15 ***
## TEAM_BATTING_2B -6.494e-02 1.935e-02 -3.356 0.000804 ***
## TEAM_BATTING_3B 5.082e-02 3.898e-02 1.304 0.192503
## TEAM_BATTING_HR 9.703e-02 6.837e-02 1.419 0.155974
## TEAM_BATTING_BB 1.079e-02 5.504e-03 1.961 0.050045 .
## TEAM_PITCHING_H -1.447e-03 5.239e-04 -2.762 0.005798 **
## TEAM_PITCHING_HR -1.696e-01 3.180e-02 -5.333 1.06e-07 ***
## TEAM_PITCHING_BB 6.138e-03 3.553e-03 1.728 0.084172 .
## TEAM_FIELDING_E -3.367e-02 3.302e-03 -10.196 < 2e-16 ***
## SLG1 5.894e+01 2.788e+01 2.114 0.034628 *
## HitDiff -6.132e+01 1.933e+01 -3.172 0.001532 **
## WalkDiff 2.928e+01 1.722e+01 1.700 0.089235 .
## HRDiff -1.452e+01 6.897e+00 -2.106 0.035352 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.3 on 2262 degrees of freedom
## Multiple R-squared: 0.2912, Adjusted R-squared: 0.2872
## F-statistic: 71.5 on 13 and 2262 DF, p-value: < 2.2e-16
Non-Transformed seems most predictive
# TEAM_PITCHING_H, untransformed: histogram, scatter against wins (automatic
# smoother plus straight-line fit), then a one-predictor regression.
ggplot(TrainingData1, aes(x = TEAM_PITCHING_H)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = TEAM_PITCHING_H, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_PITCHING_H_LM <- lm(TARGET_WINS ~ TEAM_PITCHING_H, data = TrainingData1)
print(paste("Non Transformed", summary(TEAM_PITCHING_H_LM)$r.squared))
## [1] "Non Transformed 0.012086155892813"
# Residuals against fitted values for the model above.
TEAM_PITCHING_H_LM_DF <- data.frame(
  Fitted = TEAM_PITCHING_H_LM$fitted.values,
  Resid = TEAM_PITCHING_H_LM$residuals
)
ggplot(TEAM_PITCHING_H_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
# TEAM_PITCHING_H on a log10 scale: same three views and regression.
ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_H))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_H), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_PITCHING_H_LM2 <- lm(
  TARGET_WINS ~ log10(TEAM_PITCHING_H),
  data = TrainingData1
)
print(paste("Log10", summary(TEAM_PITCHING_H_LM2)$r.squared))
## [1] "Log10 0.000117579870753259"
# Residuals against fitted values for the log10 model.
TEAM_PITCHING_H_LM2_DF <- data.frame(
  Fitted = TEAM_PITCHING_H_LM2$fitted.values,
  Resid = TEAM_PITCHING_H_LM2$residuals
)
ggplot(TEAM_PITCHING_H_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
#Not convinced Log is worth it.
# Cap TEAM_PITCHING_H at 3000 in the modelling copy (the training summary
# shows a maximum of 30132 against a 3rd quartile of 1682). pmin() clamps the
# column directly, so no scratch data frame or re-naming step is needed.
TrainingData2$TEAM_PITCHING_H <- pmin(TrainingData1$TEAM_PITCHING_H, 3000)
#CAPPED
# Same three views and one-predictor regression, now on the capped column.
ggplot(TrainingData2, aes(x = TEAM_PITCHING_H)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData2, aes(x = TEAM_PITCHING_H, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_PITCHING_H_LM3 <- lm(TARGET_WINS ~ TEAM_PITCHING_H, data = TrainingData2)
print(paste("CAPPED", summary(TEAM_PITCHING_H_LM3)$r.squared))
## [1] "CAPPED 0.00477054431469875"
# Residuals against fitted values for the capped model.
TEAM_PITCHING_H_LM3_DF <- data.frame(
  Fitted = TEAM_PITCHING_H_LM3$fitted.values,
  Resid = TEAM_PITCHING_H_LM3$residuals
)
ggplot(TEAM_PITCHING_H_LM3_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
#Capping changes slope of regression line considerably.
Non-transformed
# SLG1, untransformed: histogram, scatter against wins (automatic smoother
# plus straight-line fit), then a one-predictor regression.
ggplot(TrainingData1, aes(x = SLG1)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = SLG1, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
SLG1_LM <- lm(TARGET_WINS ~ SLG1, data = TrainingData1)
print(paste("Non Transformed", summary(SLG1_LM)$r.squared))
## [1] "Non Transformed 0.0362979442242567"
# Residuals against fitted values for the model above.
SLG1_LM_DF <- data.frame(
  Fitted = SLG1_LM$fitted.values,
  Resid = SLG1_LM$residuals
)
ggplot(SLG1_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
Non-Transformed
# SLG2, untransformed: histogram, scatter against wins (automatic smoother
# plus straight-line fit), then a one-predictor regression.
ggplot(TrainingData1, aes(x = SLG2)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = SLG2, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
SLG2_LM <- lm(TARGET_WINS ~ SLG2, data = TrainingData1)
print(paste("Non Transformed", summary(SLG2_LM)$r.squared))
## [1] "Non Transformed 0.178739044216743"
# Residuals against fitted values for the model above.
SLG2_LM_DF <- data.frame(
  Fitted = SLG2_LM$fitted.values,
  Resid = SLG2_LM$residuals
)
ggplot(SLG2_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
Non-Transformed
# TEAM_BATTING_H, untransformed: histogram, scatter against wins (automatic
# smoother plus straight-line fit), then a one-predictor regression.
ggplot(TrainingData1, aes(x = TEAM_BATTING_H)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = TEAM_BATTING_H, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_BATTING_H_LM <- lm(TARGET_WINS ~ TEAM_BATTING_H, data = TrainingData1)
print(paste("Non Transformed", summary(TEAM_BATTING_H_LM)$r.squared))
## [1] "Non Transformed 0.151140185440693"
# Residuals against fitted values for the model above.
TEAM_BATTING_H_LM_DF <- data.frame(
  Fitted = TEAM_BATTING_H_LM$fitted.values,
  Resid = TEAM_BATTING_H_LM$residuals
)
ggplot(TEAM_BATTING_H_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
Non-transformed
# TEAM_BATTING_2B, untransformed: histogram, scatter against wins (automatic
# smoother plus straight-line fit), then a one-predictor regression.
ggplot(TrainingData1, aes(x = TEAM_BATTING_2B)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = TEAM_BATTING_2B, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_BATTING_2B_LM <- lm(TARGET_WINS ~ TEAM_BATTING_2B, data = TrainingData1)
print(paste("Non Transformed", summary(TEAM_BATTING_2B_LM)$r.squared))
## [1] "Non Transformed 0.0835809176046633"
# Residuals against fitted values for the model above.
TEAM_BATTING_2B_LM_DF <- data.frame(
  Fitted = TEAM_BATTING_2B_LM$fitted.values,
  Resid = TEAM_BATTING_2B_LM$residuals
)
ggplot(TEAM_BATTING_2B_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
Non-Transformed
# TEAM_BATTING_3B, untransformed: histogram, scatter against wins (automatic
# smoother plus straight-line fit), then a one-predictor regression.
ggplot(TrainingData1, aes(x = TEAM_BATTING_3B)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = TEAM_BATTING_3B, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_BATTING_3B_LM <- lm(TARGET_WINS ~ TEAM_BATTING_3B, data = TrainingData1)
print(paste("Non Transformed", summary(TEAM_BATTING_3B_LM)$r.squared))
## [1] "Non Transformed 0.0203371588023067"
# Residuals against fitted values for the model above.
TEAM_BATTING_3B_LM_DF <- data.frame(
  Fitted = TEAM_BATTING_3B_LM$fitted.values,
  Resid = TEAM_BATTING_3B_LM$residuals
)
ggplot(TEAM_BATTING_3B_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
# TEAM_BATTING_3B as log10(x + 1); the +1 keeps zero counts finite.
ggplot(TrainingData1, aes(x = log10(TEAM_BATTING_3B + 1))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = log10(TEAM_BATTING_3B + 1), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_BATTING_3B_LM2 <- lm(
  TARGET_WINS ~ log10(TEAM_BATTING_3B + 1),
  data = TrainingData1
)
print(paste("Log10", summary(TEAM_BATTING_3B_LM2)$r.squared))
## [1] "Log10 0.0154687934749513"
# Residuals against fitted values for the log10 model.
TEAM_BATTING_3B_LM2_DF <- data.frame(
  Fitted = TEAM_BATTING_3B_LM2$fitted.values,
  Resid = TEAM_BATTING_3B_LM2$residuals
)
ggplot(TEAM_BATTING_3B_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
Log10
# TEAM_BATTING_HR, untransformed: histogram, scatter against wins (automatic
# smoother plus straight-line fit), then a one-predictor regression.
ggplot(TrainingData1, aes(x = TEAM_BATTING_HR)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = TEAM_BATTING_HR, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_BATTING_HR_LM <- lm(TARGET_WINS ~ TEAM_BATTING_HR, data = TrainingData1)
print(paste("Non Transformed", summary(TEAM_BATTING_HR_LM)$r.squared))
## [1] "Non Transformed 0.0310299500358932"
# Residuals against fitted values for the model above.
TEAM_BATTING_HR_LM_DF <- data.frame(
  Fitted = TEAM_BATTING_HR_LM$fitted.values,
  Resid = TEAM_BATTING_HR_LM$residuals
)
ggplot(TEAM_BATTING_HR_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
# TEAM_BATTING_HR as log10(x + 1); the +1 keeps zero counts finite.
ggplot(TrainingData1, aes(x = log10(TEAM_BATTING_HR + 1))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = log10(TEAM_BATTING_HR + 1), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_BATTING_HR_LM2 <- lm(
  TARGET_WINS ~ log10(TEAM_BATTING_HR + 1),
  data = TrainingData1
)
print(paste("Log10", summary(TEAM_BATTING_HR_LM2)$r.squared))
## [1] "Log10 0.0418750229513914"
# Residuals against fitted values for the log10 model.
TEAM_BATTING_HR_LM2_DF <- data.frame(
  Fitted = TEAM_BATTING_HR_LM2$fitted.values,
  Resid = TEAM_BATTING_HR_LM2$residuals
)
ggplot(TEAM_BATTING_HR_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
Non-Transformed
# TEAM_BATTING_BB, untransformed: histogram, scatter against wins (automatic
# smoother plus straight-line fit), then a one-predictor regression.
ggplot(TrainingData1, aes(x = TEAM_BATTING_BB)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = TEAM_BATTING_BB, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_BATTING_BB_LM <- lm(TARGET_WINS ~ TEAM_BATTING_BB, data = TrainingData1)
print(paste("Non Transformed", summary(TEAM_BATTING_BB_LM)$r.squared))
## [1] "Non Transformed 0.05408409024369"
# Residuals against fitted values for the model above.
TEAM_BATTING_BB_LM_DF <- data.frame(
  Fitted = TEAM_BATTING_BB_LM$fitted.values,
  Resid = TEAM_BATTING_BB_LM$residuals
)
ggplot(TEAM_BATTING_BB_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
Non-Transformed
# TEAM_BATTING_H, untransformed: histogram, scatter against wins (automatic
# smoother plus straight-line fit), then a one-predictor regression.
# NOTE(review): TEAM_BATTING_H_LM and TEAM_BATTING_H_LM_DF are already built
# from this same formula earlier in the file; this pass rebuilds them with the
# same result. Confirm whether another variable was intended here.
ggplot(TrainingData1, aes(x = TEAM_BATTING_H)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = TEAM_BATTING_H, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_BATTING_H_LM <- lm(TARGET_WINS ~ TEAM_BATTING_H, data = TrainingData1)
print(paste("Non Transformed", summary(TEAM_BATTING_H_LM)$r.squared))
## [1] "Non Transformed 0.151140185440693"
# Residuals against fitted values for the model above.
TEAM_BATTING_H_LM_DF <- data.frame(
  Fitted = TEAM_BATTING_H_LM$fitted.values,
  Resid = TEAM_BATTING_H_LM$residuals
)
ggplot(TEAM_BATTING_H_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
Log10
# TEAM_PITCHING_HR, untransformed: histogram, scatter against wins (automatic
# smoother plus straight-line fit), then a one-predictor regression.
ggplot(TrainingData1, aes(x = TEAM_PITCHING_HR)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = TEAM_PITCHING_HR, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_PITCHING_HR_LM <- lm(TARGET_WINS ~ TEAM_PITCHING_HR, data = TrainingData1)
print(paste("Non Transformed", summary(TEAM_PITCHING_HR_LM)$r.squared))
## [1] "Non Transformed 0.0357261919307869"
# Residuals against fitted values for the model above.
TEAM_PITCHING_HR_LM_DF <- data.frame(
  Fitted = TEAM_PITCHING_HR_LM$fitted.values,
  Resid = TEAM_PITCHING_HR_LM$residuals
)
ggplot(TEAM_PITCHING_HR_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
# TEAM_PITCHING_HR as log10(x + 1); the +1 keeps zero counts finite.
ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_HR + 1))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_HR + 1), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_PITCHING_HR_LM2 <- lm(
  TARGET_WINS ~ log10(TEAM_PITCHING_HR + 1),
  data = TrainingData1
)
print(paste("Log10", summary(TEAM_PITCHING_HR_LM2)$r.squared))
## [1] "Log10 0.0473982795376588"
# Residuals against fitted values for the log10 model.
TEAM_PITCHING_HR_LM2_DF <- data.frame(
  Fitted = TEAM_PITCHING_HR_LM2$fitted.values,
  Resid = TEAM_PITCHING_HR_LM2$residuals
)
ggplot(TEAM_PITCHING_HR_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
Capped at 1000, and Log10
# TEAM_PITCHING_BB, untransformed: histogram, scatter against wins (automatic
# smoother plus straight-line fit), then a one-predictor regression.
ggplot(TrainingData1, aes(x = TEAM_PITCHING_BB)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = TEAM_PITCHING_BB, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_PITCHING_BB_LM <- lm(TARGET_WINS ~ TEAM_PITCHING_BB, data = TrainingData1)
print(paste("Non Transformed", summary(TEAM_PITCHING_BB_LM)$r.squared))
## [1] "Non Transformed 0.015419315390593"
# Residuals against fitted values for the model above.
TEAM_PITCHING_BB_LM_DF <- data.frame(
  Fitted = TEAM_PITCHING_BB_LM$fitted.values,
  Resid = TEAM_PITCHING_BB_LM$residuals
)
ggplot(TEAM_PITCHING_BB_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
# TEAM_PITCHING_BB as log10(x + 1); the +1 keeps zero counts finite.
ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_BB + 1))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_BB + 1), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_PITCHING_BB_LM2 <- lm(
  TARGET_WINS ~ log10(TEAM_PITCHING_BB + 1),
  data = TrainingData1
)
print(paste("Log10", summary(TEAM_PITCHING_BB_LM2)$r.squared))
## [1] "Log10 0.0365613920428514"
# Residuals against fitted values for the log10 model.
TEAM_PITCHING_BB_LM2_DF <- data.frame(
  Fitted = TEAM_PITCHING_BB_LM2$fitted.values,
  Resid = TEAM_PITCHING_BB_LM2$residuals
)
ggplot(TEAM_PITCHING_BB_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
# TEAM_PITCHING_BB as log10(x + 100), i.e. a larger offset before the log.
# The model and residual frame are stored under the names TEAM_PITCHING_BB_LM2
# and TEAM_PITCHING_BB_LM2_DF, overwriting any earlier objects so named.
ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_BB + 100))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(
  TrainingData1,
  aes(x = log10(TEAM_PITCHING_BB + 100), y = TARGET_WINS)
) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_PITCHING_BB_LM2 <- lm(
  TARGET_WINS ~ log10(TEAM_PITCHING_BB + 100),
  data = TrainingData1
)
print(paste("Log10 + 100", summary(TEAM_PITCHING_BB_LM2)$r.squared))
## [1] "Log10 + 100 0.0308748132493466"
# Residuals against fitted values for the log10(x + 100) model.
TEAM_PITCHING_BB_LM2_DF <- data.frame(
  Fitted = TEAM_PITCHING_BB_LM2$fitted.values,
  Resid = TEAM_PITCHING_BB_LM2$residuals
)
ggplot(TEAM_PITCHING_BB_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
#CAPPED
# Cap TEAM_PITCHING_BB at 1000 in the modelling copy (the training summary
# shows a maximum of 3645 against a 3rd quartile of 611). pmin() clamps the
# column directly, so no scratch data frame or re-naming step is needed.
TrainingData2$TEAM_PITCHING_BB <- pmin(TrainingData1$TEAM_PITCHING_BB, 1000)
# Histogram, scatter against wins and one-predictor regression on the capped
# column.
ggplot(TrainingData2, aes(x = TEAM_PITCHING_BB)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData2, aes(x = TEAM_PITCHING_BB, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_PITCHING_BB_LM3 <- lm(TARGET_WINS ~ TEAM_PITCHING_BB, data = TrainingData2)
print(paste("CAPPED", summary(TEAM_PITCHING_BB_LM3)$r.squared))
## [1] "CAPPED 0.0329242395344625"
# Residuals against fitted values for the capped model.
TEAM_PITCHING_BB_LM3_DF <- data.frame(
  Fitted = TEAM_PITCHING_BB_LM3$fitted.values,
  Resid = TEAM_PITCHING_BB_LM3$residuals
)
ggplot(TEAM_PITCHING_BB_LM3_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
# TEAM_PITCHING_BB from TrainingData2 (the modelling copy) as log10(x + 1).
# The model and residual frame are stored under the names TEAM_PITCHING_BB_LM3
# and TEAM_PITCHING_BB_LM3_DF, overwriting any earlier objects so named.
ggplot(TrainingData2, aes(x = log10(TEAM_PITCHING_BB + 1))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData2, aes(x = log10(TEAM_PITCHING_BB + 1), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_PITCHING_BB_LM3 <- lm(
  TARGET_WINS ~ log10(TEAM_PITCHING_BB + 1),
  data = TrainingData2
)
print(paste("CAPPED and Log", summary(TEAM_PITCHING_BB_LM3)$r.squared))
## [1] "CAPPED and Log 0.0415885398806444"
# Residuals against fitted values for the capped log10 model.
TEAM_PITCHING_BB_LM3_DF <- data.frame(
  Fitted = TEAM_PITCHING_BB_LM3$fitted.values,
  Resid = TEAM_PITCHING_BB_LM3$residuals
)
ggplot(TEAM_PITCHING_BB_LM3_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
Non-transformed
# TEAM_FIELDING_E, untransformed: histogram, scatter against wins (automatic
# smoother plus straight-line fit), then a one-predictor regression.
ggplot(TrainingData1, aes(x = TEAM_FIELDING_E)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = TEAM_FIELDING_E, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_FIELDING_E_LM <- lm(TARGET_WINS ~ TEAM_FIELDING_E, data = TrainingData1)
print(paste("Non Transformed", summary(TEAM_FIELDING_E_LM)$r.squared))
## [1] "Non Transformed 0.0311468701765676"
# Residuals against fitted values for the model above.
TEAM_FIELDING_E_LM_DF <- data.frame(
  Fitted = TEAM_FIELDING_E_LM$fitted.values,
  Resid = TEAM_FIELDING_E_LM$residuals
)
ggplot(TEAM_FIELDING_E_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
# TEAM_FIELDING_E as log10(x + 1): same three views and regression.
ggplot(TrainingData1, aes(x = log10(TEAM_FIELDING_E + 1))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = log10(TEAM_FIELDING_E + 1), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_FIELDING_E_LM2 <- lm(
  TARGET_WINS ~ log10(TEAM_FIELDING_E + 1),
  data = TrainingData1
)
print(paste("Log10", summary(TEAM_FIELDING_E_LM2)$r.squared))
## [1] "Log10 0.0227521650191439"
# Residuals against fitted values for the log10 model.
TEAM_FIELDING_E_LM2_DF <- data.frame(
  Fitted = TEAM_FIELDING_E_LM2$fitted.values,
  Resid = TEAM_FIELDING_E_LM2$residuals
)
ggplot(TEAM_FIELDING_E_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
# TEAM_FIELDING_E as sqrt(x + 1): same three views and regression.
# The model and residual frame are stored under the names TEAM_FIELDING_E_LM2
# and TEAM_FIELDING_E_LM2_DF, overwriting any earlier objects so named.
ggplot(TrainingData1, aes(x = sqrt(TEAM_FIELDING_E + 1))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = sqrt(TEAM_FIELDING_E + 1), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
TEAM_FIELDING_E_LM2 <- lm(
  TARGET_WINS ~ sqrt(TEAM_FIELDING_E + 1),
  data = TrainingData1
)
print(paste("sqrt", summary(TEAM_FIELDING_E_LM2)$r.squared))
## [1] "sqrt 0.025828939700874"
# Residuals against fitted values for the square-root model.
TEAM_FIELDING_E_LM2_DF <- data.frame(
  Fitted = TEAM_FIELDING_E_LM2$fitted.values,
  Resid = TEAM_FIELDING_E_LM2$residuals
)
ggplot(TEAM_FIELDING_E_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
SQRT
# HitDiff, untransformed: histogram, scatter against wins (automatic smoother
# plus straight-line fit), then a one-predictor regression.
ggplot(TrainingData1, aes(x = HitDiff)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = HitDiff, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
HitDiff_LM <- lm(TARGET_WINS ~ HitDiff, data = TrainingData1)
print(paste("Non Transformed", summary(HitDiff_LM)$r.squared))
## [1] "Non Transformed 0.00917764624341174"
# Residuals against fitted values for the model above.
HitDiff_LM_DF <- data.frame(
  Fitted = HitDiff_LM$fitted.values,
  Resid = HitDiff_LM$residuals
)
ggplot(HitDiff_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
# HitDiff on a square-root scale: same three views and regression.
# The model and residual frame are stored under the names HitDiff_LM and
# HitDiff_LM_DF, overwriting any earlier objects so named.
ggplot(TrainingData1, aes(x = sqrt(HitDiff))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(TrainingData1, aes(x = sqrt(HitDiff), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
HitDiff_LM <- lm(TARGET_WINS ~ sqrt(HitDiff), data = TrainingData1)
print(paste("SQRT", summary(HitDiff_LM)$r.squared))
## [1] "SQRT 0.0140877071658045"
# Residuals against fitted values for the square-root model.
HitDiff_LM_DF <- data.frame(
  Fitted = HitDiff_LM$fitted.values,
  Resid = HitDiff_LM$residuals
)
ggplot(HitDiff_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
SQRT
# WalkDiff, untransformed: histogram, scatter against wins (automatic smoother
# plus straight-line fit), then a one-predictor regression. TrainingData1
# still holds the raw ratio, so the plots drop one non-finite row.
ggplot(TrainingData1, aes(x = WalkDiff)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
ggplot(TrainingData1, aes(x = WalkDiff, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
WalkDiff_LM <- lm(TARGET_WINS ~ WalkDiff, data = TrainingData1)
print(paste("Non Transformed", summary(WalkDiff_LM)$r.squared))
## [1] "Non Transformed 0.00699903660331598"
# Residuals against fitted values for the model above.
WalkDiff_LM_DF <- data.frame(
  Fitted = WalkDiff_LM$fitted.values,
  Resid = WalkDiff_LM$residuals
)
ggplot(WalkDiff_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
# WalkDiff on a square-root scale: same three views and regression.
# The model and residual frame are stored under the names WalkDiff_LM and
# WalkDiff_LM_DF, overwriting any earlier objects so named.
ggplot(TrainingData1, aes(x = sqrt(WalkDiff))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
ggplot(TrainingData1, aes(x = sqrt(WalkDiff), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
WalkDiff_LM <- lm(TARGET_WINS ~ sqrt(WalkDiff), data = TrainingData1)
print(paste("SQRT", summary(WalkDiff_LM)$r.squared))
## [1] "SQRT 0.0105888883346195"
# Residuals against fitted values for the square-root model.
WalkDiff_LM_DF <- data.frame(
  Fitted = WalkDiff_LM$fitted.values,
  Resid = WalkDiff_LM$residuals
)
ggplot(WalkDiff_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
All bad
#Not Transformed
# HRDiff, untransformed: histogram of the raw values.
ggplot(TrainingData1, aes(x = HRDiff)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 15 rows containing non-finite values (stat_bin).
# HRDiff against wins: jittered points with two smoothers overlaid
# (one with no method given, one forced to a straight line).
ggplot(TrainingData1, aes(x = HRDiff, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
# Single-predictor regression of wins on HRDiff; report its R-squared.
HRDiff_LM <- lm(TARGET_WINS ~ HRDiff, data = TrainingData1)
print(paste("Non Transformed", summary(HRDiff_LM)$r.squared))
## [1] "Non Transformed 0.000180659522696911"
# Residuals-vs-fitted plot for the regression above.
HRDiff_LM_DF <- data.frame(
  Fitted = HRDiff_LM$fitted.values,
  Resid = HRDiff_LM$residuals
)
ggplot(HRDiff_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
# Square-root transform of HRDiff: histogram of the transformed values.
ggplot(TrainingData1, aes(x = sqrt(HRDiff))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 15 rows containing non-finite values (stat_bin).
# Transformed predictor against wins, with a default and a linear smoother.
ggplot(TrainingData1, aes(x = sqrt(HRDiff), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
# Regression of wins on sqrt(HRDiff). The object names (with their triple "f")
# are left as they were in case code elsewhere in the file refers to them.
HRDifff_LM <- lm(TARGET_WINS ~ sqrt(HRDiff), data = TrainingData1)
# Label this result "SQRT" (as the WalkDiff section does) so it is not
# confused with the untransformed HRDiff fit printed earlier.
print(paste("SQRT", summary(HRDifff_LM)$r.squared))
## [1] "SQRT 0.000371022205049381"
# Residuals-vs-fitted plot for the transformed fit.
HRDifff_LM_DF <- data.frame(
  Fitted = HRDifff_LM$fitted.values,
  Resid = HRDifff_LM$residuals
)
ggplot(HRDifff_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)
# LM2: fourteen-term model on TrainingData2 with log10(x + 1) applied to the
# batting/pitching HR and pitching BB counts and sqrt() applied to HitDiff
# and WalkDiff.
LM2 <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + TEAM_PITCHING_H +
    log10(TEAM_PITCHING_HR + 1) + log10(TEAM_PITCHING_BB + 1) +
    TEAM_FIELDING_E + SLG1 + sqrt(HitDiff) + sqrt(WalkDiff) + SLG2 + HRDiff,
  data = TrainingData2
)
summary(LM2)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB +
## TEAM_PITCHING_H + log10(TEAM_PITCHING_HR + 1) + log10(TEAM_PITCHING_BB +
## 1) + TEAM_FIELDING_E + SLG1 + sqrt(HitDiff) + sqrt(WalkDiff) +
## SLG2 + HRDiff, data = TrainingData2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.717 -8.575 0.268 8.673 64.196
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -85.811976 57.940923 -1.481 0.138739
## TEAM_BATTING_H 0.125754 0.025498 4.932 8.73e-07 ***
## TEAM_BATTING_2B -0.053140 0.010322 -5.148 2.86e-07 ***
## TEAM_BATTING_3B 0.066228 0.016262 4.073 4.81e-05 ***
## log10(TEAM_BATTING_HR + 1) 9.132120 19.464374 0.469 0.638993
## TEAM_BATTING_BB 0.045808 0.007511 6.099 1.25e-09 ***
## TEAM_PITCHING_H 0.011714 0.003955 2.962 0.003087 **
## log10(TEAM_PITCHING_HR + 1) -24.413303 18.805386 -1.298 0.194348
## log10(TEAM_PITCHING_BB + 1) -28.147916 7.328188 -3.841 0.000126 ***
## TEAM_FIELDING_E -0.034177 0.003025 -11.299 < 2e-16 ***
## SLG1 138.146757 29.089418 4.749 2.17e-06 ***
## sqrt(HitDiff) 92.104148 25.869677 3.560 0.000378 ***
## sqrt(WalkDiff) -72.003716 30.649354 -2.349 0.018896 *
## SLG2 -0.057520 0.019150 -3.004 0.002696 **
## HRDiff -35.548299 13.661528 -2.602 0.009327 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.15 on 2261 degrees of freedom
## Multiple R-squared: 0.3078, Adjusted R-squared: 0.3035
## F-statistic: 71.82 on 14 and 2261 DF, p-value: < 2.2e-16
# LM3: LM2's formula with log10(TEAM_PITCHING_HR + 1), sqrt(HitDiff),
# sqrt(WalkDiff) and HRDiff left out (ten terms remain).
LM3 <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + TEAM_PITCHING_H +
    log10(TEAM_PITCHING_BB + 1) + TEAM_FIELDING_E + SLG1 + SLG2,
  data = TrainingData2
)
summary(LM3)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB +
## TEAM_PITCHING_H + log10(TEAM_PITCHING_BB + 1) + TEAM_FIELDING_E +
## SLG1 + SLG2, data = TrainingData2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.284 -8.538 0.185 8.691 61.161
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.398e+02 3.703e+01 -3.776 0.000163 ***
## TEAM_BATTING_H 1.357e-01 2.496e-02 5.435 6.06e-08 ***
## TEAM_BATTING_2B -5.196e-02 1.027e-02 -5.058 4.57e-07 ***
## TEAM_BATTING_3B 9.615e-02 1.457e-02 6.601 5.07e-11 ***
## log10(TEAM_BATTING_HR + 1) -1.065e+01 2.149e+00 -4.957 7.67e-07 ***
## TEAM_BATTING_BB 3.357e-02 4.970e-03 6.755 1.82e-11 ***
## TEAM_PITCHING_H 1.364e-02 1.735e-03 7.862 5.82e-15 ***
## log10(TEAM_PITCHING_BB + 1) -1.515e+01 4.049e+00 -3.742 0.000187 ***
## TEAM_FIELDING_E -3.739e-02 2.786e-03 -13.422 < 2e-16 ***
## SLG1 1.401e+02 2.809e+01 4.987 6.59e-07 ***
## SLG2 -6.709e-02 1.852e-02 -3.623 0.000298 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.25 on 2265 degrees of freedom
## Multiple R-squared: 0.2961, Adjusted R-squared: 0.293
## F-statistic: 95.28 on 10 and 2265 DF, p-value: < 2.2e-16
library(GA)
## Warning: package 'GA' was built under R version 3.6.3
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.6.3
## Loading required package: iterators
## Warning: package 'iterators' was built under R version 3.6.3
## Package 'GA' version 3.2
## Type 'citation("GA")' for citing this R package in publications.
##
## Attaching package: 'GA'
## The following object is masked from 'package:utils':
##
## de
# Fix the random seed ahead of the genetic-algorithm search further down so
# that run can be repeated.
set.seed(4167)
#' Fitness function for the genetic-algorithm predictor search
#'
#' Multiplies predictor column `i + 1` of `data` by `string[i]`, so a 0 bit
#' turns that column into all zeros and a 1 bit leaves it unchanged, then
#' refits the LM2 formula on the masked data and returns its R-squared.
#'
#' Unlike the earlier version, nothing is written to the global environment:
#' the masked data frame is only needed locally by `lm()`.
#'
#' NOTE(review): plain R-squared cannot decrease when a predictor is added, so
#' this score favours keeping every column. If the goal is to find a smaller
#' model, a penalised score (adjusted R-squared, AIC) may be what is wanted.
#'
#' @param string Numeric 0/1 vector; element `i` controls column `i + 1` of
#'   `data` (column 1 is left untouched).
#' @param data Data frame holding `TARGET_WINS` and the predictors named in
#'   the formula. Defaults to the global `TrainingData2`.
#' @return The multiple R-squared of the fitted model (a single number).
LMFunction <- function(string, data = TrainingData2) {
  # Column 1 is never masked, so there must be at least one more column than
  # there are bits.
  stopifnot(length(string) < ncol(data))

  masked <- data
  cols <- seq_along(string) + 1L
  # Scale each selected column by its own bit (column-by-column, not recycled
  # across the whole frame).
  masked[cols] <- Map(function(column, keep) column * keep, data[cols], string)

  fit <- lm(
    TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
      log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + TEAM_PITCHING_H +
      log10(TEAM_PITCHING_HR + 1) + log10(TEAM_PITCHING_BB + 1) +
      TEAM_FIELDING_E + SLG1 + sqrt(HitDiff) + sqrt(WalkDiff) + SLG2 + HRDiff,
    data = masked
  )
  summary(fit)$r.squared
}
# Run GA's binary genetic algorithm over 14-bit masks, scoring each mask with
# LMFunction.
ga1 <- ga(
  type = "binary",
  fitness = LMFunction,
  nBits = 14
)
# Best fitness value found by the search.
ga1@fitnessValue
## [1] 0.3078181
# Compare LM1 with LM3. The formulas echoed in the table show LM1 using raw
# TEAM_BATTING_HR / TEAM_PITCHING_BB while LM3 uses their log10(x + 1) forms,
# so neither model's terms are a subset of the other's; the table reports no
# F statistic or p-value for this pair.
# NOTE(review): for non-nested models an information criterion (AIC/BIC) may
# be a more suitable comparison - confirm intent.
anova(LM1,LM3)
## Analysis of Variance Table
##
## Model 1: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_FIELDING_E + SLG1 + HitDiff + WalkDiff +
## SLG2 + HRDiff
## Model 2: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + TEAM_PITCHING_H +
## log10(TEAM_PITCHING_BB + 1) + TEAM_FIELDING_E + SLG1 + SLG2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2262 400098
## 2 2265 397350 -3 2748.4
# Compare LM2 with LM3. LM3's terms are a subset of LM2's (it omits
# log10(TEAM_PITCHING_HR + 1), sqrt(HitDiff), sqrt(WalkDiff) and HRDiff), so
# this is a nested comparison. The larger model is listed first, which is why
# Df and Sum of Sq come out negative in the table.
anova(LM2,LM3)
## Analysis of Variance Table
##
## Model 1: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + TEAM_PITCHING_H +
## log10(TEAM_PITCHING_HR + 1) + log10(TEAM_PITCHING_BB + 1) +
## TEAM_FIELDING_E + SLG1 + sqrt(HitDiff) + sqrt(WalkDiff) +
## SLG2 + HRDiff
## Model 2: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + TEAM_PITCHING_H +
## log10(TEAM_PITCHING_BB + 1) + TEAM_FIELDING_E + SLG1 + SLG2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2261 390734
## 2 2265 397350 -4 -6615.7 9.5706 1.132e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Compare LM1 with LM2. Both formulas list the same fourteen predictors, with
# LM2 applying log10(x + 1) or sqrt() to some of them, so neither model's
# terms are a subset of the other's.
# NOTE(review): the F test printed here is designed for nested models;
# consider AIC/BIC for this pair - confirm intent.
anova(LM1,LM2)
## Analysis of Variance Table
##
## Model 1: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_FIELDING_E + SLG1 + HitDiff + WalkDiff +
## SLG2 + HRDiff
## Model 2: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + TEAM_PITCHING_H +
## log10(TEAM_PITCHING_HR + 1) + log10(TEAM_PITCHING_BB + 1) +
## TEAM_FIELDING_E + SLG1 + sqrt(HitDiff) + sqrt(WalkDiff) +
## SLG2 + HRDiff
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2262 400098
## 2 2261 390734 1 9364.2 54.186 2.54e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1