Import/Clean

Import Data / View Summaries

library(rvest)

## Warning: package 'rvest' was built under R version 3.6.3

## Loading required package: xml2

## Warning: package 'xml2' was built under R version 3.6.3

library(ggplot2)

# Download the training set from the course repository; read.csv() fetches the
# URL held in x directly.
x <- "https://raw.githubusercontent.com/ChristopherBloome/621/main/moneyball-training-data.csv"
TrainingData <- read.csv(x)

# Per-column quartiles, mean and NA counts; the NA counts drive the column
# removal further down.
summary(TrainingData)

##      INDEX         TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B
##  Min.   :   1.0   Min.   :  0.00   Min.   : 891   Min.   : 69.0  
##  1st Qu.: 630.8   1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0  
##  Median :1270.5   Median : 82.00   Median :1454   Median :238.0  
##  Mean   :1268.5   Mean   : 80.79   Mean   :1469   Mean   :241.2  
##  3rd Qu.:1915.5   3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0  
##  Max.   :2535.0   Max.   :146.00   Max.   :2554   Max.   :458.0  
##                                                                  
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO 
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.: 34.00   1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0  
##  Median : 47.00   Median :102.00   Median :512.0   Median : 750.0  
##  Mean   : 55.25   Mean   : 99.61   Mean   :501.6   Mean   : 735.6  
##  3rd Qu.: 72.00   3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0  
##  Max.   :223.00   Max.   :264.00   Max.   :878.0   Max.   :1399.0  
##                                                    NA's   :102     
##  TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
##  Min.   :  0.0   Min.   :  0.0   Min.   :29.00    Min.   : 1137  
##  1st Qu.: 66.0   1st Qu.: 38.0   1st Qu.:50.50    1st Qu.: 1419  
##  Median :101.0   Median : 49.0   Median :58.00    Median : 1518  
##  Mean   :124.8   Mean   : 52.8   Mean   :59.36    Mean   : 1779  
##  3rd Qu.:156.0   3rd Qu.: 62.0   3rd Qu.:67.00    3rd Qu.: 1682  
##  Max.   :697.0   Max.   :201.0   Max.   :95.00    Max.   :30132  
##  NA's   :131     NA's   :772     NA's   :2085                    
##  TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##  Min.   :  0.0    Min.   :   0.0   Min.   :    0.0   Min.   :  65.0  
##  1st Qu.: 50.0    1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0  
##  Median :107.0    Median : 536.5   Median :  813.5   Median : 159.0  
##  Mean   :105.7    Mean   : 553.0   Mean   :  817.7   Mean   : 246.5  
##  3rd Qu.:150.0    3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2  
##  Max.   :343.0    Max.   :3645.0   Max.   :19278.0   Max.   :1898.0  
##                                    NA's   :102                       
##  TEAM_FIELDING_DP
##  Min.   : 52.0   
##  1st Qu.:131.0   
##  Median :149.0   
##  Mean   :146.4   
##  3rd Qu.:164.0   
##  Max.   :228.0   
##  NA's   :286

# Download the evaluation set; x is reused and now holds the evaluation URL.
x <- "https://raw.githubusercontent.com/ChristopherBloome/621/main/moneyball-evaluation-data.csv"
TestingData <- read.csv(x)

# Same per-column summary as for the training data. The printed summary has no
# TARGET_WINS column for this file.
summary(TestingData)

##      INDEX      TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B 
##  Min.   :   9   Min.   : 819   Min.   : 44.0   Min.   : 14.00  
##  1st Qu.: 708   1st Qu.:1387   1st Qu.:210.0   1st Qu.: 35.00  
##  Median :1249   Median :1455   Median :239.0   Median : 52.00  
##  Mean   :1264   Mean   :1469   Mean   :241.3   Mean   : 55.91  
##  3rd Qu.:1832   3rd Qu.:1548   3rd Qu.:278.5   3rd Qu.: 72.00  
##  Max.   :2525   Max.   :2170   Max.   :376.0   Max.   :155.00  
##                                                                
##  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO  TEAM_BASERUN_SB
##  Min.   :  0.00   Min.   : 15.0   Min.   :   0.0   Min.   :  0.0  
##  1st Qu.: 44.50   1st Qu.:436.5   1st Qu.: 545.0   1st Qu.: 59.0  
##  Median :101.00   Median :509.0   Median : 686.0   Median : 92.0  
##  Mean   : 95.63   Mean   :499.0   Mean   : 709.3   Mean   :123.7  
##  3rd Qu.:135.50   3rd Qu.:565.5   3rd Qu.: 912.0   3rd Qu.:151.8  
##  Max.   :242.00   Max.   :792.0   Max.   :1268.0   Max.   :580.0  
##                                   NA's   :18       NA's   :13     
##  TEAM_BASERUN_CS  TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
##  Min.   :  0.00   Min.   :42.00    Min.   : 1155   Min.   :  0.0   
##  1st Qu.: 38.00   1st Qu.:53.50    1st Qu.: 1426   1st Qu.: 52.0   
##  Median : 49.50   Median :62.00    Median : 1515   Median :104.0   
##  Mean   : 52.32   Mean   :62.37    Mean   : 1813   Mean   :102.1   
##  3rd Qu.: 63.00   3rd Qu.:67.50    3rd Qu.: 1681   3rd Qu.:142.5   
##  Max.   :154.00   Max.   :96.00    Max.   :22768   Max.   :336.0   
##  NA's   :87       NA's   :240                                      
##  TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E  TEAM_FIELDING_DP
##  Min.   : 136.0   Min.   :   0.0   Min.   :  73.0   Min.   : 69.0   
##  1st Qu.: 471.0   1st Qu.: 613.0   1st Qu.: 131.0   1st Qu.:131.0   
##  Median : 526.0   Median : 745.0   Median : 163.0   Median :148.0   
##  Mean   : 552.4   Mean   : 799.7   Mean   : 249.7   Mean   :146.1   
##  3rd Qu.: 606.5   3rd Qu.: 938.0   3rd Qu.: 252.0   3rd Qu.:164.0   
##  Max.   :2008.0   Max.   :9963.0   Max.   :1568.0   Max.   :204.0   
##                   NA's   :18                        NA's   :31

Remove incomplete fields

# Keep every column except the six that report NA's in the summary above and
# the INDEX column. Selection is by name, so the remaining columns keep their
# original order.
TrainingData1 <- TrainingData[, setdiff(
  names(TrainingData),
  c(
    "TEAM_BATTING_HBP", "TEAM_BASERUN_CS", "TEAM_FIELDING_DP",
    "TEAM_BATTING_SO", "TEAM_PITCHING_SO", "TEAM_BASERUN_SB", "INDEX"
  )
)]

summary(TrainingData1)

##   TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B 
##  Min.   :  0.00   Min.   : 891   Min.   : 69.0   Min.   :  0.00  
##  1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0   1st Qu.: 34.00  
##  Median : 82.00   Median :1454   Median :238.0   Median : 47.00  
##  Mean   : 80.79   Mean   :1469   Mean   :241.2   Mean   : 55.25  
##  3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0   3rd Qu.: 72.00  
##  Max.   :146.00   Max.   :2554   Max.   :458.0   Max.   :223.00  
##  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_PITCHING_H TEAM_PITCHING_HR
##  Min.   :  0.00   Min.   :  0.0   Min.   : 1137   Min.   :  0.0   
##  1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 1419   1st Qu.: 50.0   
##  Median :102.00   Median :512.0   Median : 1518   Median :107.0   
##  Mean   : 99.61   Mean   :501.6   Mean   : 1779   Mean   :105.7   
##  3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 1682   3rd Qu.:150.0   
##  Max.   :264.00   Max.   :878.0   Max.   :30132   Max.   :343.0   
##  TEAM_PITCHING_BB TEAM_FIELDING_E 
##  Min.   :   0.0   Min.   :  65.0  
##  1st Qu.: 476.0   1st Qu.: 127.0  
##  Median : 536.5   Median : 159.0  
##  Mean   : 553.0   Mean   : 246.5  
##  3rd Qu.: 611.0   3rd Qu.: 249.2  
##  Max.   :3645.0   Max.   :1898.0

Add Calculated Fields

# Derived predictors, appended in the order SLG1, SLG2, HitDiff, WalkDiff,
# HRDiff.
#   SLG1 - (H + 2B + 2*3B + 3*HR) divided by hits.
#   SLG2 - the same weighted sum, left undivided.
#   HitDiff / WalkDiff / HRDiff - batting total over the matching pitching
#     total; these are ratios, despite the "Diff" suffix.
# A zero pitching total makes the ratio non-finite (0/0 gives NaN).
TrainingData1 <- transform(
  TrainingData1,
  SLG1 = (TEAM_BATTING_H + TEAM_BATTING_2B + 2 * TEAM_BATTING_3B +
    3 * TEAM_BATTING_HR) / TEAM_BATTING_H,
  SLG2 = (TEAM_BATTING_H + TEAM_BATTING_2B + 2 * TEAM_BATTING_3B +
    3 * TEAM_BATTING_HR),
  HitDiff = TEAM_BATTING_H / TEAM_PITCHING_H,
  WalkDiff = TEAM_BATTING_BB / TEAM_PITCHING_BB,
  HRDiff = TEAM_BATTING_HR / TEAM_PITCHING_HR
)


# TrainingData2 is the modelling copy of TrainingData1. The three ratio fields
# are NaN where both batting and pitching totals are zero (0/0); those entries
# are set to 1. The replacement is done in place on the copy, so TrainingData1
# keeps its NaN values, and column names and order are unchanged.
# NOTE(review): is.na() does not match Inf, so a positive batting total over a
# zero pitching total would pass through untouched.
TrainingData2 <- TrainingData1
TrainingData2$HRDiff[is.na(TrainingData2$HRDiff)] <- 1
TrainingData2$WalkDiff[is.na(TrainingData2$WalkDiff)] <- 1
TrainingData2$HitDiff[is.na(TrainingData2$HitDiff)] <- 1

names(TrainingData1)

##  [1] "TARGET_WINS"      "TEAM_BATTING_H"   "TEAM_BATTING_2B"  "TEAM_BATTING_3B" 
##  [5] "TEAM_BATTING_HR"  "TEAM_BATTING_BB"  "TEAM_PITCHING_H"  "TEAM_PITCHING_HR"
##  [9] "TEAM_PITCHING_BB" "TEAM_FIELDING_E"  "SLG1"             "SLG2"            
## [13] "HitDiff"          "WalkDiff"         "HRDiff"

First Model

# Baseline model: TARGET_WINS regressed on every retained raw field plus the
# five calculated fields, fitted on TrainingData2 (ratio NaNs replaced by 1).
# NOTE(review): SLG2 is defined as TEAM_BATTING_H + TEAM_BATTING_2B +
# 2*TEAM_BATTING_3B + 3*TEAM_BATTING_HR, an exact linear combination of four
# other terms in this formula. lm() therefore cannot estimate it and the
# summary lists it as NA ("1 not defined because of singularities"). Consider
# dropping SLG2 from the formula.
LM1 <- lm(TARGET_WINS~TEAM_BATTING_H+TEAM_BATTING_2B+TEAM_BATTING_3B+TEAM_BATTING_HR+TEAM_BATTING_BB+TEAM_PITCHING_H+TEAM_PITCHING_HR+TEAM_PITCHING_BB+TEAM_FIELDING_E+SLG1+HitDiff+WalkDiff+SLG2+HRDiff, data = TrainingData2 )

summary(LM1)

## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_PITCHING_H + 
##     TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_FIELDING_E + SLG1 + 
##     HitDiff + WalkDiff + SLG2 + HRDiff, data = TrainingData2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.884  -8.652   0.330   8.763  59.970 
## 
## Coefficients: (1 not defined because of singularities)
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -2.733e+01  3.956e+01  -0.691 0.489790    
## TEAM_BATTING_H    6.050e-02  7.600e-03   7.960 2.69e-15 ***
## TEAM_BATTING_2B  -6.494e-02  1.935e-02  -3.356 0.000804 ***
## TEAM_BATTING_3B   5.082e-02  3.898e-02   1.304 0.192503    
## TEAM_BATTING_HR   9.703e-02  6.837e-02   1.419 0.155974    
## TEAM_BATTING_BB   1.079e-02  5.504e-03   1.961 0.050045 .  
## TEAM_PITCHING_H  -1.447e-03  5.239e-04  -2.762 0.005798 ** 
## TEAM_PITCHING_HR -1.696e-01  3.180e-02  -5.333 1.06e-07 ***
## TEAM_PITCHING_BB  6.138e-03  3.553e-03   1.728 0.084172 .  
## TEAM_FIELDING_E  -3.367e-02  3.302e-03 -10.196  < 2e-16 ***
## SLG1              5.894e+01  2.788e+01   2.114 0.034628 *  
## HitDiff          -6.132e+01  1.933e+01  -3.172 0.001532 ** 
## WalkDiff          2.928e+01  1.722e+01   1.700 0.089235 .  
## SLG2                     NA         NA      NA       NA    
## HRDiff           -1.452e+01  6.897e+00  -2.106 0.035352 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.3 on 2262 degrees of freedom
## Multiple R-squared:  0.2912, Adjusted R-squared:  0.2872 
## F-statistic:  71.5 on 13 and 2262 DF,  p-value: < 2.2e-16

Explore Transformations

TEAM_PITCHING_H

Non-Transformed seems most predictive

# Not transformed: histogram of TEAM_PITCHING_H as recorded.
ggplot(TrainingData1, aes(x = TEAM_PITCHING_H)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Jittered scatter of wins against the predictor, overlaid with the default
# smoother and a straight-line fit.
ggplot(TrainingData1, aes(x = TEAM_PITCHING_H, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Single-predictor regression; its R-squared is the yardstick for the
# transformations tried below.
TEAM_PITCHING_H_LM <- lm(TARGET_WINS ~ TEAM_PITCHING_H, data = TrainingData1)

print(paste("Non Transformed", summary(TEAM_PITCHING_H_LM)[["r.squared"]]))

## [1] "Non Transformed 0.012086155892813"

# Residuals against fitted values.
TEAM_PITCHING_H_LM_DF <- data.frame(
  Fitted = TEAM_PITCHING_H_LM$fitted.values,
  Resid = TEAM_PITCHING_H_LM$residuals
)

ggplot(TEAM_PITCHING_H_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# Log10: the same three views with log10(TEAM_PITCHING_H) as the predictor.

ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_H))) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_H), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_PITCHING_H_LM2 <- lm(
  TARGET_WINS ~ log10(TEAM_PITCHING_H),
  data = TrainingData1
)

print(paste("Log10", summary(TEAM_PITCHING_H_LM2)[["r.squared"]]))

## [1] "Log10 0.000117579870753259"

# Residuals against fitted values for the log10 model.
TEAM_PITCHING_H_LM2_DF <- data.frame(
  Fitted = TEAM_PITCHING_H_LM2$fitted.values,
  Resid = TEAM_PITCHING_H_LM2$residuals
)

ggplot(TEAM_PITCHING_H_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# Not convinced the log transform is worth it: its R-squared is lower than
# that of the untransformed fit.

# Cap TEAM_PITCHING_H at 3000 in the modelling copy. The values are taken from
# TrainingData1, so TrainingData1 itself stays uncapped. pmin() replaces the
# earlier detour through a scratch data frame, a self-assignment of
# TrainingData2 and a names() reset, none of which changed the result.
# The cap stays in TrainingData2 for everything that follows; HitDiff in
# TrainingData2 was computed earlier from the uncapped values.
TrainingData2$TEAM_PITCHING_H <- pmin(TrainingData1$TEAM_PITCHING_H, 3000)

# CAPPED: histogram of the capped predictor.
ggplot(TrainingData2, aes(x = TEAM_PITCHING_H)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(TrainingData2, aes(x = TEAM_PITCHING_H, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_PITCHING_H_LM3 <- lm(TARGET_WINS ~ TEAM_PITCHING_H, data = TrainingData2)

print(paste("CAPPED", summary(TEAM_PITCHING_H_LM3)[["r.squared"]]))

## [1] "CAPPED 0.00477054431469875"

# Residuals against fitted values for the capped model.
TEAM_PITCHING_H_LM3_DF <- data.frame(
  Fitted = TEAM_PITCHING_H_LM3$fitted.values,
  Resid = TEAM_PITCHING_H_LM3$residuals
)

ggplot(TEAM_PITCHING_H_LM3_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# Capping changes the slope of the regression line considerably.

SLG1

Non-transformed

# Not transformed: histogram of SLG1.
ggplot(TrainingData1, aes(x = SLG1)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Wins against SLG1 with the default smoother and a straight-line fit.
ggplot(TrainingData1, aes(x = SLG1, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

SLG1_LM <- lm(TARGET_WINS ~ SLG1, data = TrainingData1)

print(paste("Non Transformed", summary(SLG1_LM)[["r.squared"]]))

## [1] "Non Transformed 0.0362979442242567"

# Residuals against fitted values.
SLG1_LM_DF <- data.frame(
  Fitted = SLG1_LM$fitted.values,
  Resid = SLG1_LM$residuals
)

ggplot(SLG1_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

SLG2

Non-Transformed

# Not transformed: histogram of SLG2.
ggplot(TrainingData1, aes(x = SLG2)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Wins against SLG2 with the default smoother and a straight-line fit.
ggplot(TrainingData1, aes(x = SLG2, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

SLG2_LM <- lm(TARGET_WINS ~ SLG2, data = TrainingData1)

print(paste("Non Transformed", summary(SLG2_LM)[["r.squared"]]))

## [1] "Non Transformed 0.178739044216743"

# Residuals against fitted values.
SLG2_LM_DF <- data.frame(
  Fitted = SLG2_LM$fitted.values,
  Resid = SLG2_LM$residuals
)

ggplot(SLG2_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

Team_Batting_H

Non-Transformed

# Not transformed: histogram of TEAM_BATTING_H.
ggplot(TrainingData1, aes(x = TEAM_BATTING_H)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Wins against team hits with the default smoother and a straight-line fit.
ggplot(TrainingData1, aes(x = TEAM_BATTING_H, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_BATTING_H_LM <- lm(TARGET_WINS ~ TEAM_BATTING_H, data = TrainingData1)

print(paste("Non Transformed", summary(TEAM_BATTING_H_LM)[["r.squared"]]))

## [1] "Non Transformed 0.151140185440693"

# Residuals against fitted values.
TEAM_BATTING_H_LM_DF <- data.frame(
  Fitted = TEAM_BATTING_H_LM$fitted.values,
  Resid = TEAM_BATTING_H_LM$residuals
)

ggplot(TEAM_BATTING_H_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

Team_Batting_2B

Non-transformed

# Not transformed: histogram of TEAM_BATTING_2B.
ggplot(TrainingData1, aes(x = TEAM_BATTING_2B)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Wins against doubles with the default smoother and a straight-line fit.
ggplot(TrainingData1, aes(x = TEAM_BATTING_2B, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_BATTING_2B_LM <- lm(TARGET_WINS ~ TEAM_BATTING_2B, data = TrainingData1)

print(paste("Non Transformed", summary(TEAM_BATTING_2B_LM)[["r.squared"]]))

## [1] "Non Transformed 0.0835809176046633"

# Residuals against fitted values.
TEAM_BATTING_2B_LM_DF <- data.frame(
  Fitted = TEAM_BATTING_2B_LM$fitted.values,
  Resid = TEAM_BATTING_2B_LM$residuals
)

ggplot(TEAM_BATTING_2B_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

Team_Batting_3B

Non-Transformed

# Not transformed: histogram of TEAM_BATTING_3B.
ggplot(TrainingData1, aes(x = TEAM_BATTING_3B)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Wins against triples with the default smoother and a straight-line fit.
ggplot(TrainingData1, aes(x = TEAM_BATTING_3B, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_BATTING_3B_LM <- lm(TARGET_WINS ~ TEAM_BATTING_3B, data = TrainingData1)

print(paste("Non Transformed", summary(TEAM_BATTING_3B_LM)[["r.squared"]]))

## [1] "Non Transformed 0.0203371588023067"

# Residuals against fitted values.
TEAM_BATTING_3B_LM_DF <- data.frame(
  Fitted = TEAM_BATTING_3B_LM$fitted.values,
  Resid = TEAM_BATTING_3B_LM$residuals
)

ggplot(TEAM_BATTING_3B_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# Log10: predictor is log10(TEAM_BATTING_3B + 1); the +1 keeps zero counts
# finite.

ggplot(TrainingData1, aes(x = log10(TEAM_BATTING_3B + 1))) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(TrainingData1, aes(x = log10(TEAM_BATTING_3B + 1), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_BATTING_3B_LM2 <- lm(
  TARGET_WINS ~ log10(TEAM_BATTING_3B + 1),
  data = TrainingData1
)

print(paste("Log10", summary(TEAM_BATTING_3B_LM2)[["r.squared"]]))

## [1] "Log10 0.0154687934749513"

# Residuals against fitted values for the log10 model.
TEAM_BATTING_3B_LM2_DF <- data.frame(
  Fitted = TEAM_BATTING_3B_LM2$fitted.values,
  Resid = TEAM_BATTING_3B_LM2$residuals
)

ggplot(TEAM_BATTING_3B_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

Team_Batting_HR

Log10

# Not transformed: histogram of TEAM_BATTING_HR.
ggplot(TrainingData1, aes(x = TEAM_BATTING_HR)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Wins against home runs with the default smoother and a straight-line fit.
ggplot(TrainingData1, aes(x = TEAM_BATTING_HR, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_BATTING_HR_LM <- lm(TARGET_WINS ~ TEAM_BATTING_HR, data = TrainingData1)

print(paste("Non Transformed", summary(TEAM_BATTING_HR_LM)[["r.squared"]]))

## [1] "Non Transformed 0.0310299500358932"

# Residuals against fitted values.
TEAM_BATTING_HR_LM_DF <- data.frame(
  Fitted = TEAM_BATTING_HR_LM$fitted.values,
  Resid = TEAM_BATTING_HR_LM$residuals
)

ggplot(TEAM_BATTING_HR_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# Log10: predictor is log10(TEAM_BATTING_HR + 1); the +1 keeps zero counts
# finite.

ggplot(TrainingData1, aes(x = log10(TEAM_BATTING_HR + 1))) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(TrainingData1, aes(x = log10(TEAM_BATTING_HR + 1), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_BATTING_HR_LM2 <- lm(
  TARGET_WINS ~ log10(TEAM_BATTING_HR + 1),
  data = TrainingData1
)

print(paste("Log10", summary(TEAM_BATTING_HR_LM2)[["r.squared"]]))

## [1] "Log10 0.0418750229513914"

# Residuals against fitted values for the log10 model.
TEAM_BATTING_HR_LM2_DF <- data.frame(
  Fitted = TEAM_BATTING_HR_LM2$fitted.values,
  Resid = TEAM_BATTING_HR_LM2$residuals
)

ggplot(TEAM_BATTING_HR_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

Team_Batting_BB

Non-Transformed

# Not transformed: histogram of TEAM_BATTING_BB.
ggplot(TrainingData1, aes(x = TEAM_BATTING_BB)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Wins against walks with the default smoother and a straight-line fit.
ggplot(TrainingData1, aes(x = TEAM_BATTING_BB, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_BATTING_BB_LM <- lm(TARGET_WINS ~ TEAM_BATTING_BB, data = TrainingData1)

print(paste("Non Transformed", summary(TEAM_BATTING_BB_LM)[["r.squared"]]))

## [1] "Non Transformed 0.05408409024369"

# Residuals against fitted values.
TEAM_BATTING_BB_LM_DF <- data.frame(
  Fitted = TEAM_BATTING_BB_LM$fitted.values,
  Resid = TEAM_BATTING_BB_LM$residuals
)

ggplot(TEAM_BATTING_BB_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

Team_Batting_H

Non-Transformed

# Not transformed: TEAM_BATTING_H once more. This repeats the earlier
# TEAM_BATTING_H exploration and refits the same model under the same names;
# the printed R-squared matches the earlier value.
ggplot(TrainingData1, aes(x = TEAM_BATTING_H)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(TrainingData1, aes(x = TEAM_BATTING_H, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_BATTING_H_LM <- lm(TARGET_WINS ~ TEAM_BATTING_H, data = TrainingData1)

print(paste("Non Transformed", summary(TEAM_BATTING_H_LM)[["r.squared"]]))

## [1] "Non Transformed 0.151140185440693"

# Residuals against fitted values.
TEAM_BATTING_H_LM_DF <- data.frame(
  Fitted = TEAM_BATTING_H_LM$fitted.values,
  Resid = TEAM_BATTING_H_LM$residuals
)

ggplot(TEAM_BATTING_H_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

Team_Pitching_HR

Log10

# Not transformed: histogram of TEAM_PITCHING_HR.
ggplot(TrainingData1, aes(x = TEAM_PITCHING_HR)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Wins against home runs allowed with the default smoother and a
# straight-line fit.
ggplot(TrainingData1, aes(x = TEAM_PITCHING_HR, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_PITCHING_HR_LM <- lm(TARGET_WINS ~ TEAM_PITCHING_HR, data = TrainingData1)

print(paste("Non Transformed", summary(TEAM_PITCHING_HR_LM)[["r.squared"]]))

## [1] "Non Transformed 0.0357261919307869"

# Residuals against fitted values.
TEAM_PITCHING_HR_LM_DF <- data.frame(
  Fitted = TEAM_PITCHING_HR_LM$fitted.values,
  Resid = TEAM_PITCHING_HR_LM$residuals
)

ggplot(TEAM_PITCHING_HR_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# Log10: predictor is log10(TEAM_PITCHING_HR + 1); the +1 keeps zero counts
# finite.

ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_HR + 1))) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_HR + 1), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_PITCHING_HR_LM2 <- lm(
  TARGET_WINS ~ log10(TEAM_PITCHING_HR + 1),
  data = TrainingData1
)

print(paste("Log10", summary(TEAM_PITCHING_HR_LM2)[["r.squared"]]))

## [1] "Log10 0.0473982795376588"

# Residuals against fitted values for the log10 model.
TEAM_PITCHING_HR_LM2_DF <- data.frame(
  Fitted = TEAM_PITCHING_HR_LM2$fitted.values,
  Resid = TEAM_PITCHING_HR_LM2$residuals
)

ggplot(TEAM_PITCHING_HR_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

Team_Pitching_BB

Capped at 1000, AND Log10

# Not transformed: histogram of TEAM_PITCHING_BB.
ggplot(TrainingData1, aes(x = TEAM_PITCHING_BB)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Wins against walks allowed with the default smoother and a straight-line
# fit.
ggplot(TrainingData1, aes(x = TEAM_PITCHING_BB, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_PITCHING_BB_LM <- lm(TARGET_WINS ~ TEAM_PITCHING_BB, data = TrainingData1)

print(paste("Non Transformed", summary(TEAM_PITCHING_BB_LM)[["r.squared"]]))

## [1] "Non Transformed 0.015419315390593"

# Residuals against fitted values.
TEAM_PITCHING_BB_LM_DF <- data.frame(
  Fitted = TEAM_PITCHING_BB_LM$fitted.values,
  Resid = TEAM_PITCHING_BB_LM$residuals
)

ggplot(TEAM_PITCHING_BB_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# Log10: predictor is log10(TEAM_PITCHING_BB + 1); the +1 keeps zero counts
# finite.

ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_BB + 1))) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_BB + 1), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_PITCHING_BB_LM2 <- lm(
  TARGET_WINS ~ log10(TEAM_PITCHING_BB + 1),
  data = TrainingData1
)

print(paste("Log10", summary(TEAM_PITCHING_BB_LM2)[["r.squared"]]))

## [1] "Log10 0.0365613920428514"

# Residuals against fitted values for the log10 model.
TEAM_PITCHING_BB_LM2_DF <- data.frame(
  Fitted = TEAM_PITCHING_BB_LM2$fitted.values,
  Resid = TEAM_PITCHING_BB_LM2$residuals
)

ggplot(TEAM_PITCHING_BB_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# Log10 + 100: same views with an offset of 100 inside the log instead of 1.

ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_BB + 100))) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(TrainingData1, aes(x = log10(TEAM_PITCHING_BB + 100), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Stored under the names TEAM_PITCHING_BB_LM2 / TEAM_PITCHING_BB_LM2_DF, which
# the log10(x + 1) fit also uses; from here on they hold this fit.
TEAM_PITCHING_BB_LM2 <- lm(
  TARGET_WINS ~ log10(TEAM_PITCHING_BB + 100),
  data = TrainingData1
)

print(paste("Log10 + 100", summary(TEAM_PITCHING_BB_LM2)[["r.squared"]]))

## [1] "Log10 + 100 0.0308748132493466"

TEAM_PITCHING_BB_LM2_DF <- data.frame(
  Fitted = TEAM_PITCHING_BB_LM2$fitted.values,
  Resid = TEAM_PITCHING_BB_LM2$residuals
)

ggplot(TEAM_PITCHING_BB_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# CAPPED: cap TEAM_PITCHING_BB at 1000 in the modelling copy. The values are
# taken from TrainingData1, which stays uncapped. pmin() replaces the earlier
# detour through a scratch data frame, a self-assignment of TrainingData2 and
# a names() reset, none of which changed the result.
# The cap stays in TrainingData2 for everything that follows; WalkDiff in
# TrainingData2 was computed earlier from the uncapped values.
TrainingData2$TEAM_PITCHING_BB <- pmin(TrainingData1$TEAM_PITCHING_BB, 1000)

ggplot(TrainingData2, aes(x = TEAM_PITCHING_BB)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(TrainingData2, aes(x = TEAM_PITCHING_BB, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_PITCHING_BB_LM3 <- lm(TARGET_WINS ~ TEAM_PITCHING_BB, data = TrainingData2)

print(paste("CAPPED", summary(TEAM_PITCHING_BB_LM3)[["r.squared"]]))

## [1] "CAPPED 0.0329242395344625"

# Residuals against fitted values for the capped model.
TEAM_PITCHING_BB_LM3_DF <- data.frame(
  Fitted = TEAM_PITCHING_BB_LM3$fitted.values,
  Resid = TEAM_PITCHING_BB_LM3$residuals
)

ggplot(TEAM_PITCHING_BB_LM3_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# Capped with Log: log10(TEAM_PITCHING_BB + 1) taken from TrainingData2, where
# the column has already been capped at 1000.

ggplot(TrainingData2, aes(x = log10(TEAM_PITCHING_BB + 1))) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(TrainingData2, aes(x = log10(TEAM_PITCHING_BB + 1), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Stored under the names TEAM_PITCHING_BB_LM3 / TEAM_PITCHING_BB_LM3_DF, which
# the capped-only fit also uses; from here on they hold this fit.
TEAM_PITCHING_BB_LM3 <- lm(
  TARGET_WINS ~ log10(TEAM_PITCHING_BB + 1),
  data = TrainingData2
)

print(paste("CAPPED and Log", summary(TEAM_PITCHING_BB_LM3)[["r.squared"]]))

## [1] "CAPPED and Log 0.0415885398806444"

TEAM_PITCHING_BB_LM3_DF <- data.frame(
  Fitted = TEAM_PITCHING_BB_LM3$fitted.values,
  Resid = TEAM_PITCHING_BB_LM3$residuals
)

ggplot(TEAM_PITCHING_BB_LM3_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

TEAM_FIELDING_E

Non-Transformed

# Not transformed: histogram of TEAM_FIELDING_E.
ggplot(TrainingData1, aes(x = TEAM_FIELDING_E)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Wins against fielding errors with the default smoother and a straight-line
# fit.
ggplot(TrainingData1, aes(x = TEAM_FIELDING_E, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_FIELDING_E_LM <- lm(TARGET_WINS ~ TEAM_FIELDING_E, data = TrainingData1)

print(paste("Non Transformed", summary(TEAM_FIELDING_E_LM)[["r.squared"]]))

## [1] "Non Transformed 0.0311468701765676"

# Residuals against fitted values.
TEAM_FIELDING_E_LM_DF <- data.frame(
  Fitted = TEAM_FIELDING_E_LM$fitted.values,
  Resid = TEAM_FIELDING_E_LM$residuals
)

ggplot(TEAM_FIELDING_E_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# Log10: predictor is log10(TEAM_FIELDING_E + 1).

ggplot(TrainingData1, aes(x = log10(TEAM_FIELDING_E + 1))) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(TrainingData1, aes(x = log10(TEAM_FIELDING_E + 1), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

TEAM_FIELDING_E_LM2 <- lm(
  TARGET_WINS ~ log10(TEAM_FIELDING_E + 1),
  data = TrainingData1
)

print(paste("Log10", summary(TEAM_FIELDING_E_LM2)[["r.squared"]]))

## [1] "Log10 0.0227521650191439"

# Residuals against fitted values for the log10 model.
TEAM_FIELDING_E_LM2_DF <- data.frame(
  Fitted = TEAM_FIELDING_E_LM2$fitted.values,
  Resid = TEAM_FIELDING_E_LM2$residuals
)

ggplot(TEAM_FIELDING_E_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# SQRT: predictor is sqrt(TEAM_FIELDING_E + 1).

ggplot(TrainingData1, aes(x = sqrt(TEAM_FIELDING_E + 1))) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(TrainingData1, aes(x = sqrt(TEAM_FIELDING_E + 1), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Stored under the names TEAM_FIELDING_E_LM2 / TEAM_FIELDING_E_LM2_DF, which
# the log10 fit also uses; from here on they hold the square-root fit.
TEAM_FIELDING_E_LM2 <- lm(
  TARGET_WINS ~ sqrt(TEAM_FIELDING_E + 1),
  data = TrainingData1
)

print(paste("sqrt", summary(TEAM_FIELDING_E_LM2)[["r.squared"]]))

## [1] "sqrt 0.025828939700874"

TEAM_FIELDING_E_LM2_DF <- data.frame(
  Fitted = TEAM_FIELDING_E_LM2$fitted.values,
  Resid = TEAM_FIELDING_E_LM2$residuals
)

ggplot(TEAM_FIELDING_E_LM2_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

HitDiff

SQRT

# Not transformed: histogram of HitDiff (batting hits over pitching hits).
ggplot(TrainingData1, aes(x = HitDiff)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Wins against HitDiff with the default smoother and a straight-line fit.
ggplot(TrainingData1, aes(x = HitDiff, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

HitDiff_LM <- lm(TARGET_WINS ~ HitDiff, data = TrainingData1)

print(paste("Non Transformed", summary(HitDiff_LM)[["r.squared"]]))

## [1] "Non Transformed 0.00917764624341174"

# Residuals against fitted values.
HitDiff_LM_DF <- data.frame(
  Fitted = HitDiff_LM$fitted.values,
  Resid = HitDiff_LM$residuals
)

ggplot(HitDiff_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# SQRT: the same views with sqrt(HitDiff) as the predictor.
ggplot(TrainingData1, aes(x = sqrt(HitDiff))) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(TrainingData1, aes(x = sqrt(HitDiff), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Stored under the names HitDiff_LM / HitDiff_LM_DF, which the untransformed
# fit also uses; from here on they hold the square-root fit.
HitDiff_LM <- lm(TARGET_WINS ~ sqrt(HitDiff), data = TrainingData1)

print(paste("SQRT", summary(HitDiff_LM)[["r.squared"]]))

## [1] "SQRT 0.0140877071658045"

HitDiff_LM_DF <- data.frame(
  Fitted = HitDiff_LM$fitted.values,
  Resid = HitDiff_LM$residuals
)

ggplot(HitDiff_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

WalkDiff

SQRT

# Not transformed: distribution of WalkDiff, its relationship with wins,
# a single-predictor fit and the residual plot for that fit.
ggplot(TrainingData1, aes(x = WalkDiff)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1 rows containing non-finite values (stat_bin).

ggplot(TrainingData1, aes(x = WalkDiff, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

WalkDiff_LM <- lm(TARGET_WINS ~ WalkDiff, data = TrainingData1)

print(paste("Non Transformed", summary(WalkDiff_LM)$r.squared))

## [1] "Non Transformed 0.00699903660331598"

WalkDiff_LM_DF <- data.frame(
  Fitted = WalkDiff_LM$fitted.values,
  Resid = WalkDiff_LM$residuals
)

ggplot(WalkDiff_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# Square-root transform: the same four steps on sqrt(WalkDiff).
# WalkDiff_LM and WalkDiff_LM_DF are reused, so from here on they hold the
# square-root fit.
ggplot(TrainingData1, aes(x = sqrt(WalkDiff))) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1 rows containing non-finite values (stat_bin).

ggplot(TrainingData1, aes(x = sqrt(WalkDiff), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

WalkDiff_LM <- lm(TARGET_WINS ~ sqrt(WalkDiff), data = TrainingData1)

print(paste("SQRT", summary(WalkDiff_LM)$r.squared))

## [1] "SQRT 0.0105888883346195"

WalkDiff_LM_DF <- data.frame(
  Fitted = WalkDiff_LM$fitted.values,
  Resid = WalkDiff_LM$residuals
)

ggplot(WalkDiff_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

HRDiff

All bad

# Not transformed: distribution of HRDiff, its relationship with wins,
# a single-predictor fit and the residual plot for that fit.
ggplot(TrainingData1, aes(x = HRDiff)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 15 rows containing non-finite values (stat_bin).

ggplot(TrainingData1, aes(x = HRDiff, y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## Warning: Removed 15 rows containing non-finite values (stat_smooth).

## Warning: Removed 15 rows containing non-finite values (stat_smooth).

## Warning: Removed 15 rows containing missing values (geom_point).

HRDiff_LM <- lm(TARGET_WINS ~ HRDiff, data = TrainingData1)

print(paste("Non Transformed", summary(HRDiff_LM)$r.squared))

## [1] "Non Transformed 0.000180659522696911"

HRDiff_LM_DF <- data.frame(
  Fitted = HRDiff_LM$fitted.values,
  Resid = HRDiff_LM$residuals
)

ggplot(HRDiff_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

# Square-root transform: the same four steps on sqrt(HRDiff).
ggplot(TrainingData1, aes(x = sqrt(HRDiff))) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 15 rows containing non-finite values (stat_bin).

ggplot(TrainingData1, aes(x = sqrt(HRDiff), y = TARGET_WINS)) +
  geom_jitter() +
  geom_smooth(formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## Warning: Removed 15 rows containing non-finite values (stat_smooth).

## Warning: Removed 15 rows containing non-finite values (stat_smooth).

## Warning: Removed 15 rows containing missing values (geom_point).

# The square-root fit lives in HRDifff_LM (three f's), so HRDiff_LM above is
# not overwritten. The spelling is kept so existing references keep working.
HRDifff_LM <- lm(TARGET_WINS ~ sqrt(HRDiff), data = TrainingData1)

# Label this R-squared "SQRT": it belongs to the transformed model, and the
# previous "Non Transformed" label duplicated the one used for the raw fit.
print(paste("SQRT", summary(HRDifff_LM)$r.squared))

## [1] "SQRT 0.000371022205049381"

HRDifff_LM_DF <- data.frame(
  Fitted = HRDifff_LM$fitted.values,
  Resid = HRDifff_LM$residuals
)

ggplot(HRDifff_LM_DF, aes(x = Fitted, y = Resid)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x)

Models

LM2

# Model with all 14 predictors: log10(x + 1) on the home-run and pitching-walk
# counts, square roots on HitDiff and WalkDiff, remaining terms untransformed.
LM2 <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + TEAM_PITCHING_H +
    log10(TEAM_PITCHING_HR + 1) + log10(TEAM_PITCHING_BB + 1) +
    TEAM_FIELDING_E + SLG1 + sqrt(HitDiff) + sqrt(WalkDiff) + SLG2 + HRDiff,
  data = TrainingData2
)

summary(LM2)

## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + 
##     TEAM_PITCHING_H + log10(TEAM_PITCHING_HR + 1) + log10(TEAM_PITCHING_BB + 
##     1) + TEAM_FIELDING_E + SLG1 + sqrt(HitDiff) + sqrt(WalkDiff) + 
##     SLG2 + HRDiff, data = TrainingData2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.717  -8.575   0.268   8.673  64.196 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -85.811976  57.940923  -1.481 0.138739    
## TEAM_BATTING_H                0.125754   0.025498   4.932 8.73e-07 ***
## TEAM_BATTING_2B              -0.053140   0.010322  -5.148 2.86e-07 ***
## TEAM_BATTING_3B               0.066228   0.016262   4.073 4.81e-05 ***
## log10(TEAM_BATTING_HR + 1)    9.132120  19.464374   0.469 0.638993    
## TEAM_BATTING_BB               0.045808   0.007511   6.099 1.25e-09 ***
## TEAM_PITCHING_H               0.011714   0.003955   2.962 0.003087 ** 
## log10(TEAM_PITCHING_HR + 1) -24.413303  18.805386  -1.298 0.194348    
## log10(TEAM_PITCHING_BB + 1) -28.147916   7.328188  -3.841 0.000126 ***
## TEAM_FIELDING_E              -0.034177   0.003025 -11.299  < 2e-16 ***
## SLG1                        138.146757  29.089418   4.749 2.17e-06 ***
## sqrt(HitDiff)                92.104148  25.869677   3.560 0.000378 ***
## sqrt(WalkDiff)              -72.003716  30.649354  -2.349 0.018896 *  
## SLG2                         -0.057520   0.019150  -3.004 0.002696 ** 
## HRDiff                      -35.548299  13.661528  -2.602 0.009327 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.15 on 2261 degrees of freedom
## Multiple R-squared:  0.3078, Adjusted R-squared:  0.3035 
## F-statistic: 71.82 on 14 and 2261 DF,  p-value: < 2.2e-16

LM3

# Reduced model: the LM2 formula without log10(TEAM_PITCHING_HR + 1),
# sqrt(HitDiff), sqrt(WalkDiff) and HRDiff.
LM3 <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + TEAM_PITCHING_H +
    log10(TEAM_PITCHING_BB + 1) + TEAM_FIELDING_E + SLG1 + SLG2,
  data = TrainingData2
)

summary(LM3)

## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + 
##     TEAM_PITCHING_H + log10(TEAM_PITCHING_BB + 1) + TEAM_FIELDING_E + 
##     SLG1 + SLG2, data = TrainingData2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.284  -8.538   0.185   8.691  61.161 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -1.398e+02  3.703e+01  -3.776 0.000163 ***
## TEAM_BATTING_H               1.357e-01  2.496e-02   5.435 6.06e-08 ***
## TEAM_BATTING_2B             -5.196e-02  1.027e-02  -5.058 4.57e-07 ***
## TEAM_BATTING_3B              9.615e-02  1.457e-02   6.601 5.07e-11 ***
## log10(TEAM_BATTING_HR + 1)  -1.065e+01  2.149e+00  -4.957 7.67e-07 ***
## TEAM_BATTING_BB              3.357e-02  4.970e-03   6.755 1.82e-11 ***
## TEAM_PITCHING_H              1.364e-02  1.735e-03   7.862 5.82e-15 ***
## log10(TEAM_PITCHING_BB + 1) -1.515e+01  4.049e+00  -3.742 0.000187 ***
## TEAM_FIELDING_E             -3.739e-02  2.786e-03 -13.422  < 2e-16 ***
## SLG1                         1.401e+02  2.809e+01   4.987 6.59e-07 ***
## SLG2                        -6.709e-02  1.852e-02  -3.623 0.000298 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.25 on 2265 degrees of freedom
## Multiple R-squared:  0.2961, Adjusted R-squared:  0.293 
## F-statistic: 95.28 on 10 and 2265 DF,  p-value: < 2.2e-16

Genetic Alg

library(GA)

## Warning: package 'GA' was built under R version 3.6.3

## Loading required package: foreach

## Warning: package 'foreach' was built under R version 3.6.3

## Loading required package: iterators

## Warning: package 'iterators' was built under R version 3.6.3

## Package 'GA' version 3.2
## Type 'citation("GA")' for citing this R package in publications.

## 
## Attaching package: 'GA'

## The following object is masked from 'package:utils':
## 
##     de

# Fix the random seed so the genetic-algorithm search is reproducible.
set.seed(4167)

# Fitness function for the binary genetic algorithm.
#
# `string` is the candidate chromosome: one 0/1 entry per predictor column.
# Bit i multiplies column i + 1 of TrainingData2, so a 0 bit replaces that
# column with zeros before the model is fitted.
# (Assumes columns 2 to 15 of TrainingData2 are the 14 predictors, in an
# order matching the bits -- TODO confirm against where TrainingData2 is built.)
#
# Returns the Multiple R-squared of the LM2-style model fitted to the masked
# data.
#
# NOTE(review): plain R-squared cannot decrease when a predictor is added, so
# this fitness favours keeping every column; the best fitness found
# (0.3078181) matches LM2's Multiple R-squared (0.3078). Consider
# summary(LMX)$adj.r.squared if the goal is variable selection.
LMFunction <- function(string) {
  # Work on a local copy only: lm() reads it via `data =`, so nothing needs to
  # be written to the global environment.
  TrainingDataX <- TrainingData2
  for (i in seq_along(string)) {
    TrainingDataX[, i + 1] <- TrainingData2[, i + 1] * string[i]
  }

  LMX <- lm(
    TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
      log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + TEAM_PITCHING_H +
      log10(TEAM_PITCHING_HR + 1) + log10(TEAM_PITCHING_BB + 1) +
      TEAM_FIELDING_E + SLG1 + sqrt(HitDiff) + sqrt(WalkDiff) + SLG2 + HRDiff,
    data = TrainingDataX
  )

  summary(LMX)$r.squared
}


# Search over 14-bit masks and report the best fitness value found.
ga1 <- ga(type = "binary", fitness = LMFunction, nBits = 14)
ga1@fitnessValue

## [1] 0.3078181

ANOVA Tests

# LM1 (raw predictors) against LM3 (reduced, log-transformed). LM3 uses
# log10(TEAM_BATTING_HR + 1) where LM1 uses TEAM_BATTING_HR, so the formulas
# are not nested; the table reports the RSS change without an F statistic and
# should not be read as a nested-model test.
anova(LM1, LM3)

## Analysis of Variance Table
## 
## Model 1: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
##     TEAM_PITCHING_BB + TEAM_FIELDING_E + SLG1 + HitDiff + WalkDiff + 
##     SLG2 + HRDiff
## Model 2: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + TEAM_PITCHING_H + 
##     log10(TEAM_PITCHING_BB + 1) + TEAM_FIELDING_E + SLG1 + SLG2
##   Res.Df    RSS Df Sum of Sq F Pr(>F)
## 1   2262 400098                      
## 2   2265 397350 -3    2748.4

# LM2 against LM3. LM3 keeps a subset of LM2's terms (four are dropped), so
# this is a nested-model F test of whether those four terms matter jointly.
anova(LM2, LM3)

## Analysis of Variance Table
## 
## Model 1: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + TEAM_PITCHING_H + 
##     log10(TEAM_PITCHING_HR + 1) + log10(TEAM_PITCHING_BB + 1) + 
##     TEAM_FIELDING_E + SLG1 + sqrt(HitDiff) + sqrt(WalkDiff) + 
##     SLG2 + HRDiff
## Model 2: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + TEAM_PITCHING_H + 
##     log10(TEAM_PITCHING_BB + 1) + TEAM_FIELDING_E + SLG1 + SLG2
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   2261 390734                                  
## 2   2265 397350 -4   -6615.7 9.5706 1.132e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# LM1 against LM2. Both use the same 14 variables; LM2 applies log10/sqrt
# transforms to some of them. NOTE(review): the formulas differ by
# transformation rather than by added terms, so interpret this F test with
# caution.
anova(LM1, LM2)

## Analysis of Variance Table
## 
## Model 1: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
##     TEAM_PITCHING_BB + TEAM_FIELDING_E + SLG1 + HitDiff + WalkDiff + 
##     SLG2 + HRDiff
## Model 2: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     log10(TEAM_BATTING_HR + 1) + TEAM_BATTING_BB + TEAM_PITCHING_H + 
##     log10(TEAM_PITCHING_HR + 1) + log10(TEAM_PITCHING_BB + 1) + 
##     TEAM_FIELDING_E + SLG1 + sqrt(HitDiff) + sqrt(WalkDiff) + 
##     SLG2 + HRDiff
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)    
## 1   2262 400098                                 
## 2   2261 390734  1    9364.2 54.186 2.54e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

HW1Prep

Christopher Bloome

2/28/2021

Import/Clean

Import Data / View Summaries

Remove incomplete fields

Add Calculated Fields

First Model

Explore Transformations

TEAM_PITCHING_H

SLG1

SLG2

Team_Batting_H

Team_Batting_2B

Team_Batting_3B

Team_Batting_HR

Team_Batting_BB

Team_Batting_H

Team_Pitching_HR

Team_Pitching_BB

TEAM_FIELDING_E

HitDiff

WalkDiff

HRDiff

Models

LM2

LM3

Genetic Alg

ANOVA Tests