library(corrplot)
library(car)
library(ppcor)
## Loading required package: MASS
# Read the training and evaluation sets directly from the course repository.
train <- read.csv(
  file = "https://raw.githubusercontent.com/scottogden10/621_hw/master/moneyball-training-data.csv"
)
test <- read.csv(
  file = "https://raw.githubusercontent.com/scottogden10/621_hw/master/moneyball-evaluation-data.csv"
)

## Plot histograms with scatterplots to get a view of all the variables

# Distribution of the response.
hist(train$TARGET_WINS, xlab = "Wins", ylab = "Probability", main = "Target",
     probability = TRUE)

## Look into independent variables

#' Scatterplot of wins against one predictor, with a marginal histogram on top.
#'
#' @param values Numeric predictor column; may contain NA.
#' @param label X-axis label for the scatterplot.
#' @param colour Point colour passed to plot().
#' @param log_scale If TRUE, log(values) is plotted. The annotated mean and SD
#'   are still computed on the untransformed values.
#' @param wins Response vector plotted on the y axis.
#' @return NULL, invisibly; called for its plotting side effect.
plot_predictor_panel <- function(values, label, colour, log_scale = FALSE,
                                 wins = train$TARGET_WINS) {
  shown <- if (log_scale) log(values) else values

  # Setting `fig` without new = TRUE starts a fresh page, so the scatter is
  # not drawn on top of whatever plot came before. The previous figure region
  # is restored on exit so later plots are not confined to the top strip.
  old_par <- par(fig = c(0, 0.8, 0, 0.8))
  on.exit(par(old_par), add = TRUE)
  plot(x = shown, y = wins, xlab = label, ylab = "Wins", col = colour)

  # new = TRUE here is intentional: the histogram shares the page with the
  # scatter drawn just above.
  par(fig = c(0, 0.8, 0.45, 1), new = TRUE)
  hist(shown, axes = FALSE, xlab = "", ylab = "", labels = FALSE, main = NULL)

  # na.rm = TRUE everywhere: identical to a plain mean()/sd() for complete
  # columns, and avoids printing "NA" for columns with missing values.
  mtext(paste("Mean =", round(mean(values, na.rm = TRUE), 1),
              "SD=", round(sd(values, na.rm = TRUE), 1)),
        outer = TRUE, line = -3)
  mtext(paste("Missing Values = ", sum(is.na(values))),
        outer = TRUE, line = -4)

  invisible(NULL)
}

# Batting
plot_predictor_panel(train$TEAM_BATTING_H, "Team Batting H", "dark red")
plot_predictor_panel(train$TEAM_BATTING_2B, "Team Batting 2B", "dark blue")
plot_predictor_panel(train$TEAM_BATTING_3B, "Team Batting 3B", "dark green",
                     log_scale = TRUE)
plot_predictor_panel(train$TEAM_BATTING_HR, "Team Batting HR", "brown")
plot_predictor_panel(train$TEAM_BATTING_BB, "Team Batting BB", "dark grey")
plot_predictor_panel(train$TEAM_BATTING_SO, "Team Batting SO", "purple")

# Base running
plot_predictor_panel(train$TEAM_BASERUN_SB, "Team Baserun SB", "green",
                     log_scale = TRUE)
plot_predictor_panel(train$TEAM_BASERUN_CS, "Team Baserun CS", "hot pink")

plot_predictor_panel(train$TEAM_BATTING_HBP, "Team Batting HBP", "magenta")

# Pitching
plot_predictor_panel(train$TEAM_PITCHING_H, "Team PITCHING_H", "black")
plot_predictor_panel(train$TEAM_PITCHING_HR, "Team PITCHING_HR", "red")
plot_predictor_panel(train$TEAM_PITCHING_BB, "Team PITCHING_BB", "green")
plot_predictor_panel(train$TEAM_PITCHING_SO, "Team PITCHING_SO", "maroon")

# Fielding
plot_predictor_panel(train$TEAM_FIELDING_DP, "Team FIELDING_DP", "dark blue")

#### Correlation plot

# Pairwise correlations of every column except the first one; each pair uses
# the rows where both columns are observed.
mat <- as.matrix(
  cor(x = train[-1], use = "pairwise.complete.obs")
)
corrplot(corr = mat, tl.cex = 0.5)

### Impute missing values by generating random data with noise based on each
### column's mean and SD.
# NOTE(review): no set.seed() call is visible in this section, so the imputed
# values (and everything fitted on them) differ from run to run.

train1 <- train

### Transform data

## Batting H, Batting 2B, Batting BB: leave as is.
## (The original notes listed "Batting H" twice; the second entry presumably
## meant Batting HR -- confirm.)

# Batting 3B: log-transform.
# Zero counts are replaced by 8 first so that log(0) = -Inf never enters the
# column. which() keeps the mask safe should the column ever contain NA.
train1$TEAM_BATTING_3B[which(train1$TEAM_BATTING_3B == 0)] <- 8
train1$TEAM_BATTING_3B <- log(train1$TEAM_BATTING_3B)
# Rename by name rather than by position so a column reorder cannot hit the
# wrong variable.
names(train1)[names(train1) == "TEAM_BATTING_3B"] <- "log(Bat_3B)"

## Baserun CS: introduce randomness when imputing.
# One independent draw per missing cell. With n = 1 a single draw would be
# recycled into every missing row, i.e. constant imputation.
brcs <- train$TEAM_BASERUN_CS
brcs[is.na(brcs)] <- rnorm(
  n = sum(is.na(brcs)),
  mean = mean(brcs, na.rm = TRUE),
  sd = sd(brcs, na.rm = TRUE)
)
train1$TEAM_BASERUN_CS <- brcs
## Flag (1/0) recording whether Baserun CS was missing in the raw data.
y <- as.integer(is.na(train$TEAM_BASERUN_CS))
train1$BRCS_FLAG <- y

## Baserun SB: log-transform, then impute with randomness.
brsb <- train$TEAM_BASERUN_SB
# Zeros become 15 before the log; which() skips the NA entries explicitly.
brsb[which(brsb == 0)] <- 15
brsb <- log(brsb)
# Draws are made on the log scale, from the mean/SD of the logged values.
brsb[is.na(brsb)] <- rnorm(
  n = sum(is.na(brsb)),
  mean = mean(brsb, na.rm = TRUE),
  sd = sd(brsb, na.rm = TRUE)
)
train1$TEAM_BASERUN_SB <- brsb
names(train1)[names(train1) == "TEAM_BASERUN_SB"] <- "log(BR_SB)"

## Batting SO: impute with randomness.
btso <- train$TEAM_BATTING_SO
btso[is.na(btso)] <- rnorm(
  n = sum(is.na(btso)),
  mean = mean(btso, na.rm = TRUE),
  sd = sd(btso, na.rm = TRUE)
)
train1$TEAM_BATTING_SO <- btso


##Batting HBP  Not a useful variable because:

# Pearson correlation between wins and HBP, using only rows where both are
# observed.
cor(train1$TARGET_WINS,train1$TEAM_BATTING_HBP,use="complete.obs")
## [1] 0.07350424
# Significance test for the same correlation. cor.test() has no `use`
# argument (it would be swallowed by `...`); it drops incomplete pairs itself.
cor.test(train1$TARGET_WINS, train1$TEAM_BATTING_HBP)
## 
##  Pearson's product-moment correlation
## 
## data:  train1$TARGET_WINS and train1$TEAM_BATTING_HBP
## t = 1.0133, df = 189, p-value = 0.3122
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06919725  0.21325801
## sample estimates:
##        cor 
## 0.07350424
    ##Not At All significant
      #Now let's check Whether null or not is different
# Compare the distribution of wins for rows with and without an HBP value:
# two stacked histograms, then a Welch two-sample t-test on the means.
old_par <- par(mfrow = c(2, 1))
hist(train1$TARGET_WINS[!is.na(train1$TEAM_BATTING_HBP)], xlab = "",
     main = "Not Null")
hist(train1$TARGET_WINS[is.na(train1$TEAM_BATTING_HBP)], xlab = "",
     main = "Null")
# Restore the previous layout so the 2 x 1 grid does not leak into later plots.
par(old_par)
t.test(x = train1$TARGET_WINS[!is.na(train1$TEAM_BATTING_HBP)],
       y = train1$TARGET_WINS[is.na(train1$TEAM_BATTING_HBP)])
## 
##  Welch Two Sample t-test
## 
## data:  train1$TARGET_WINS[!is.na(train1$TEAM_BATTING_HBP)] and train1$TARGET_WINS[is.na(train1$TEAM_BATTING_HBP)]
## t = 0.15701, df = 255.37, p-value = 0.8754
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.711574  2.008143
## sample estimates:
## mean of x mean of y 
##  80.92670  80.77842
 # Batting HBP: fill each missing cell with its own normal draw based on the
 # observed mean and SD. With n = 1 a single draw would be recycled into every
 # missing row, i.e. constant imputation.
 hbp <- train1$TEAM_BATTING_HBP
 hbp[is.na(hbp)] <- rnorm(
   n = sum(is.na(hbp)),
   mean = mean(hbp, na.rm = TRUE),
   sd = sd(hbp, na.rm = TRUE)
 )
 train1$TEAM_BATTING_HBP <- hbp

 ### Fielding DP, impute (same scheme, read from the raw `train` column)
 fdp <- train$TEAM_FIELDING_DP
 fdp[is.na(fdp)] <- rnorm(
   n = sum(is.na(fdp)),
   mean = mean(fdp, na.rm = TRUE),
   sd = sd(fdp, na.rm = TRUE)
 )
 train1$TEAM_FIELDING_DP <- fdp
 
 
 ### Pitching BB: cap high outliers at mean + 3 standard deviations
 # (the threshold is computed from mean and SD, not from a percentile).
 bb <- train1[["TEAM_PITCHING_BB"]]
 outliers <- 3 * sd(bb) + mean(bb)
 # Every value above the threshold is pulled down to the threshold itself.
 bb[which(bb > outliers)] <- outliers
 train1[["TEAM_PITCHING_BB"]] <- bb
 
 ### Fielding E
 # Correlation between fielding errors and wins.
 with(train1, cor(TARGET_WINS, TEAM_FIELDING_E))
## [1] -0.1764848
 ## Keep this variable the way it is.
 
 ## PITCHING H
 # Estimate a power-transformation exponent with car::powerTransform() and
 # store the raw power x^lambda as a new column.
 # NOTE(review): this is the unscaled power, not (x^lambda - 1) / lambda, so
 # the column can be very small in magnitude.
 lambda <- powerTransform(train1$TEAM_PITCHING_H)[["lambda"]]
 train1[["BoxCoxPitch_H"]] <- train1[["TEAM_PITCHING_H"]]^lambda
 ## Also try a binary indicator with the separation level at 1300 pitching hits.
 train1[["pitch_h_bin"]] <- as.integer(train1[["TEAM_PITCHING_H"]] > 1300)
 
# Wins against the power-transformed TEAM_PITCHING_H, with a marginal
# histogram of the transformed values on top.
# Reset any multi-panel layout still active from earlier plots, then set `fig`
# without new = TRUE so the scatter starts on a clean page instead of being
# drawn over the previous figure.
par(mfrow = c(1, 1))
par(fig = c(0, 0.8, 0, 0.8))
plot(y = train$TARGET_WINS, x = train$TEAM_PITCHING_H^lambda,
     xlab = "Team PITCHING_H", ylab = "Wins", col = "black")
# new = TRUE is intentional here: the histogram shares the page with the
# scatter drawn just above.
par(fig = c(0, 0.8, 0.45, 1), new = TRUE)
hist(train$TEAM_PITCHING_H^lambda, axes = FALSE, xlab = "", ylab = "",
     labels = FALSE, main = NULL)
# The annotated mean/SD are those of the untransformed column.
mtext(paste("Mean =", round(mean(train$TEAM_PITCHING_H, na.rm = TRUE), 1),
            "SD=", round(sd(train$TEAM_PITCHING_H, na.rm = TRUE), 1)),
      outer = TRUE, line = -3)
mtext(paste("Missing Values = ", sum(is.na(train$TEAM_PITCHING_H))),
      outer = TRUE, line = -4)
# Give later plots the full device again.
par(fig = c(0, 1, 0, 1))

 ## Pitching SO: replace missing values. The outlier handling below is
 ## commented out and does not run.
 #pp<-train1$TEAM_PITCHING_SO
 #outliersp<-mean(pp,na.rm = TRUE)+3*sd(pp,na.rm=TRUE)
 #pp[pp>out]<-10000
 #train1$TEAM_PITCHING_SO<-pp

 # One independent normal draw per missing cell, based on the observed mean
 # and SD. With n = 1 a single draw would be recycled into every missing row.
 pso <- train$TEAM_PITCHING_SO
 pso[is.na(pso)] <- rnorm(
   n = sum(is.na(pso)),
   mean = mean(pso, na.rm = TRUE),
   sd = sd(pso, na.rm = TRUE)
 )
 train1$TEAM_PITCHING_SO <- pso

  
 
 ## Building models

 # Show the first row of the prepared data as a check on the column names.
 head(x = train1, n = 1)
##   INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B log(Bat_3B)
## 1     1          39           1445             194    3.663562
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO log(BR_SB)
## 1              13             143             842   3.741591
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1        49.85987         74.08955            9364               84
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              927             5456            1011         145.6695
##   BRCS_FLAG BoxCoxPitch_H pitch_h_bin
## 1         1  4.652053e-13           1
 # Full model on all prepared predictors.
 # Terms are bare column names resolved through `data = train1`. Writing them
 # as `train1$col` bypasses `data`, so predict(model1, newdata = ...) would
 # keep reading the training columns instead of the new data.
 # Estimates are unchanged by this; only the term labels lose the "train1$"
 # prefix relative to output recorded with the prefixed formula.
 model1 <- lm(
   TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + `log(Bat_3B)` +
     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + `log(BR_SB)` +
     TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_HR +
     TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E +
     TEAM_FIELDING_DP + as.factor(BRCS_FLAG) + BoxCoxPitch_H +
     as.factor(pitch_h_bin),
   data = train1
 )
 summary(model1)
## 
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B + 
##     train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB + 
##     train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_BASERUN_CS + 
##     train1$TEAM_BATTING_HBP + train1$TEAM_PITCHING_HR + train1$TEAM_PITCHING_BB + 
##     train1$TEAM_PITCHING_SO + train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP + 
##     as.factor(train1$BRCS_FLAG) + train1$BoxCoxPitch_H + as.factor(train1$pitch_h_bin), 
##     data = train1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.883  -8.511   0.189   8.785  55.047 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -3.317e+00  8.889e+00  -0.373 0.709051    
## train1$TEAM_BATTING_H           4.867e-02  3.903e-03  12.469  < 2e-16 ***
## train1$TEAM_BATTING_2B         -2.338e-02  9.597e-03  -2.436 0.014936 *  
## train1$"log(Bat_3B)"            3.283e+00  9.274e-01   3.540 0.000408 ***
## train1$TEAM_BATTING_HR          3.827e-02  3.099e-02   1.235 0.216994    
## train1$TEAM_BATTING_BB          3.360e-02  8.106e-03   4.145 3.52e-05 ***
## train1$TEAM_BATTING_SO         -4.377e-03  2.480e-03  -1.765 0.077688 .  
## train1$"log(BR_SB)"             1.250e+00  5.423e-01   2.306 0.021223 *  
## train1$TEAM_BASERUN_CS          1.781e-02  1.766e-02   1.009 0.313222    
## train1$TEAM_BATTING_HBP         1.472e-01  5.394e-02   2.728 0.006413 ** 
## train1$TEAM_PITCHING_HR         2.944e-02  2.747e-02   1.072 0.284025    
## train1$TEAM_PITCHING_BB        -1.573e-02  6.686e-03  -2.353 0.018726 *  
## train1$TEAM_PITCHING_SO         2.765e-03  6.985e-04   3.958 7.80e-05 ***
## train1$TEAM_FIELDING_E         -1.810e-02  2.479e-03  -7.299 4.00e-13 ***
## train1$TEAM_FIELDING_DP        -1.183e-01  1.344e-02  -8.800  < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1    3.527e+00  9.645e-01   3.657 0.000262 ***
## train1$BoxCoxPitch_H           -2.286e+10  1.236e+10  -1.849 0.064517 .  
## as.factor(train1$pitch_h_bin)1 -2.771e+00  1.700e+00  -1.631 0.103105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.1 on 2258 degrees of freedom
## Multiple R-squared:  0.3136, Adjusted R-squared:  0.3084 
## F-statistic: 60.68 on 17 and 2258 DF,  p-value: < 2.2e-16
 ##Use backward selection to build models, converging on the maximum adjusted R-squared
 
 # Starting point of the backward selection: raw TEAM_PITCHING_H is not a
 # term; it is represented only by BoxCoxPitch_H and pitch_h_bin. The set of
 # predictors is the same as in model1.
 summary(
   lm(
     formula = TARGET_WINS ~
       train1$TEAM_BATTING_H +
       train1$TEAM_BATTING_2B +
       train1$'log(Bat_3B)' +
       train1$TEAM_BATTING_HR +
       train1$TEAM_BATTING_BB +
       train1$TEAM_BATTING_SO +
       train1$'log(BR_SB)' +
       train1$TEAM_BASERUN_CS +
       train1$TEAM_BATTING_HBP +
       train1$TEAM_PITCHING_HR +
       train1$TEAM_PITCHING_BB +
       train1$TEAM_PITCHING_SO +
       train1$TEAM_FIELDING_E +
       train1$TEAM_FIELDING_DP +
       as.factor(train1$BRCS_FLAG) +
       train1$BoxCoxPitch_H +
       as.factor(train1$pitch_h_bin),
     data = train1
   )
 )
## 
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B + 
##     train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB + 
##     train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_BASERUN_CS + 
##     train1$TEAM_BATTING_HBP + train1$TEAM_PITCHING_HR + train1$TEAM_PITCHING_BB + 
##     train1$TEAM_PITCHING_SO + train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP + 
##     as.factor(train1$BRCS_FLAG) + train1$BoxCoxPitch_H + as.factor(train1$pitch_h_bin), 
##     data = train1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.883  -8.511   0.189   8.785  55.047 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -3.317e+00  8.889e+00  -0.373 0.709051    
## train1$TEAM_BATTING_H           4.867e-02  3.903e-03  12.469  < 2e-16 ***
## train1$TEAM_BATTING_2B         -2.338e-02  9.597e-03  -2.436 0.014936 *  
## train1$"log(Bat_3B)"            3.283e+00  9.274e-01   3.540 0.000408 ***
## train1$TEAM_BATTING_HR          3.827e-02  3.099e-02   1.235 0.216994    
## train1$TEAM_BATTING_BB          3.360e-02  8.106e-03   4.145 3.52e-05 ***
## train1$TEAM_BATTING_SO         -4.377e-03  2.480e-03  -1.765 0.077688 .  
## train1$"log(BR_SB)"             1.250e+00  5.423e-01   2.306 0.021223 *  
## train1$TEAM_BASERUN_CS          1.781e-02  1.766e-02   1.009 0.313222    
## train1$TEAM_BATTING_HBP         1.472e-01  5.394e-02   2.728 0.006413 ** 
## train1$TEAM_PITCHING_HR         2.944e-02  2.747e-02   1.072 0.284025    
## train1$TEAM_PITCHING_BB        -1.573e-02  6.686e-03  -2.353 0.018726 *  
## train1$TEAM_PITCHING_SO         2.765e-03  6.985e-04   3.958 7.80e-05 ***
## train1$TEAM_FIELDING_E         -1.810e-02  2.479e-03  -7.299 4.00e-13 ***
## train1$TEAM_FIELDING_DP        -1.183e-01  1.344e-02  -8.800  < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1    3.527e+00  9.645e-01   3.657 0.000262 ***
## train1$BoxCoxPitch_H           -2.286e+10  1.236e+10  -1.849 0.064517 .  
## as.factor(train1$pitch_h_bin)1 -2.771e+00  1.700e+00  -1.631 0.103105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.1 on 2258 degrees of freedom
## Multiple R-squared:  0.3136, Adjusted R-squared:  0.3084 
## F-statistic: 60.68 on 17 and 2258 DF,  p-value: < 2.2e-16
# Backward-selection step: refit without TEAM_BASERUN_CS.
summary(
  lm(
    formula = TARGET_WINS ~
      train1$TEAM_BATTING_H +
      train1$TEAM_BATTING_2B +
      train1$'log(Bat_3B)' +
      train1$TEAM_BATTING_HR +
      train1$TEAM_BATTING_BB +
      train1$TEAM_BATTING_SO +
      train1$'log(BR_SB)' +
      train1$TEAM_BATTING_HBP +
      train1$TEAM_PITCHING_HR +
      train1$TEAM_PITCHING_BB +
      train1$TEAM_PITCHING_SO +
      train1$TEAM_FIELDING_E +
      train1$TEAM_FIELDING_DP +
      as.factor(train1$BRCS_FLAG) +
      train1$BoxCoxPitch_H +
      as.factor(train1$pitch_h_bin),
    data = train1
  )
)
## 
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B + 
##     train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB + 
##     train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_BATTING_HBP + 
##     train1$TEAM_PITCHING_HR + train1$TEAM_PITCHING_BB + train1$TEAM_PITCHING_SO + 
##     train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP + as.factor(train1$BRCS_FLAG) + 
##     train1$BoxCoxPitch_H + as.factor(train1$pitch_h_bin), data = train1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.982  -8.488   0.185   8.753  55.256 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -2.976e+00  8.882e+00  -0.335 0.737610    
## train1$TEAM_BATTING_H           4.853e-02  3.901e-03  12.440  < 2e-16 ***
## train1$TEAM_BATTING_2B         -2.289e-02  9.585e-03  -2.388 0.017011 *  
## train1$"log(Bat_3B)"            3.322e+00  9.266e-01   3.585 0.000344 ***
## train1$TEAM_BATTING_HR          3.605e-02  3.091e-02   1.166 0.243644    
## train1$TEAM_BATTING_BB          3.324e-02  8.098e-03   4.105 4.19e-05 ***
## train1$TEAM_BATTING_SO         -4.459e-03  2.479e-03  -1.799 0.072140 .  
## train1$"log(BR_SB)"             1.450e+00  5.048e-01   2.872 0.004114 ** 
## train1$TEAM_BATTING_HBP         1.525e-01  5.368e-02   2.842 0.004524 ** 
## train1$TEAM_PITCHING_HR         2.952e-02  2.747e-02   1.075 0.282677    
## train1$TEAM_PITCHING_BB        -1.583e-02  6.685e-03  -2.368 0.017986 *  
## train1$TEAM_PITCHING_SO         2.771e-03  6.985e-04   3.967 7.49e-05 ***
## train1$TEAM_FIELDING_E         -1.839e-02  2.462e-03  -7.468 1.16e-13 ***
## train1$TEAM_FIELDING_DP        -1.185e-01  1.344e-02  -8.817  < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1    3.180e+00  9.012e-01   3.529 0.000426 ***
## train1$BoxCoxPitch_H           -2.347e+10  1.234e+10  -1.902 0.057347 .  
## as.factor(train1$pitch_h_bin)1 -2.750e+00  1.699e+00  -1.618 0.105760    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.1 on 2259 degrees of freedom
## Multiple R-squared:  0.3133, Adjusted R-squared:  0.3084 
## F-statistic: 64.41 on 16 and 2259 DF,  p-value: < 2.2e-16
 # Backward-selection step: refit without TEAM_BATTING_HBP
 # (TEAM_BASERUN_CS is also absent from this formula).
 summary(
   lm(
     formula = TARGET_WINS ~
       train1$TEAM_BATTING_H +
       train1$TEAM_BATTING_2B +
       train1$'log(Bat_3B)' +
       train1$TEAM_BATTING_HR +
       train1$TEAM_BATTING_BB +
       train1$TEAM_BATTING_SO +
       train1$'log(BR_SB)' +
       train1$TEAM_PITCHING_HR +
       train1$TEAM_PITCHING_BB +
       train1$TEAM_PITCHING_SO +
       train1$TEAM_FIELDING_E +
       train1$TEAM_FIELDING_DP +
       as.factor(train1$BRCS_FLAG) +
       train1$BoxCoxPitch_H +
       as.factor(train1$pitch_h_bin),
     data = train1
   )
 )
## 
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B + 
##     train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB + 
##     train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_PITCHING_HR + 
##     train1$TEAM_PITCHING_BB + train1$TEAM_PITCHING_SO + train1$TEAM_FIELDING_E + 
##     train1$TEAM_FIELDING_DP + as.factor(train1$BRCS_FLAG) + train1$BoxCoxPitch_H + 
##     as.factor(train1$pitch_h_bin), data = train1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.330  -8.547   0.231   8.733  56.072 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     9.531e+00  7.728e+00   1.233 0.217581    
## train1$TEAM_BATTING_H           4.889e-02  3.905e-03  12.521  < 2e-16 ***
## train1$TEAM_BATTING_2B         -2.785e-02  9.440e-03  -2.950 0.003212 ** 
## train1$"log(Bat_3B)"            3.336e+00  9.281e-01   3.594 0.000332 ***
## train1$TEAM_BATTING_HR          2.676e-02  3.079e-02   0.869 0.384905    
## train1$TEAM_BATTING_BB          3.468e-02  8.095e-03   4.284 1.91e-05 ***
## train1$TEAM_BATTING_SO         -5.064e-03  2.473e-03  -2.047 0.040750 *  
## train1$"log(BR_SB)"             1.385e+00  5.051e-01   2.743 0.006137 ** 
## train1$TEAM_PITCHING_HR         3.605e-02  2.742e-02   1.315 0.188735    
## train1$TEAM_PITCHING_BB        -1.641e-02  6.692e-03  -2.452 0.014292 *  
## train1$TEAM_PITCHING_SO         2.850e-03  6.990e-04   4.077 4.71e-05 ***
## train1$TEAM_FIELDING_E         -1.864e-02  2.465e-03  -7.561 5.77e-14 ***
## train1$TEAM_FIELDING_DP        -1.179e-01  1.346e-02  -8.765  < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1    3.048e+00  9.015e-01   3.381 0.000734 ***
## train1$BoxCoxPitch_H           -2.433e+10  1.236e+10  -1.969 0.049101 *  
## as.factor(train1$pitch_h_bin)1 -2.906e+00  1.701e+00  -1.708 0.087717 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.12 on 2260 degrees of freedom
## Multiple R-squared:  0.3108, Adjusted R-squared:  0.3063 
## F-statistic: 67.95 on 15 and 2260 DF,  p-value: < 2.2e-16
 # Backward-selection step: refit without TEAM_PITCHING_HR.
 summary(
   lm(
     formula = TARGET_WINS ~
       train1$TEAM_BATTING_H +
       train1$TEAM_BATTING_2B +
       train1$'log(Bat_3B)' +
       train1$TEAM_BATTING_HR +
       train1$TEAM_BATTING_BB +
       train1$TEAM_BATTING_SO +
       train1$'log(BR_SB)' +
       train1$TEAM_PITCHING_BB +
       train1$TEAM_PITCHING_SO +
       train1$TEAM_FIELDING_E +
       train1$TEAM_FIELDING_DP +
       as.factor(train1$BRCS_FLAG) +
       train1$BoxCoxPitch_H +
       as.factor(train1$pitch_h_bin),
     data = train1
   )
 )
## 
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B + 
##     train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB + 
##     train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_PITCHING_BB + 
##     train1$TEAM_PITCHING_SO + train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP + 
##     as.factor(train1$BRCS_FLAG) + train1$BoxCoxPitch_H + as.factor(train1$pitch_h_bin), 
##     data = train1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.178  -8.576   0.237   8.700  56.502 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     9.528e+00  7.729e+00   1.233 0.217804    
## train1$TEAM_BATTING_H           4.904e-02  3.904e-03  12.562  < 2e-16 ***
## train1$TEAM_BATTING_2B         -2.836e-02  9.433e-03  -3.006 0.002672 ** 
## train1$"log(Bat_3B)"            3.457e+00  9.236e-01   3.743 0.000186 ***
## train1$TEAM_BATTING_HR          6.496e-02  1.018e-02   6.379 2.15e-10 ***
## train1$TEAM_BATTING_BB          2.882e-02  6.757e-03   4.265 2.08e-05 ***
## train1$TEAM_BATTING_SO         -4.836e-03  2.468e-03  -1.960 0.050172 .  
## train1$"log(BR_SB)"             1.404e+00  5.050e-01   2.781 0.005460 ** 
## train1$TEAM_PITCHING_BB        -1.115e-02  5.368e-03  -2.078 0.037864 *  
## train1$TEAM_PITCHING_SO         2.647e-03  6.819e-04   3.882 0.000107 ***
## train1$TEAM_FIELDING_E         -1.935e-02  2.404e-03  -8.050 1.32e-15 ***
## train1$TEAM_FIELDING_DP        -1.180e-01  1.346e-02  -8.767  < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1    3.024e+00  9.014e-01   3.354 0.000809 ***
## train1$BoxCoxPitch_H           -2.601e+10  1.229e+10  -2.116 0.034465 *  
## as.factor(train1$pitch_h_bin)1 -3.091e+00  1.696e+00  -1.823 0.068491 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.12 on 2261 degrees of freedom
## Multiple R-squared:  0.3103, Adjusted R-squared:  0.306 
## F-statistic: 72.66 on 14 and 2261 DF,  p-value: < 2.2e-16
 # Backward-selection step: refit without TEAM_PITCHING_BB
 # (BoxCoxPitch_H is still a term here).
 summary(
   lm(
     formula = TARGET_WINS ~
       train1$TEAM_BATTING_H +
       train1$TEAM_BATTING_2B +
       train1$'log(Bat_3B)' +
       train1$TEAM_BATTING_HR +
       train1$TEAM_BATTING_BB +
       train1$TEAM_BATTING_SO +
       train1$'log(BR_SB)' +
       train1$TEAM_PITCHING_SO +
       train1$TEAM_FIELDING_E +
       train1$TEAM_FIELDING_DP +
       as.factor(train1$BRCS_FLAG) +
       train1$BoxCoxPitch_H +
       as.factor(train1$pitch_h_bin),
     data = train1
   )
 )
## 
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B + 
##     train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB + 
##     train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_PITCHING_SO + 
##     train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP + as.factor(train1$BRCS_FLAG) + 
##     train1$BoxCoxPitch_H + as.factor(train1$pitch_h_bin), data = train1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.947  -8.469   0.198   8.696  57.301 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     6.688e+00  7.613e+00   0.879 0.379729    
## train1$TEAM_BATTING_H           4.935e-02  3.903e-03  12.643  < 2e-16 ***
## train1$TEAM_BATTING_2B         -2.676e-02  9.408e-03  -2.844 0.004496 ** 
## train1$"log(Bat_3B)"            3.664e+00  9.189e-01   3.987 6.89e-05 ***
## train1$TEAM_BATTING_HR          6.642e-02  1.017e-02   6.534 7.87e-11 ***
## train1$TEAM_BATTING_BB          1.660e-02  3.325e-03   4.991 6.46e-07 ***
## train1$TEAM_BATTING_SO         -4.502e-03  2.464e-03  -1.827 0.067826 .  
## train1$"log(BR_SB)"             1.338e+00  5.043e-01   2.652 0.008048 ** 
## train1$TEAM_PITCHING_SO         2.062e-03  6.214e-04   3.318 0.000921 ***
## train1$TEAM_FIELDING_E         -2.125e-02  2.225e-03  -9.551  < 2e-16 ***
## train1$TEAM_FIELDING_DP        -1.177e-01  1.347e-02  -8.742  < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1    3.241e+00  8.960e-01   3.617 0.000305 ***
## train1$BoxCoxPitch_H           -1.531e+10  1.117e+10  -1.370 0.170667    
## as.factor(train1$pitch_h_bin)1 -2.472e+00  1.671e+00  -1.480 0.139025    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.13 on 2262 degrees of freedom
## Multiple R-squared:  0.309,  Adjusted R-squared:  0.305 
## F-statistic:  77.8 on 13 and 2262 DF,  p-value: < 2.2e-16
 ## Alternative step: refit without BoxCoxPitch_H.
 ## Note that TEAM_PITCHING_BB is a term in this formula.
 summary(
   lm(
     formula = TARGET_WINS ~
       train1$TEAM_BATTING_H +
       train1$TEAM_BATTING_2B +
       train1$'log(Bat_3B)' +
       train1$TEAM_BATTING_HR +
       train1$TEAM_BATTING_BB +
       train1$TEAM_BATTING_SO +
       train1$'log(BR_SB)' +
       train1$TEAM_PITCHING_BB +
       train1$TEAM_PITCHING_SO +
       train1$TEAM_FIELDING_E +
       train1$TEAM_FIELDING_DP +
       as.factor(train1$BRCS_FLAG) +
       as.factor(train1$pitch_h_bin),
     data = train1
   )
 )
## 
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B + 
##     train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB + 
##     train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_PITCHING_BB + 
##     train1$TEAM_PITCHING_SO + train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP + 
##     as.factor(train1$BRCS_FLAG) + as.factor(train1$pitch_h_bin), 
##     data = train1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.554  -8.452   0.224   8.635  56.264 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -1.8408813  5.5600550  -0.331 0.740607    
## train1$TEAM_BATTING_H           0.0519962  0.0036475  14.255  < 2e-16 ***
## train1$TEAM_BATTING_2B         -0.0246335  0.0092743  -2.656 0.007960 ** 
## train1$"log(Bat_3B)"            3.8152797  0.9086639   4.199 2.79e-05 ***
## train1$TEAM_BATTING_HR          0.0670858  0.0101404   6.616 4.60e-11 ***
## train1$TEAM_BATTING_BB          0.0236201  0.0062991   3.750 0.000181 ***
## train1$TEAM_BATTING_SO         -0.0053982  0.0024552  -2.199 0.027999 *  
## train1$"log(BR_SB)"             1.2741535  0.5015940   2.540 0.011145 *  
## train1$TEAM_PITCHING_BB        -0.0063922  0.0048773  -1.311 0.190123    
## train1$TEAM_PITCHING_SO         0.0027110  0.0006817   3.977 7.20e-05 ***
## train1$TEAM_FIELDING_E         -0.0181807  0.0023411  -7.766 1.22e-14 ***
## train1$TEAM_FIELDING_DP        -0.1157762  0.0134277  -8.622  < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1    3.4350255  0.8808642   3.900 9.92e-05 ***
## as.factor(train1$pitch_h_bin)1 -1.6295306  1.5499227  -1.051 0.293204    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.13 on 2262 degrees of freedom
## Multiple R-squared:  0.3089, Adjusted R-squared:  0.305 
## F-statistic: 77.79 on 13 and 2262 DF,  p-value: < 2.2e-16
 ## Refit without TEAM_PITCHING_BB.
 summary(lm(
   TARGET_WINS ~
     train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B + train1$"log(Bat_3B)" +
     train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB + train1$TEAM_BATTING_SO +
     train1$"log(BR_SB)" + train1$TEAM_PITCHING_SO +
     train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP +
     as.factor(train1$BRCS_FLAG) + as.factor(train1$pitch_h_bin),
   data = train1
 ))
## 
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B + 
##     train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB + 
##     train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_PITCHING_SO + 
##     train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP + as.factor(train1$BRCS_FLAG) + 
##     as.factor(train1$pitch_h_bin), data = train1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.286  -8.442   0.196   8.698  56.887 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -0.5626977  5.4747220  -0.103 0.918146    
## train1$TEAM_BATTING_H           0.0513690  0.0036165  14.204  < 2e-16 ***
## train1$TEAM_BATTING_2B         -0.0245845  0.0092757  -2.650 0.008095 ** 
## train1$"log(Bat_3B)"            3.8566941  0.9082583   4.246 2.26e-05 ***
## train1$TEAM_BATTING_HR          0.0674959  0.0101372   6.658 3.47e-11 ***
## train1$TEAM_BATTING_BB          0.0166087  0.0033260   4.994 6.38e-07 ***
## train1$TEAM_BATTING_SO         -0.0050055  0.0024372  -2.054 0.040110 *  
## train1$"log(BR_SB)"             1.2649856  0.5016248   2.522 0.011745 *  
## train1$TEAM_PITCHING_SO         0.0022858  0.0005996   3.812 0.000142 ***
## train1$TEAM_FIELDING_E         -0.0198367  0.0019711 -10.064  < 2e-16 ***
## train1$TEAM_FIELDING_DP        -0.1162299  0.0134253  -8.658  < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1    3.4681487  0.8806412   3.938 8.46e-05 ***
## as.factor(train1$pitch_h_bin)1 -1.6176962  1.5501421  -1.044 0.296792    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.13 on 2263 degrees of freedom
## Multiple R-squared:  0.3084, Adjusted R-squared:  0.3047 
## F-statistic:  84.1 on 12 and 2263 DF,  p-value: < 2.2e-16
 ## Refit without the binned pitching-hits factor.
 summary(lm(
   TARGET_WINS ~
     train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B + train1$"log(Bat_3B)" +
     train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB + train1$TEAM_BATTING_SO +
     train1$"log(BR_SB)" + train1$TEAM_PITCHING_SO +
     train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP +
     as.factor(train1$BRCS_FLAG),
   data = train1
 ))
## 
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B + 
##     train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB + 
##     train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_PITCHING_SO + 
##     train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP + as.factor(train1$BRCS_FLAG), 
##     data = train1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.402  -8.486   0.201   8.675  57.014 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -1.1105640  5.4495997  -0.204 0.838537    
## train1$TEAM_BATTING_H         0.0510399  0.0036028  14.167  < 2e-16 ***
## train1$TEAM_BATTING_2B       -0.0253578  0.0092462  -2.742 0.006145 ** 
## train1$"log(Bat_3B)"          3.8270093  0.9078306   4.216 2.59e-05 ***
## train1$TEAM_BATTING_HR        0.0667032  0.0101089   6.598 5.16e-11 ***
## train1$TEAM_BATTING_BB        0.0165692  0.0033259   4.982 6.77e-07 ***
## train1$TEAM_BATTING_SO       -0.0048403  0.0024321  -1.990 0.046691 *  
## train1$"log(BR_SB)"           1.2381115  0.5009732   2.471 0.013531 *  
## train1$TEAM_PITCHING_SO       0.0022600  0.0005991   3.772 0.000166 ***
## train1$TEAM_FIELDING_E       -0.0198499  0.0019711 -10.070  < 2e-16 ***
## train1$TEAM_FIELDING_DP      -0.1167500  0.0134163  -8.702  < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1  3.4035656  0.8784813   3.874 0.000110 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.13 on 2264 degrees of freedom
## Multiple R-squared:  0.3081, Adjusted R-squared:  0.3047 
## F-statistic: 91.64 on 11 and 2264 DF,  p-value: < 2.2e-16
 # Keep the 11-predictor fit summarised just above as model2.
 # NOTE(review): every predictor is written as train1$..., so the terms are
 # looked up in train1 itself rather than through `data`; predict() with
 # `newdata` would therefore not score new rows with this object.
 model2 <- lm(
   TARGET_WINS ~
     train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B + train1$"log(Bat_3B)" +
     train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB + train1$TEAM_BATTING_SO +
     train1$"log(BR_SB)" + train1$TEAM_PITCHING_SO +
     train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP +
     as.factor(train1$BRCS_FLAG),
   data = train1
 )

  ### Now we have a model that has R-squared ~ Adj. R-squared ~ 0.31 with 11 variables
 # Collect model2's predictors under short labels and plot their pairwise
 # correlations. Bat_3B and BaseRun_SB hold the log-transformed columns, and
 # BaseRun_CS holds the numeric BRCS_FLAG column.
 z <- data.frame(
   Bat_H      = train1$TEAM_BATTING_H,
   Bat_2B     = train1$TEAM_BATTING_2B,
   Bat_3B     = train1$"log(Bat_3B)",
   Bat_HR     = train1$TEAM_BATTING_HR,
   Bat_BB     = train1$TEAM_BATTING_BB,
   Bat_SO     = train1$TEAM_BATTING_SO,
   BaseRun_SB = train1$"log(BR_SB)",
   PITCH_SO   = train1$TEAM_PITCHING_SO,
   Field_E    = train1$TEAM_FIELDING_E,
   Field_DP   = train1$TEAM_FIELDING_DP,
   BaseRun_CS = as.numeric(train1$BRCS_FLAG)
 )
 matmod2 <- cor(z)
 corrplot(matmod2, tl.cex = 0.8)

 ## Partial correlation of each predictor with Wins (controlling for all the
 ## other columns of z), together with its p-value.
 z <- data.frame("Wins" = train1$TARGET_WINS, z)
 # Run pcor() once and read both the estimates and the p-values from it.
 pcor_fit <- pcor(z)
 part <- data.frame(pcor_fit$estimate)
 sig <- data.frame(pcor_fit$p.value)$Wins
 VarName <- row.names(part)
 partw <- data.frame(VarName, part.Wins = part$Wins, sig)
 partw
##       VarName   part.Wins          sig
## 1        Wins  1.00000000 0.000000e+00
## 2       Bat_H  0.28535467 1.026740e-43
## 3      Bat_2B -0.05754238 6.145391e-03
## 4      Bat_3B  0.08825080 2.589519e-05
## 5      Bat_HR  0.13736304 5.155775e-11
## 6      Bat_BB  0.10413246 6.773677e-07
## 7      Bat_SO -0.04179032 4.669064e-02
## 8  BaseRun_SB  0.05187069 1.353112e-02
## 9    PITCH_SO  0.07902899 1.660205e-04
## 10    Field_E -0.20705700 2.301129e-23
## 11   Field_DP -0.17990378 6.154157e-18
## 12 BaseRun_CS  0.08115746 1.099348e-04
 ## Note Bat 2B, Bat 3B, Bat SO, Pitch SO and BaseRun CS all have small partial correlations with Wins (|r| < 0.09).
 ## Remove all 5 for parsimony (BaseRun SB is also small, but it is kept).
 #Model3:
 # Six-predictor model: hits, home runs, walks, log stolen bases, errors and
 # double plays.
 # NOTE(review): the predictors are written as train1$..., so they are looked
 # up in train1 itself rather than through `data`; predict() with `newdata`
 # would therefore not score new rows with this object.
 model3 <- lm(
   TARGET_WINS ~
     train1$TEAM_BATTING_H + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB +
     train1$"log(BR_SB)" + train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP,
   data = train1
 )
 summary(model3)
## 
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_HR + 
##     train1$TEAM_BATTING_BB + train1$"log(BR_SB)" + train1$TEAM_FIELDING_E + 
##     train1$TEAM_FIELDING_DP, data = train1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.937  -8.646   0.156   8.580  56.463 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              8.780494   3.964577   2.215  0.02688 *  
## train1$TEAM_BATTING_H    0.052093   0.002081  25.038  < 2e-16 ***
## train1$TEAM_BATTING_HR   0.019435   0.006414   3.030  0.00247 ** 
## train1$TEAM_BATTING_BB   0.019253   0.003306   5.823 6.60e-09 ***
## train1$"log(BR_SB)"      1.477988   0.480309   3.077  0.00211 ** 
## train1$TEAM_FIELDING_E  -0.015051   0.001867  -8.061 1.21e-15 ***
## train1$TEAM_FIELDING_DP -0.130996   0.013307  -9.844  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.33 on 2269 degrees of freedom
## Multiple R-squared:  0.2856, Adjusted R-squared:  0.2837 
## F-statistic: 151.2 on 6 and 2269 DF,  p-value: < 2.2e-16
 ##Rsq = 0.29 (Multiple R-squared 0.2856, Adjusted R-squared 0.2837)
 
 # Compare the residual distributions of the three models side by side.
 par(mfrow = c(1, 3))
 invisible(Map(
   function(fit, label) hist(fit$residuals, xlab = "", main = label),
   list(model1, model2, model3),
   c("Model1", "Model2", "Model3")
 ))

 # In-sample mean squared residual of each candidate model.
 mean(model1[["residuals"]]^2)
## [1] 170.2429
 mean(model2[["residuals"]]^2)
## [1] 171.6107
 mean(model3[["residuals"]]^2)
## [1] 177.188
 #### Let's take the test data, impute missing values then run the models on them to get how many wins we predict.
 
 # Prepare the evaluation data with the same transformations and imputations
 # used for the training data, then score it with model3.
 test1 <- test

 # Replace each NA in `x` with its own draw from N(mean(x), sd(x)), both
 # estimated from the non-missing values. The earlier code called
 # rnorm(n = 1), so one single draw was recycled into every missing entry.
 # Returns `x` with NAs filled in; observed values are left untouched.
 impute_normal <- function(x) {
   is_missing <- is.na(x)
   x[is_missing] <- rnorm(
     sum(is_missing),
     mean = mean(x, na.rm = TRUE),
     sd = sd(x, na.rm = TRUE)
   )
   x
 }

 # Rename column `from` of `df` to `to`, addressing it by name.
 # NOTE(review): the earlier code used positions 5 and 9, which look like
 # training-set positions. On `test` they appear to hit TEAM_BATTING_HR and
 # TEAM_BASERUN_CS instead (which is why TEAM_BATTING_HR had to be copied back
 # from `test` afterwards) -- confirm against the evaluation file's columns.
 rename_column <- function(df, from, to) {
   names(df)[names(df) == from] <- to
   df
 }

 ### Transform data

 ## Batting H, Batting 2B, Batting HR, Batting BB
 ## Leave as is

 ## Batting 3B: replace zeros with 8 so the log is finite, then log-transform
 bat_3b <- test$TEAM_BATTING_3B
 bat_3b[!is.na(bat_3b) & bat_3b == 0] <- 8
 test1$TEAM_BATTING_3B <- log(bat_3b)
 test1 <- rename_column(test1, "TEAM_BATTING_3B", "log(Bat_3B)")

 ## Baserun CS: record which rows were missing, then impute with randomness
 test1$BRCS_FLAG <- as.integer(is.na(test$TEAM_BASERUN_CS))
 test1$TEAM_BASERUN_CS <- impute_normal(test$TEAM_BASERUN_CS)

 ## Baserun SB: replace zeros with 15, log-transform, then impute on the log
 ## scale
 brsb <- test$TEAM_BASERUN_SB
 brsb[!is.na(brsb) & brsb == 0] <- 15
 test1$TEAM_BASERUN_SB <- impute_normal(log(brsb))
 test1 <- rename_column(test1, "TEAM_BASERUN_SB", "log(BR_SB)")

 ## Batting SO
 test1$TEAM_BATTING_SO <- impute_normal(test$TEAM_BATTING_SO)

 ## Batting HBP
 test1$TEAM_BATTING_HBP <- impute_normal(test$TEAM_BATTING_HBP)

 ### Fielding DP, impute
 test1$TEAM_FIELDING_DP <- impute_normal(test$TEAM_FIELDING_DP)

 ### Pitching BB: cap values above mean + 3 SD at that threshold
 bb <- test1$TEAM_PITCHING_BB
 bb_cap <- mean(bb, na.rm = TRUE) + 3 * sd(bb, na.rm = TRUE)
 test1$TEAM_PITCHING_BB <- pmin(bb, bb_cap)

 ### Fielding E
 ## Keep this the way it is.

 ## PITCHING H
 # Use Box-Cox transform
 lambda <- powerTransform(test1$TEAM_PITCHING_H)$lambda
## Warning in estimateTransform.default(X, Y, weights, family, start,
## method, : Convergence failure: return code = 52
 test1$BoxCoxPitch_H <- test1$TEAM_PITCHING_H^lambda
 ## Try a bin with separation level = Pitch hits = 1300
 test1$pitch_h_bin <- as.integer(test1$TEAM_PITCHING_H > 1300)

 ## Pitching SO: replace missing values
 test1$TEAM_PITCHING_SO <- impute_normal(test$TEAM_PITCHING_SO)

 # Now run model 3 on this data.
 # model3's formula refers to train1$... directly, so predict(newdata = )
 # cannot be used; instead multiply the fitted coefficients into the matching
 # test columns. The coefficients are read from the fitted object rather than
 # typed in by hand, so the predictions always agree with the model summarised
 # above. The column order below must match the term order in model3's formula.
 model3_terms <- c(
   "TEAM_BATTING_H", "TEAM_BATTING_HR", "TEAM_BATTING_BB",
   "log(BR_SB)", "TEAM_FIELDING_E", "TEAM_FIELDING_DP"
 )
 stopifnot(
   all(model3_terms %in% names(test1)),
   length(coef(model3)) == length(model3_terms) + 1L
 )
 design <- cbind(1, as.matrix(test1[, model3_terms]))
 Wins <- as.numeric(design %*% coef(model3))

 Wins <- data.frame("Wins" = round(Wins), "Index" = test1$INDEX)
 
 # Stack the predicted wins over the observed training wins on a shared
 # 0-150 x-axis, then print five-number summaries of both.
 par(mfrow = c(2, 1))
 hist(
   Wins$Wins,
   xlab = "", main = "Predicted Wins", xlim = c(0, 150)
 )
 hist(
   train1$TARGET_WINS,
   xlab = "", main = "Train DataSet Observed Wins", xlim = c(0, 150)
 )

 summary(Wins[["Wins"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   31.00   75.00   81.00   80.47   87.00  114.00
 #hist(train1$TARGET_WINS,xlab="",main = "Predicted Wins")
 summary(train1[["TARGET_WINS"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   71.00   82.00   80.79   92.00  146.00