library(corrplot)
library(car)
library(ppcor)
## Loading required package: MASS
# Read the moneyball training and evaluation CSV files straight from GitHub.
train <- read.csv(
  file = "https://raw.githubusercontent.com/scottogden10/621_hw/master/moneyball-training-data.csv"
)
test <- read.csv(
  file = "https://raw.githubusercontent.com/scottogden10/621_hw/master/moneyball-evaluation-data.csv"
)
## Exploratory plots: the response on its own, then each predictor against it
## as a scatter plot with a marginal histogram above.

# Distribution of the response. probability = TRUE makes hist() draw
# densities rather than counts.
hist(train$TARGET_WINS, xlab = "Wins", ylab = "Probability", main = "Target",
     probability = TRUE)

## Look into independent variables

# Draw one "scatter + marginal histogram" page for a single predictor.
#
# data    data frame holding TARGET_WINS and the predictor column
# column  name of the predictor column
# label   x-axis label for the scatter plot
# colour  point colour for the scatter plot
# use_log if TRUE, plot log(predictor) in both panels
#
# The Mean / SD / Missing annotations are always computed on the
# untransformed column, with NAs dropped from mean and SD.
#
# The first par(fig = ...) is issued WITHOUT new = TRUE so the scatter plot
# starts a fresh page instead of being painted over the previous figure;
# only the histogram is added to the existing page. On exit the layout is
# reset to a single full-device panel so later plots are not confined to
# the histogram strip.
plot_predictor_vs_wins <- function(data, column, label, colour,
                                   use_log = FALSE) {
  raw <- data[[column]]
  shown <- if (use_log) log(raw) else raw

  on.exit(par(mfrow = c(1, 1)), add = TRUE)

  # Scatter plot in the lower-left region of a new page.
  par(fig = c(0, 0.8, 0, 0.8))
  plot(y = data$TARGET_WINS, x = shown, xlab = label,
       ylab = "Wins", col = colour)

  # Marginal histogram stacked above the scatter plot on the same page.
  par(fig = c(0, 0.8, 0.45, 1), new = TRUE)
  hist(shown, axes = FALSE, xlab = "", ylab = "", labels = FALSE, main = NULL)

  mtext(paste("Mean =", round(mean(raw, na.rm = TRUE), 1),
              "SD=", round(sd(raw, na.rm = TRUE), 1)),
        outer = TRUE, line = -3)
  mtext(paste("Missing Values = ", sum(is.na(raw))),
        outer = TRUE, line = -4)

  invisible(NULL)
}

plot_predictor_vs_wins(train, "TEAM_BATTING_H", "Team Batting H", "dark red")
plot_predictor_vs_wins(train, "TEAM_BATTING_2B", "Team Batting 2B", "dark blue")
plot_predictor_vs_wins(train, "TEAM_BATTING_3B", "Team Batting 3B", "dark green",
                       use_log = TRUE)
plot_predictor_vs_wins(train, "TEAM_BATTING_HR", "Team Batting HR", "brown")
plot_predictor_vs_wins(train, "TEAM_BATTING_BB", "Team Batting BB", "dark grey")
plot_predictor_vs_wins(train, "TEAM_BATTING_SO", "Team Batting SO", "purple")
plot_predictor_vs_wins(train, "TEAM_BASERUN_SB", "Team Baserun SB", "green",
                       use_log = TRUE)
plot_predictor_vs_wins(train, "TEAM_BASERUN_CS", "Team Baserun CS", "hot pink")
plot_predictor_vs_wins(train, "TEAM_BATTING_HBP", "Team Batting HBP", "magenta")
plot_predictor_vs_wins(train, "TEAM_PITCHING_H", "Team PITCHING_H", "black")
plot_predictor_vs_wins(train, "TEAM_PITCHING_HR", "Team PITCHING_HR", "red")
plot_predictor_vs_wins(train, "TEAM_PITCHING_BB", "Team PITCHING_BB", "green")
plot_predictor_vs_wins(train, "TEAM_PITCHING_SO", "Team PITCHING_SO", "maroon")
plot_predictor_vs_wins(train, "TEAM_FIELDING_DP", "Team FIELDING_DP", "dark blue")

#### Correlation plot
# Pairwise-complete correlations of every column except the first one.
mat <- cor(train[, -1], use = "pairwise.complete.obs")
corrplot(mat, tl.cex = 0.5)

### Impute missing values by generating random data with noise based on mean and SD
# Work on a copy so the raw training data stays available as `train`.
train1 <- train
### Transform data
## Batting H, Batting 2B, Batting BB: left as is.
## (The original notes list "Batting H" twice; the second entry presumably
## refers to another untouched batting column -- confirm.)

# Batting 3B: replace zeros so the log is finite, then log-transform.
# The column is selected by name rather than by position, and which() keeps
# the assignment valid even if the column contains NAs.
train1$TEAM_BATTING_3B[which(train1$TEAM_BATTING_3B == 0)] <- 8
train1$TEAM_BATTING_3B <- log(train1$TEAM_BATTING_3B)
names(train1)[names(train1) == "TEAM_BATTING_3B"] <- "log(Bat_3B)"
## Baserun CS: impute missing values with random noise.
# One independent normal draw per missing entry. With n = 1 a single draw is
# recycled into every gap, so all imputed rows share the same value.
# The right-hand side is evaluated before the assignment, so mean and SD are
# taken from the observed (non-missing) values only.
# NOTE(review): normal draws can fall below zero for a count variable, and
# no seed is set, so results differ between runs.
brcs <- train$TEAM_BASERUN_CS
brcs[is.na(brcs)] <- rnorm(
  n = sum(is.na(brcs)),
  mean = mean(brcs, na.rm = TRUE),
  sd = sd(brcs, na.rm = TRUE)
)
train1$TEAM_BASERUN_CS <- brcs
## Flag (1/0) recording which rows were originally missing.
y <- as.integer(is.na(train$TEAM_BASERUN_CS))
train1$BRCS_FLAG <- y
## Baserun SB: replace zeros, log-transform, then impute with random noise.
brsb <- train$TEAM_BASERUN_SB
# which() skips NA comparisons; zeros become 15 so the log is finite.
brsb[which(brsb == 0)] <- 15
brsb <- log(brsb)
# One independent draw per missing entry, on the log scale. With n = 1 a
# single draw is recycled into every gap.
brsb[is.na(brsb)] <- rnorm(
  n = sum(is.na(brsb)),
  mean = mean(brsb, na.rm = TRUE),
  sd = sd(brsb, na.rm = TRUE)
)
train1$TEAM_BASERUN_SB <- brsb
# Rename by column name instead of by position.
names(train1)[names(train1) == "TEAM_BASERUN_SB"] <- "log(BR_SB)"
## Batting SO: impute missing values with random noise.
# One independent draw per missing entry. With n = 1 a single draw is
# recycled into every gap.
btso <- train$TEAM_BATTING_SO
btso[is.na(btso)] <- rnorm(
  n = sum(is.na(btso)),
  mean = mean(btso, na.rm = TRUE),
  sd = sd(btso, na.rm = TRUE)
)
train1$TEAM_BATTING_SO <- btso
## Batting HBP: check how strongly it relates to wins before deciding
## whether it is useful.
with(train1, cor(TARGET_WINS, TEAM_BATTING_HBP, use = "complete.obs"))
## [1] 0.07350424
cor.test(
  x = train1$TARGET_WINS,
  y = train1$TEAM_BATTING_HBP,
  use = "complete.obs"
)
##
## Pearson's product-moment correlation
##
## data: train1$TARGET_WINS and train1$TEAM_BATTING_HBP
## t = 1.0133, df = 189, p-value = 0.3122
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06919725 0.21325801
## sample estimates:
## cor
## 0.07350424
##Not At All significant
# Compare TARGET_WINS between rows where HBP is recorded and rows where it
# is missing: two stacked histograms, then a two-sample t-test.
par(mfrow = c(2, 1))
hist(
  train1$TARGET_WINS[!is.na(train1$TEAM_BATTING_HBP)],
  main = "Not Null",
  xlab = ""
)
hist(
  train1$TARGET_WINS[is.na(train1$TEAM_BATTING_HBP)],
  main = "Null",
  xlab = ""
)
t.test(
  x = train1$TARGET_WINS[!is.na(train1$TEAM_BATTING_HBP)],
  y = train1$TARGET_WINS[is.na(train1$TEAM_BATTING_HBP)]
)
##
## Welch Two Sample t-test
##
## data: train1$TARGET_WINS[!is.na(train1$TEAM_BATTING_HBP)] and train1$TARGET_WINS[is.na(train1$TEAM_BATTING_HBP)]
## t = 0.15701, df = 255.37, p-value = 0.8754
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.711574 2.008143
## sample estimates:
## mean of x mean of y
## 80.92670 80.77842
# Batting HBP: impute missing values with random noise.
# One independent draw per missing entry. With n = 1 a single draw is
# recycled into every gap.
hbp <- train1$TEAM_BATTING_HBP
hbp[is.na(hbp)] <- rnorm(
  n = sum(is.na(hbp)),
  mean = mean(hbp, na.rm = TRUE),
  sd = sd(hbp, na.rm = TRUE)
)
train1$TEAM_BATTING_HBP <- hbp
### Fielding DP: impute missing values with random noise.
# One independent draw per missing entry. With n = 1 a single draw is
# recycled into every gap.
fdp <- train$TEAM_FIELDING_DP
fdp[is.na(fdp)] <- rnorm(
  n = sum(is.na(fdp)),
  mean = mean(fdp, na.rm = TRUE),
  sd = sd(fdp, na.rm = TRUE)
)
train1$TEAM_FIELDING_DP <- fdp
### Pitching BB: cap large values.
# The cap is mean + 3 standard deviations (not a percentile); every value
# above the cap is set to the cap itself.
bb <- train1$TEAM_PITCHING_BB
outliers <- mean(bb) + 3 * sd(bb)
bb <- replace(bb, bb > outliers, outliers)
train1$TEAM_PITCHING_BB <- bb
### Fielding E: correlation with wins.
with(train1, cor(TARGET_WINS, TEAM_FIELDING_E))
## [1] -0.1764848
## Keep this variable the way it is.
## PITCHING H
# Estimate a power-transform exponent with car::powerTransform() and store
# the column raised to that power.
# NOTE(review): this is the plain power x^lambda, not the scaled Box-Cox
# form (x^lambda - 1) / lambda; the recorded output shows values around
# 1e-13, which is why the fitted coefficient is around 1e10.
lambda <- powerTransform(train1$TEAM_PITCHING_H)$lambda
train1$BoxCoxPitch_H <- train1$TEAM_PITCHING_H^lambda
## Also try a binary indicator split at 1300 pitching hits.
train1$pitch_h_bin <- as.integer(train1$TEAM_PITCHING_H > 1300)

# Scatter plot of wins against the transformed variable with a marginal
# histogram above it. The first par(fig = ...) is issued without new = TRUE
# so the scatter plot starts a fresh page instead of being drawn over the
# previous figure. local() keeps the temporary out of the workspace.
local({
  transformed <- train$TEAM_PITCHING_H^lambda

  par(fig = c(0, 0.8, 0, 0.8))
  plot(y = train$TARGET_WINS, x = transformed, xlab = "Team PITCHING_H",
       ylab = "Wins", col = "black")

  par(fig = c(0, 0.8, 0.45, 1), new = TRUE)
  hist(transformed, axes = FALSE, xlab = "", ylab = "", labels = FALSE,
       main = NULL)

  # The annotations describe the untransformed column.
  mtext(paste("Mean =", round(mean(train$TEAM_PITCHING_H, na.rm = TRUE), 1),
              "SD=", round(sd(train$TEAM_PITCHING_H, na.rm = TRUE), 1)),
        outer = TRUE, line = -3)
  mtext(paste("Missing Values = ", sum(is.na(train$TEAM_PITCHING_H))),
        outer = TRUE, line = -4)

  # Return to a single full-device panel so later plots are not confined
  # to the histogram strip.
  par(mfrow = c(1, 1))
})

## Pitching SO: impute missing values with random noise.
## The outlier handling below is disabled (kept as the author left it).
#pp<-train1$TEAM_PITCHING_SO
#outliersp<-mean(pp,na.rm = TRUE)+3*sd(pp,na.rm=TRUE)
#pp[pp>out]<-10000
#train1$TEAM_PITCHING_SO<-pp
# One independent draw per missing entry. With n = 1 a single draw is
# recycled into every gap.
pso <- train$TEAM_PITCHING_SO
pso[is.na(pso)] <- rnorm(
  n = sum(is.na(pso)),
  mean = mean(pso, na.rm = TRUE),
  sd = sd(pso, na.rm = TRUE)
)
train1$TEAM_PITCHING_SO <- pso
## Building models
# Show the first row of the prepared training data.
head(train1, n = 1L)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B log(Bat_3B)
## 1 1 39 1445 194 3.663562
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO log(BR_SB)
## 1 13 143 842 3.741591
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 49.85987 74.08955 9364 84
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 145.6695
## BRCS_FLAG BoxCoxPitch_H pitch_h_bin
## 1 1 4.652053e-13 1
# Full model: every prepared predictor plus the missing-CS flag, the
# power-transformed pitching hits and the pitching-hits indicator.
#
# Terms are bare column names resolved through `data = train1`
# (non-syntactic names are backtick-quoted). Written as `train1$column`,
# the terms would always be read from train1 itself, so
# predict(model1, newdata = ...) would ignore the new data.
# Coefficient labels in summary() therefore no longer carry the `train1$`
# prefix shown in the output pasted below.
model1 <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + `log(Bat_3B)` +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + `log(BR_SB)` +
    TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_HR +
    TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E +
    TEAM_FIELDING_DP + as.factor(BRCS_FLAG) + BoxCoxPitch_H +
    as.factor(pitch_h_bin),
  data = train1
)
summary(model1)
##
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B +
## train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB +
## train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_BASERUN_CS +
## train1$TEAM_BATTING_HBP + train1$TEAM_PITCHING_HR + train1$TEAM_PITCHING_BB +
## train1$TEAM_PITCHING_SO + train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP +
## as.factor(train1$BRCS_FLAG) + train1$BoxCoxPitch_H + as.factor(train1$pitch_h_bin),
## data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.883 -8.511 0.189 8.785 55.047
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.317e+00 8.889e+00 -0.373 0.709051
## train1$TEAM_BATTING_H 4.867e-02 3.903e-03 12.469 < 2e-16 ***
## train1$TEAM_BATTING_2B -2.338e-02 9.597e-03 -2.436 0.014936 *
## train1$"log(Bat_3B)" 3.283e+00 9.274e-01 3.540 0.000408 ***
## train1$TEAM_BATTING_HR 3.827e-02 3.099e-02 1.235 0.216994
## train1$TEAM_BATTING_BB 3.360e-02 8.106e-03 4.145 3.52e-05 ***
## train1$TEAM_BATTING_SO -4.377e-03 2.480e-03 -1.765 0.077688 .
## train1$"log(BR_SB)" 1.250e+00 5.423e-01 2.306 0.021223 *
## train1$TEAM_BASERUN_CS 1.781e-02 1.766e-02 1.009 0.313222
## train1$TEAM_BATTING_HBP 1.472e-01 5.394e-02 2.728 0.006413 **
## train1$TEAM_PITCHING_HR 2.944e-02 2.747e-02 1.072 0.284025
## train1$TEAM_PITCHING_BB -1.573e-02 6.686e-03 -2.353 0.018726 *
## train1$TEAM_PITCHING_SO 2.765e-03 6.985e-04 3.958 7.80e-05 ***
## train1$TEAM_FIELDING_E -1.810e-02 2.479e-03 -7.299 4.00e-13 ***
## train1$TEAM_FIELDING_DP -1.183e-01 1.344e-02 -8.800 < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1 3.527e+00 9.645e-01 3.657 0.000262 ***
## train1$BoxCoxPitch_H -2.286e+10 1.236e+10 -1.849 0.064517 .
## as.factor(train1$pitch_h_bin)1 -2.771e+00 1.700e+00 -1.631 0.103105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.1 on 2258 degrees of freedom
## Multiple R-squared: 0.3136, Adjusted R-squared: 0.3084
## F-statistic: 60.68 on 17 and 2258 DF, p-value: < 2.2e-16
## Backward elimination: drop one term at a time and compare the adjusted
## R-squared reported by each summary.
# Starting point. This fit has exactly the same terms as model1 (the raw
# TEAM_PITCHING_H column is not among them), so it repeats model1's summary.
summary(lm(
  formula = TARGET_WINS ~
    train1$TEAM_BATTING_H +
    train1$TEAM_BATTING_2B +
    train1$"log(Bat_3B)" +
    train1$TEAM_BATTING_HR +
    train1$TEAM_BATTING_BB +
    train1$TEAM_BATTING_SO +
    train1$"log(BR_SB)" +
    train1$TEAM_BASERUN_CS +
    train1$TEAM_BATTING_HBP +
    train1$TEAM_PITCHING_HR +
    train1$TEAM_PITCHING_BB +
    train1$TEAM_PITCHING_SO +
    train1$TEAM_FIELDING_E +
    train1$TEAM_FIELDING_DP +
    as.factor(train1$BRCS_FLAG) +
    train1$BoxCoxPitch_H +
    as.factor(train1$pitch_h_bin),
  data = train1
))
##
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B +
## train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB +
## train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_BASERUN_CS +
## train1$TEAM_BATTING_HBP + train1$TEAM_PITCHING_HR + train1$TEAM_PITCHING_BB +
## train1$TEAM_PITCHING_SO + train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP +
## as.factor(train1$BRCS_FLAG) + train1$BoxCoxPitch_H + as.factor(train1$pitch_h_bin),
## data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.883 -8.511 0.189 8.785 55.047
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.317e+00 8.889e+00 -0.373 0.709051
## train1$TEAM_BATTING_H 4.867e-02 3.903e-03 12.469 < 2e-16 ***
## train1$TEAM_BATTING_2B -2.338e-02 9.597e-03 -2.436 0.014936 *
## train1$"log(Bat_3B)" 3.283e+00 9.274e-01 3.540 0.000408 ***
## train1$TEAM_BATTING_HR 3.827e-02 3.099e-02 1.235 0.216994
## train1$TEAM_BATTING_BB 3.360e-02 8.106e-03 4.145 3.52e-05 ***
## train1$TEAM_BATTING_SO -4.377e-03 2.480e-03 -1.765 0.077688 .
## train1$"log(BR_SB)" 1.250e+00 5.423e-01 2.306 0.021223 *
## train1$TEAM_BASERUN_CS 1.781e-02 1.766e-02 1.009 0.313222
## train1$TEAM_BATTING_HBP 1.472e-01 5.394e-02 2.728 0.006413 **
## train1$TEAM_PITCHING_HR 2.944e-02 2.747e-02 1.072 0.284025
## train1$TEAM_PITCHING_BB -1.573e-02 6.686e-03 -2.353 0.018726 *
## train1$TEAM_PITCHING_SO 2.765e-03 6.985e-04 3.958 7.80e-05 ***
## train1$TEAM_FIELDING_E -1.810e-02 2.479e-03 -7.299 4.00e-13 ***
## train1$TEAM_FIELDING_DP -1.183e-01 1.344e-02 -8.800 < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1 3.527e+00 9.645e-01 3.657 0.000262 ***
## train1$BoxCoxPitch_H -2.286e+10 1.236e+10 -1.849 0.064517 .
## as.factor(train1$pitch_h_bin)1 -2.771e+00 1.700e+00 -1.631 0.103105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.1 on 2258 degrees of freedom
## Multiple R-squared: 0.3136, Adjusted R-squared: 0.3084
## F-statistic: 60.68 on 17 and 2258 DF, p-value: < 2.2e-16
# Drop TEAM_BASERUN_CS.
summary(lm(
  formula = TARGET_WINS ~
    train1$TEAM_BATTING_H +
    train1$TEAM_BATTING_2B +
    train1$"log(Bat_3B)" +
    train1$TEAM_BATTING_HR +
    train1$TEAM_BATTING_BB +
    train1$TEAM_BATTING_SO +
    train1$"log(BR_SB)" +
    train1$TEAM_BATTING_HBP +
    train1$TEAM_PITCHING_HR +
    train1$TEAM_PITCHING_BB +
    train1$TEAM_PITCHING_SO +
    train1$TEAM_FIELDING_E +
    train1$TEAM_FIELDING_DP +
    as.factor(train1$BRCS_FLAG) +
    train1$BoxCoxPitch_H +
    as.factor(train1$pitch_h_bin),
  data = train1
))
##
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B +
## train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB +
## train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_BATTING_HBP +
## train1$TEAM_PITCHING_HR + train1$TEAM_PITCHING_BB + train1$TEAM_PITCHING_SO +
## train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP + as.factor(train1$BRCS_FLAG) +
## train1$BoxCoxPitch_H + as.factor(train1$pitch_h_bin), data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.982 -8.488 0.185 8.753 55.256
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.976e+00 8.882e+00 -0.335 0.737610
## train1$TEAM_BATTING_H 4.853e-02 3.901e-03 12.440 < 2e-16 ***
## train1$TEAM_BATTING_2B -2.289e-02 9.585e-03 -2.388 0.017011 *
## train1$"log(Bat_3B)" 3.322e+00 9.266e-01 3.585 0.000344 ***
## train1$TEAM_BATTING_HR 3.605e-02 3.091e-02 1.166 0.243644
## train1$TEAM_BATTING_BB 3.324e-02 8.098e-03 4.105 4.19e-05 ***
## train1$TEAM_BATTING_SO -4.459e-03 2.479e-03 -1.799 0.072140 .
## train1$"log(BR_SB)" 1.450e+00 5.048e-01 2.872 0.004114 **
## train1$TEAM_BATTING_HBP 1.525e-01 5.368e-02 2.842 0.004524 **
## train1$TEAM_PITCHING_HR 2.952e-02 2.747e-02 1.075 0.282677
## train1$TEAM_PITCHING_BB -1.583e-02 6.685e-03 -2.368 0.017986 *
## train1$TEAM_PITCHING_SO 2.771e-03 6.985e-04 3.967 7.49e-05 ***
## train1$TEAM_FIELDING_E -1.839e-02 2.462e-03 -7.468 1.16e-13 ***
## train1$TEAM_FIELDING_DP -1.185e-01 1.344e-02 -8.817 < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1 3.180e+00 9.012e-01 3.529 0.000426 ***
## train1$BoxCoxPitch_H -2.347e+10 1.234e+10 -1.902 0.057347 .
## as.factor(train1$pitch_h_bin)1 -2.750e+00 1.699e+00 -1.618 0.105760
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.1 on 2259 degrees of freedom
## Multiple R-squared: 0.3133, Adjusted R-squared: 0.3084
## F-statistic: 64.41 on 16 and 2259 DF, p-value: < 2.2e-16
# Drop TEAM_BATTING_HBP.
summary(lm(
  formula = TARGET_WINS ~
    train1$TEAM_BATTING_H +
    train1$TEAM_BATTING_2B +
    train1$"log(Bat_3B)" +
    train1$TEAM_BATTING_HR +
    train1$TEAM_BATTING_BB +
    train1$TEAM_BATTING_SO +
    train1$"log(BR_SB)" +
    train1$TEAM_PITCHING_HR +
    train1$TEAM_PITCHING_BB +
    train1$TEAM_PITCHING_SO +
    train1$TEAM_FIELDING_E +
    train1$TEAM_FIELDING_DP +
    as.factor(train1$BRCS_FLAG) +
    train1$BoxCoxPitch_H +
    as.factor(train1$pitch_h_bin),
  data = train1
))
##
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B +
## train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB +
## train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_PITCHING_HR +
## train1$TEAM_PITCHING_BB + train1$TEAM_PITCHING_SO + train1$TEAM_FIELDING_E +
## train1$TEAM_FIELDING_DP + as.factor(train1$BRCS_FLAG) + train1$BoxCoxPitch_H +
## as.factor(train1$pitch_h_bin), data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.330 -8.547 0.231 8.733 56.072
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.531e+00 7.728e+00 1.233 0.217581
## train1$TEAM_BATTING_H 4.889e-02 3.905e-03 12.521 < 2e-16 ***
## train1$TEAM_BATTING_2B -2.785e-02 9.440e-03 -2.950 0.003212 **
## train1$"log(Bat_3B)" 3.336e+00 9.281e-01 3.594 0.000332 ***
## train1$TEAM_BATTING_HR 2.676e-02 3.079e-02 0.869 0.384905
## train1$TEAM_BATTING_BB 3.468e-02 8.095e-03 4.284 1.91e-05 ***
## train1$TEAM_BATTING_SO -5.064e-03 2.473e-03 -2.047 0.040750 *
## train1$"log(BR_SB)" 1.385e+00 5.051e-01 2.743 0.006137 **
## train1$TEAM_PITCHING_HR 3.605e-02 2.742e-02 1.315 0.188735
## train1$TEAM_PITCHING_BB -1.641e-02 6.692e-03 -2.452 0.014292 *
## train1$TEAM_PITCHING_SO 2.850e-03 6.990e-04 4.077 4.71e-05 ***
## train1$TEAM_FIELDING_E -1.864e-02 2.465e-03 -7.561 5.77e-14 ***
## train1$TEAM_FIELDING_DP -1.179e-01 1.346e-02 -8.765 < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1 3.048e+00 9.015e-01 3.381 0.000734 ***
## train1$BoxCoxPitch_H -2.433e+10 1.236e+10 -1.969 0.049101 *
## as.factor(train1$pitch_h_bin)1 -2.906e+00 1.701e+00 -1.708 0.087717 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.12 on 2260 degrees of freedom
## Multiple R-squared: 0.3108, Adjusted R-squared: 0.3063
## F-statistic: 67.95 on 15 and 2260 DF, p-value: < 2.2e-16
# Drop TEAM_PITCHING_HR.
summary(lm(
  formula = TARGET_WINS ~
    train1$TEAM_BATTING_H +
    train1$TEAM_BATTING_2B +
    train1$"log(Bat_3B)" +
    train1$TEAM_BATTING_HR +
    train1$TEAM_BATTING_BB +
    train1$TEAM_BATTING_SO +
    train1$"log(BR_SB)" +
    train1$TEAM_PITCHING_BB +
    train1$TEAM_PITCHING_SO +
    train1$TEAM_FIELDING_E +
    train1$TEAM_FIELDING_DP +
    as.factor(train1$BRCS_FLAG) +
    train1$BoxCoxPitch_H +
    as.factor(train1$pitch_h_bin),
  data = train1
))
##
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B +
## train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB +
## train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_PITCHING_BB +
## train1$TEAM_PITCHING_SO + train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP +
## as.factor(train1$BRCS_FLAG) + train1$BoxCoxPitch_H + as.factor(train1$pitch_h_bin),
## data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.178 -8.576 0.237 8.700 56.502
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.528e+00 7.729e+00 1.233 0.217804
## train1$TEAM_BATTING_H 4.904e-02 3.904e-03 12.562 < 2e-16 ***
## train1$TEAM_BATTING_2B -2.836e-02 9.433e-03 -3.006 0.002672 **
## train1$"log(Bat_3B)" 3.457e+00 9.236e-01 3.743 0.000186 ***
## train1$TEAM_BATTING_HR 6.496e-02 1.018e-02 6.379 2.15e-10 ***
## train1$TEAM_BATTING_BB 2.882e-02 6.757e-03 4.265 2.08e-05 ***
## train1$TEAM_BATTING_SO -4.836e-03 2.468e-03 -1.960 0.050172 .
## train1$"log(BR_SB)" 1.404e+00 5.050e-01 2.781 0.005460 **
## train1$TEAM_PITCHING_BB -1.115e-02 5.368e-03 -2.078 0.037864 *
## train1$TEAM_PITCHING_SO 2.647e-03 6.819e-04 3.882 0.000107 ***
## train1$TEAM_FIELDING_E -1.935e-02 2.404e-03 -8.050 1.32e-15 ***
## train1$TEAM_FIELDING_DP -1.180e-01 1.346e-02 -8.767 < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1 3.024e+00 9.014e-01 3.354 0.000809 ***
## train1$BoxCoxPitch_H -2.601e+10 1.229e+10 -2.116 0.034465 *
## as.factor(train1$pitch_h_bin)1 -3.091e+00 1.696e+00 -1.823 0.068491 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.12 on 2261 degrees of freedom
## Multiple R-squared: 0.3103, Adjusted R-squared: 0.306
## F-statistic: 72.66 on 14 and 2261 DF, p-value: < 2.2e-16
# Drop TEAM_PITCHING_BB (the power-transformed pitching hits stay in).
summary(lm(
  formula = TARGET_WINS ~
    train1$TEAM_BATTING_H +
    train1$TEAM_BATTING_2B +
    train1$"log(Bat_3B)" +
    train1$TEAM_BATTING_HR +
    train1$TEAM_BATTING_BB +
    train1$TEAM_BATTING_SO +
    train1$"log(BR_SB)" +
    train1$TEAM_PITCHING_SO +
    train1$TEAM_FIELDING_E +
    train1$TEAM_FIELDING_DP +
    as.factor(train1$BRCS_FLAG) +
    train1$BoxCoxPitch_H +
    as.factor(train1$pitch_h_bin),
  data = train1
))
##
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B +
## train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB +
## train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_PITCHING_SO +
## train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP + as.factor(train1$BRCS_FLAG) +
## train1$BoxCoxPitch_H + as.factor(train1$pitch_h_bin), data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.947 -8.469 0.198 8.696 57.301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.688e+00 7.613e+00 0.879 0.379729
## train1$TEAM_BATTING_H 4.935e-02 3.903e-03 12.643 < 2e-16 ***
## train1$TEAM_BATTING_2B -2.676e-02 9.408e-03 -2.844 0.004496 **
## train1$"log(Bat_3B)" 3.664e+00 9.189e-01 3.987 6.89e-05 ***
## train1$TEAM_BATTING_HR 6.642e-02 1.017e-02 6.534 7.87e-11 ***
## train1$TEAM_BATTING_BB 1.660e-02 3.325e-03 4.991 6.46e-07 ***
## train1$TEAM_BATTING_SO -4.502e-03 2.464e-03 -1.827 0.067826 .
## train1$"log(BR_SB)" 1.338e+00 5.043e-01 2.652 0.008048 **
## train1$TEAM_PITCHING_SO 2.062e-03 6.214e-04 3.318 0.000921 ***
## train1$TEAM_FIELDING_E -2.125e-02 2.225e-03 -9.551 < 2e-16 ***
## train1$TEAM_FIELDING_DP -1.177e-01 1.347e-02 -8.742 < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1 3.241e+00 8.960e-01 3.617 0.000305 ***
## train1$BoxCoxPitch_H -1.531e+10 1.117e+10 -1.370 0.170667
## as.factor(train1$pitch_h_bin)1 -2.472e+00 1.671e+00 -1.480 0.139025
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.13 on 2262 degrees of freedom
## Multiple R-squared: 0.309, Adjusted R-squared: 0.305
## F-statistic: 77.8 on 13 and 2262 DF, p-value: < 2.2e-16
## Drop the power-transformed pitching hits instead; TEAM_PITCHING_BB is
## back in this fit.
summary(lm(
  formula = TARGET_WINS ~
    train1$TEAM_BATTING_H +
    train1$TEAM_BATTING_2B +
    train1$"log(Bat_3B)" +
    train1$TEAM_BATTING_HR +
    train1$TEAM_BATTING_BB +
    train1$TEAM_BATTING_SO +
    train1$"log(BR_SB)" +
    train1$TEAM_PITCHING_BB +
    train1$TEAM_PITCHING_SO +
    train1$TEAM_FIELDING_E +
    train1$TEAM_FIELDING_DP +
    as.factor(train1$BRCS_FLAG) +
    as.factor(train1$pitch_h_bin),
  data = train1
))
##
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B +
## train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB +
## train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_PITCHING_BB +
## train1$TEAM_PITCHING_SO + train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP +
## as.factor(train1$BRCS_FLAG) + as.factor(train1$pitch_h_bin),
## data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.554 -8.452 0.224 8.635 56.264
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.8408813 5.5600550 -0.331 0.740607
## train1$TEAM_BATTING_H 0.0519962 0.0036475 14.255 < 2e-16 ***
## train1$TEAM_BATTING_2B -0.0246335 0.0092743 -2.656 0.007960 **
## train1$"log(Bat_3B)" 3.8152797 0.9086639 4.199 2.79e-05 ***
## train1$TEAM_BATTING_HR 0.0670858 0.0101404 6.616 4.60e-11 ***
## train1$TEAM_BATTING_BB 0.0236201 0.0062991 3.750 0.000181 ***
## train1$TEAM_BATTING_SO -0.0053982 0.0024552 -2.199 0.027999 *
## train1$"log(BR_SB)" 1.2741535 0.5015940 2.540 0.011145 *
## train1$TEAM_PITCHING_BB -0.0063922 0.0048773 -1.311 0.190123
## train1$TEAM_PITCHING_SO 0.0027110 0.0006817 3.977 7.20e-05 ***
## train1$TEAM_FIELDING_E -0.0181807 0.0023411 -7.766 1.22e-14 ***
## train1$TEAM_FIELDING_DP -0.1157762 0.0134277 -8.622 < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1 3.4350255 0.8808642 3.900 9.92e-05 ***
## as.factor(train1$pitch_h_bin)1 -1.6295306 1.5499227 -1.051 0.293204
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.13 on 2262 degrees of freedom
## Multiple R-squared: 0.3089, Adjusted R-squared: 0.305
## F-statistic: 77.79 on 13 and 2262 DF, p-value: < 2.2e-16
## Drop TEAM_PITCHING_BB as well.
summary(lm(
  formula = TARGET_WINS ~
    train1$TEAM_BATTING_H +
    train1$TEAM_BATTING_2B +
    train1$"log(Bat_3B)" +
    train1$TEAM_BATTING_HR +
    train1$TEAM_BATTING_BB +
    train1$TEAM_BATTING_SO +
    train1$"log(BR_SB)" +
    train1$TEAM_PITCHING_SO +
    train1$TEAM_FIELDING_E +
    train1$TEAM_FIELDING_DP +
    as.factor(train1$BRCS_FLAG) +
    as.factor(train1$pitch_h_bin),
  data = train1
))
##
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B +
## train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB +
## train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_PITCHING_SO +
## train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP + as.factor(train1$BRCS_FLAG) +
## as.factor(train1$pitch_h_bin), data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.286 -8.442 0.196 8.698 56.887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.5626977 5.4747220 -0.103 0.918146
## train1$TEAM_BATTING_H 0.0513690 0.0036165 14.204 < 2e-16 ***
## train1$TEAM_BATTING_2B -0.0245845 0.0092757 -2.650 0.008095 **
## train1$"log(Bat_3B)" 3.8566941 0.9082583 4.246 2.26e-05 ***
## train1$TEAM_BATTING_HR 0.0674959 0.0101372 6.658 3.47e-11 ***
## train1$TEAM_BATTING_BB 0.0166087 0.0033260 4.994 6.38e-07 ***
## train1$TEAM_BATTING_SO -0.0050055 0.0024372 -2.054 0.040110 *
## train1$"log(BR_SB)" 1.2649856 0.5016248 2.522 0.011745 *
## train1$TEAM_PITCHING_SO 0.0022858 0.0005996 3.812 0.000142 ***
## train1$TEAM_FIELDING_E -0.0198367 0.0019711 -10.064 < 2e-16 ***
## train1$TEAM_FIELDING_DP -0.1162299 0.0134253 -8.658 < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1 3.4681487 0.8806412 3.938 8.46e-05 ***
## as.factor(train1$pitch_h_bin)1 -1.6176962 1.5501421 -1.044 0.296792
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.13 on 2263 degrees of freedom
## Multiple R-squared: 0.3084, Adjusted R-squared: 0.3047
## F-statistic: 84.1 on 12 and 2263 DF, p-value: < 2.2e-16
## Drop the binned pitching-hits factor.
summary(lm(
  formula = TARGET_WINS ~
    train1$TEAM_BATTING_H +
    train1$TEAM_BATTING_2B +
    train1$"log(Bat_3B)" +
    train1$TEAM_BATTING_HR +
    train1$TEAM_BATTING_BB +
    train1$TEAM_BATTING_SO +
    train1$"log(BR_SB)" +
    train1$TEAM_PITCHING_SO +
    train1$TEAM_FIELDING_E +
    train1$TEAM_FIELDING_DP +
    as.factor(train1$BRCS_FLAG),
  data = train1
))
##
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_2B +
## train1$"log(Bat_3B)" + train1$TEAM_BATTING_HR + train1$TEAM_BATTING_BB +
## train1$TEAM_BATTING_SO + train1$"log(BR_SB)" + train1$TEAM_PITCHING_SO +
## train1$TEAM_FIELDING_E + train1$TEAM_FIELDING_DP + as.factor(train1$BRCS_FLAG),
## data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.402 -8.486 0.201 8.675 57.014
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.1105640 5.4495997 -0.204 0.838537
## train1$TEAM_BATTING_H 0.0510399 0.0036028 14.167 < 2e-16 ***
## train1$TEAM_BATTING_2B -0.0253578 0.0092462 -2.742 0.006145 **
## train1$"log(Bat_3B)" 3.8270093 0.9078306 4.216 2.59e-05 ***
## train1$TEAM_BATTING_HR 0.0667032 0.0101089 6.598 5.16e-11 ***
## train1$TEAM_BATTING_BB 0.0165692 0.0033259 4.982 6.77e-07 ***
## train1$TEAM_BATTING_SO -0.0048403 0.0024321 -1.990 0.046691 *
## train1$"log(BR_SB)" 1.2381115 0.5009732 2.471 0.013531 *
## train1$TEAM_PITCHING_SO 0.0022600 0.0005991 3.772 0.000166 ***
## train1$TEAM_FIELDING_E -0.0198499 0.0019711 -10.070 < 2e-16 ***
## train1$TEAM_FIELDING_DP -0.1167500 0.0134163 -8.702 < 2e-16 ***
## as.factor(train1$BRCS_FLAG)1 3.4035656 0.8784813 3.874 0.000110 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.13 on 2264 degrees of freedom
## Multiple R-squared: 0.3081, Adjusted R-squared: 0.3047
## F-statistic: 91.64 on 11 and 2264 DF, p-value: < 2.2e-16
# model2: the 11-term model left after backward elimination.
# Terms are bare column names resolved through `data = train1`; written as
# `train1$column` they would always be read from train1, so
# predict(model2, newdata = ...) would ignore the new data.
model2 <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + `log(Bat_3B)` +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + `log(BR_SB)` +
    TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP +
    as.factor(BRCS_FLAG),
  data = train1
)
### The summary recorded above for these terms reports multiple R-squared
### 0.3081 and adjusted R-squared 0.3047 with 11 predictors.
# Correlation plot of the model2 predictors, under short display names.
# The last column is the BRCS_FLAG missing-value indicator, not the
# caught-stealing count, so it is labelled as a flag.
z <- train1[c(
  "TEAM_BATTING_H", "TEAM_BATTING_2B", "log(Bat_3B)", "TEAM_BATTING_HR",
  "TEAM_BATTING_BB", "TEAM_BATTING_SO", "log(BR_SB)", "TEAM_PITCHING_SO",
  "TEAM_FIELDING_E", "TEAM_FIELDING_DP", "BRCS_FLAG"
)]
z$BRCS_FLAG <- as.numeric(z$BRCS_FLAG)
colnames(z) <- c(
  "Bat_H", "Bat_2B", "Bat_3B", "Bat_HR", "Bat_BB", "Bat_SO", "BaseRun_SB",
  "PITCH_SO", "Field_E", "Field_DP", "BaseRun_CS_Flag"
)
matmod2 <- cor(z)
corrplot(matmod2, tl.cex = 0.8)

## Unique (partial) correlation of each predictor with the outcome.
# Put Wins in front of the predictors, run ppcor::pcor() on the result and
# keep only the Wins column of the estimate and p-value matrices.
z <- data.frame(Wins = train1$TARGET_WINS, z)
part <- data.frame(pcor(z)$estimate)
sig <- data.frame(pcor(z)$p.value)[["Wins"]]
VarName <- rownames(part)
partw <- data.frame(
  VarName = VarName,
  part.Wins = part[["Wins"]],
  sig = sig
)
partw
## VarName part.Wins sig
## 1 Wins 1.00000000 0.000000e+00
## 2 Bat_H 0.28535467 1.026740e-43
## 3 Bat_2B -0.05754238 6.145391e-03
## 4 Bat_3B 0.08825080 2.589519e-05
## 5 Bat_HR 0.13736304 5.155775e-11
## 6 Bat_BB 0.10413246 6.773677e-07
## 7 Bat_SO -0.04179032 4.669064e-02
## 8 BaseRun_SB 0.05187069 1.353112e-02
## 9 PITCH_SO 0.07902899 1.660205e-04
## 10 Field_E -0.20705700 2.301129e-23
## 11 Field_DP -0.17990378 6.154157e-18
## 12 BaseRun_CS 0.08115746 1.099348e-04
## Note Bat 3B, Bat SO, Pitch SO, BaseRun CS are all the smallest correlations with Wins.
##Remove all 3 for parsimony
# Model 3: reduced six-predictor fit of TARGET_WINS, followed by its summary.
model3 <- lm(
  TARGET_WINS ~
    train1$TEAM_BATTING_H +
    train1$TEAM_BATTING_HR +
    train1$TEAM_BATTING_BB +
    train1$'log(BR_SB)' +
    train1$TEAM_FIELDING_E +
    train1$TEAM_FIELDING_DP,
  data = train1
)
summary(model3)
##
## Call:
## lm(formula = TARGET_WINS ~ train1$TEAM_BATTING_H + train1$TEAM_BATTING_HR +
## train1$TEAM_BATTING_BB + train1$"log(BR_SB)" + train1$TEAM_FIELDING_E +
## train1$TEAM_FIELDING_DP, data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.937 -8.646 0.156 8.580 56.463
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.780494 3.964577 2.215 0.02688 *
## train1$TEAM_BATTING_H 0.052093 0.002081 25.038 < 2e-16 ***
## train1$TEAM_BATTING_HR 0.019435 0.006414 3.030 0.00247 **
## train1$TEAM_BATTING_BB 0.019253 0.003306 5.823 6.60e-09 ***
## train1$"log(BR_SB)" 1.477988 0.480309 3.077 0.00211 **
## train1$TEAM_FIELDING_E -0.015051 0.001867 -8.061 1.21e-15 ***
## train1$TEAM_FIELDING_DP -0.130996 0.013307 -9.844 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.33 on 2269 degrees of freedom
## Multiple R-squared: 0.2856, Adjusted R-squared: 0.2837
## F-statistic: 151.2 on 6 and 2269 DF, p-value: < 2.2e-16
## R-squared = 0.29 (0.2856; adjusted 0.2837) in the summary above
# Residual diagnostics for the three fits: histograms side by side,
# then the in-sample mean squared residual of each model.
par(mfrow = c(1, 3))
resid_by_model <- list(
  Model1 = model1$residuals,
  Model2 = model2$residuals,
  Model3 = model3$residuals
)
for (label in names(resid_by_model)) {
  hist(resid_by_model[[label]], xlab = "", main = label)
}

mean(resid_by_model$Model1^2)
## [1] 170.2429
mean(resid_by_model$Model2^2)
## [1] 171.6107
mean(resid_by_model$Model3^2)
## [1] 177.188
#### Let's take the test data, impute missing values then run the models on them to get how many wins we predict.
# Work on a copy so the raw evaluation frame `test` stays untouched.
test1 <- test
### Transform data
## Batting H, Batting 2B, Batting BB: leave as is
# Batting 3B: replace zero counts with 8, log-transform, and rename the
# column to "log(Bat_3B)", the name the model formulas use.
# The column is addressed by name rather than by a fixed position, so the
# right column is modified whatever the column layout of `test` is.
# which() drops NA comparisons, so missing values cannot break the assignment.
zero_3b <- which(test1$TEAM_BATTING_3B == 0)
test1$TEAM_BATTING_3B[zero_3b] <- 8
test1$TEAM_BATTING_3B <- log(test1$TEAM_BATTING_3B)
colnames(test1)[colnames(test1) == "TEAM_BATTING_3B"] <- "log(Bat_3B)"
# Fill the NA entries of x with independent draws from a normal distribution
# whose mean and sd are those of the observed (non-NA) values of x.
# One value is drawn per missing entry, so missing rows do not all share a
# single random number. Returns x with the gaps filled.
impute_rnorm <- function(x) {
  gaps <- is.na(x)
  x[gaps] <- rnorm(
    n = sum(gaps),
    mean = mean(x, na.rm = TRUE),
    sd = sd(x, na.rm = TRUE)
  )
  x
}

## Baserun CS: introduce randomness when imputing
brcs <- impute_rnorm(test$TEAM_BASERUN_CS)
test1$TEAM_BASERUN_CS <- brcs
## Create flag for missing or not (taken from the raw, un-imputed column)
y <- as.integer(is.na(test$TEAM_BASERUN_CS))
test1$BRCS_FLAG <- y

## Baserun SB: zero counts become 15, then log-transform, then impute with randomness
brsb <- test$TEAM_BASERUN_SB
brsb[which(brsb == 0)] <- 15
brsb <- impute_rnorm(log(brsb))
# Keep the logged values under TEAM_BASERUN_SB (the model 3 score reads that
# name) and also store them as "log(BR_SB)", the name the model formulas use.
# Assigning by name avoids renaming whichever column happens to sit at a
# fixed position in the evaluation frame.
test1$TEAM_BASERUN_SB <- brsb
test1[["log(BR_SB)"]] <- brsb

## Batting SO
btso <- impute_rnorm(test$TEAM_BATTING_SO)
test1$TEAM_BATTING_SO <- btso

## Batting HBP
##Not At All significant
#Now let's check Whether null or not is different
par(mfrow = c(2, 1))
#hist(test1$TARGET_WINS[!is.na(test1$TEAM_BATTING_HBP)],xlab="",main="Not Null")
#hist(test1$TARGET_WINS[is.na(test1$TEAM_BATTING_HBP)],xlab="",main="Null")
#t.test(x=test1$TARGET_WINS[!is.na(test1$TEAM_BATTING_HBP)],y=test1$TARGET_WINS[is.na(test1$TEAM_BATTING_HBP)])
hbp <- impute_rnorm(test1$TEAM_BATTING_HBP)
test1$TEAM_BATTING_HBP <- hbp

### Fielding DP, impute
fdp <- impute_rnorm(test$TEAM_FIELDING_DP)
test1$TEAM_FIELDING_DP <- fdp

### Pitching BB: cap values above mean + 3 sd at that threshold
# na.rm = TRUE keeps the threshold defined when the column contains NA;
# which() skips NA comparisons when selecting the values to cap.
bb <- test1$TEAM_PITCHING_BB
outliers <- mean(bb, na.rm = TRUE) + 3 * sd(bb, na.rm = TRUE)
bb[which(bb > outliers)] <- outliers
test1$TEAM_PITCHING_BB <- bb

### Fielding E
#cor(test1$TARGET_WINS,test1$TEAM_FIELDING_E)
##Keep this the way it is.

## PITCHING H
#Use Box cox transform
# NOTE(review): lambda is estimated from the evaluation data here; confirm
# whether the value fitted on the training data should be reused instead.
lambda <- powerTransform(test1$TEAM_PITCHING_H)$lambda
## Warning in estimateTransform.default(X, Y, weights, family, start,
## method, : Convergence failure: return code = 52
test1$BoxCoxPitch_H <- test1$TEAM_PITCHING_H^lambda
##Try a bin with separation level = Pitch hits = 1300
test1$pitch_h_bin <- as.integer(test1$TEAM_PITCHING_H > 1300)

## Pitching SO replace missing values and get rid of outliers
#pp<-test1$TEAM_PITCHING_SO
#outliersp<-mean(pp,na.rm = TRUE)+3*sd(pp,na.rm=TRUE)
#pp[pp>out]<-10000
#test1$TEAM_PITCHING_SO<-pp
pso <- impute_rnorm(test$TEAM_PITCHING_SO)
test1$TEAM_PITCHING_SO <- pso
####
# Make sure test1 carries the raw TEAM_BATTING_HR values under that name;
# the model 3 score reads test1$TEAM_BATTING_HR.
test1$TEAM_BATTING_HR <- test$TEAM_BATTING_HR
# Now run the model 3 on this data
# The coefficients are read from the fitted model3 object rather than typed
# in by hand, so the score always matches the fit. The input columns follow
# the term order of the model3 formula: intercept, batting H, batting HR,
# batting BB, log baserun SB, fielding E, fielding DP.
# test1$TEAM_BASERUN_SB already holds the logged, imputed stolen-base values.
model3_coef <- coef(model3)
model3_inputs <- cbind(
  1,
  test1$TEAM_BATTING_H,
  test1$TEAM_BATTING_HR,
  test1$TEAM_BATTING_BB,
  test1$TEAM_BASERUN_SB,
  test1$TEAM_FIELDING_E,
  test1$TEAM_FIELDING_DP
)
# A missing input column would silently shrink the matrix; fail loudly instead.
stopifnot(length(model3_coef) == ncol(model3_inputs))
Wins <- drop(model3_inputs %*% model3_coef)
# Round to whole wins and keep the team index alongside each prediction.
Wins <- data.frame("Wins" = round(Wins), "Index" = test1$INDEX)
# Compare the predicted distribution with the observed training distribution.
par(mfrow = c(2, 1))
hist(Wins$Wins, xlab = "", main = "Predicted Wins", xlim = c(0, 150))
hist(train1$TARGET_WINS, xlab = "", main = "Train DataSet Observed Wins", xlim = c(0, 150))

summary(Wins$Wins)
## (recorded output from a run that used hand-entered coefficients; re-run to refresh)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 31.00 75.00 81.00 80.47 87.00 114.00
#hist(train1$TARGET_WINS,xlab="",main = "Predicted Wins")
summary(train1$TARGET_WINS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 71.00 82.00 80.79 92.00 146.00