#read testing data set
data <- read.csv(file="https://cdn-stage.fedweb.org/fed-2/2/moneyball-training-data.csv",stringsAsFactors=T, header=T)

#display six first entries
head(data)
##   INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1     1          39           1445             194              39
## 2     2          70           1339             219              22
## 3     3          86           1377             232              35
## 4     4          70           1387             209              38
## 5     5          82           1297             186              27
## 6     6          75           1279             200              36
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1              13             143             842              NA
## 2             190             685            1075              37
## 3             137             602             917              46
## 4              96             451             922              43
## 5             102             472             920              49
## 6              92             443             973             107
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1              NA               NA            9364               84
## 2              28               NA            1347              191
## 3              27               NA            1377              137
## 4              30               NA            1396               97
## 5              39               NA            1297              102
## 6              59               NA            1279               92
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              927             5456            1011               NA
## 2              689             1082             193              155
## 3              602              917             175              153
## 4              454              928             164              156
## 5              472              920             138              168
## 6              443              973             123              149
#find dimentions
dim(data)
## [1] 2276   17
#chart of missing values
aggr(data[-1], prop = T, numbers = T, cex.axis=.4, ylab=c("Proportion of missingness","Missingness Pattern"),labels=names(data[-1]))

count_nas <- function(data){
  
variable_name_column <- c()
number_missing_column <- c()

for (i in 2:ncol(data)){
  variable_name <- colnames(data[i])
  number_missing <- sum(is.na(data[i]))
  variable_name_column <- c(variable_name_column,variable_name)
  number_missing_column <- c(number_missing_column,number_missing)
}

missing_table <- data.frame(variable_name_column,number_missing_column)
missing_table <- missing_table %>% mutate(percentage=round(number_missing_column*100/nrow(data),0)) %>% arrange(desc(percentage))
missing_table
}

#count NAs
count_nas(data)
##    variable_name_column number_missing_column percentage
## 1      TEAM_BATTING_HBP                  2085         92
## 2       TEAM_BASERUN_CS                   772         34
## 3      TEAM_FIELDING_DP                   286         13
## 4       TEAM_BASERUN_SB                   131          6
## 5       TEAM_BATTING_SO                   102          4
## 6      TEAM_PITCHING_SO                   102          4
## 7           TARGET_WINS                     0          0
## 8        TEAM_BATTING_H                     0          0
## 9       TEAM_BATTING_2B                     0          0
## 10      TEAM_BATTING_3B                     0          0
## 11      TEAM_BATTING_HR                     0          0
## 12      TEAM_BATTING_BB                     0          0
## 13      TEAM_PITCHING_H                     0          0
## 14     TEAM_PITCHING_HR                     0          0
## 15     TEAM_PITCHING_BB                     0          0
## 16      TEAM_FIELDING_E                     0          0
#store original values
TEAM_BASERUN_SB_value <- data$TEAM_BASERUN_SB
TEAM_FIELDING_DP_value <- data$TEAM_FIELDING_DP
TEAM_BASERUN_CS_value <- data$TEAM_BASERUN_CS
TEAM_BATTING_HBP_value <- data$TEAM_BATTING_HBP

#impute missing data process
exclude <- c('INDEX')
include <- setdiff(names(data), exclude)
data <- data[include]

imp.data <- mice(data, m=4, method='cart', printFlag=FALSE)

xyplot(imp.data, TARGET_WINS ~ TEAM_BATTING_SO,main="Imputed TEAM_BATTING_SO", cex=0.5)

#compare with original graph
plot(TARGET_WINS ~ TEAM_BATTING_SO,data,main="Original TEAM_BATTING_SO")

densityplot(imp.data, ~ TEAM_PITCHING_SO)

xyplot(imp.data, TARGET_WINS ~ TEAM_BATTING_SO,jitter.data = TRUE, cex=0.5)

densityplot(imp.data, ~ TEAM_BATTING_SO)

#merge imputed values for TEAM_PITCHING_SO and TEAM_BATTING_SO into our original data set
data <- complete(imp.data)
head(data)
##   TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1          39           1445             194              39
## 2          70           1339             219              22
## 3          86           1377             232              35
## 4          70           1387             209              38
## 5          82           1297             186              27
## 6          75           1279             200              36
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1              13             143             842             226
## 2             190             685            1075              37
## 3             137             602             917              46
## 4              96             451             922              43
## 5             102             472             920              49
## 6              92             443             973             107
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1              66               53            9364               84
## 2              28               45            1347              191
## 3              27               48            1377              137
## 4              30               64            1396               97
## 5              39               62            1297              102
## 6              59               49            1279               92
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              927             5456            1011              113
## 2              689             1082             193              155
## 3              602              917             175              153
## 4              454              928             164              156
## 5              472              920             138              168
## 6              443              973             123              149
data$TEAM_BASERUN_SB <- TEAM_BASERUN_SB_value

imp.data <- mice(data, m=6, method='cart', printFlag=FALSE)
xyplot(imp.data, TARGET_WINS ~ TEAM_BASERUN_SB,jitter.data = TRUE, cex=0.5)

densityplot(imp.data, ~ TEAM_BASERUN_SB,jitter.data = TRUE, cex=0.5)

#merge imputed values for TEAM_BASERUN_SB into our original data set
data <- complete(imp.data)
head(data)
##   TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1          39           1445             194              39
## 2          70           1339             219              22
## 3          86           1377             232              35
## 4          70           1387             209              38
## 5          82           1297             186              27
## 6          75           1279             200              36
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1              13             143             842             136
## 2             190             685            1075              37
## 3             137             602             917              46
## 4              96             451             922              43
## 5             102             472             920              49
## 6              92             443             973             107
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1              66               53            9364               84
## 2              28               45            1347              191
## 3              27               48            1377              137
## 4              30               64            1396               97
## 5              39               62            1297              102
## 6              59               49            1279               92
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              927             5456            1011              113
## 2              689             1082             193              155
## 3              602              917             175              153
## 4              454              928             164              156
## 5              472              920             138              168
## 6              443              973             123              149
data$TEAM_FIELDING_DP <- TEAM_FIELDING_DP_value

imp.data <- mice(data, m=13, method='cart', printFlag=FALSE)

xyplot(imp.data, TARGET_WINS ~ TEAM_FIELDING_DP,main="Imputed TEAM_FIELDING_DP", cex=0.5)

#compare with original graph
plot(TARGET_WINS ~ TEAM_BATTING_SO,data,main="Original TEAM_FIELDING_DP")

densityplot(imp.data, ~ TEAM_FIELDING_DP)

#merge imputed values for TEAM_FIELDING_DP into our original data set
data <- complete(imp.data)
head(data)
##   TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1          39           1445             194              39
## 2          70           1339             219              22
## 3          86           1377             232              35
## 4          70           1387             209              38
## 5          82           1297             186              27
## 6          75           1279             200              36
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1              13             143             842             136
## 2             190             685            1075              37
## 3             137             602             917              46
## 4              96             451             922              43
## 5             102             472             920              49
## 6              92             443             973             107
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1              66               53            9364               84
## 2              28               45            1347              191
## 3              27               48            1377              137
## 4              30               64            1396               97
## 5              39               62            1297              102
## 6              59               49            1279               92
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              927             5456            1011              113
## 2              689             1082             193              155
## 3              602              917             175              153
## 4              454              928             164              156
## 5              472              920             138              168
## 6              443              973             123              149
data$TEAM_BASERUN_CS <- TEAM_BASERUN_CS_value

imp.data <- mice(data, m=34, method='cart', printFlag=FALSE)

xyplot(imp.data, TARGET_WINS ~ TEAM_BASERUN_CS,cex=0.5)

densityplot(imp.data, ~ TEAM_BASERUN_CS)

imp.data_mean <- mice(data, 
                           m=34, 
                           defaultMethod='mean',
                           printFlag=FALSE)

xyplot(imp.data_mean, TARGET_WINS ~ TEAM_BASERUN_CS,cex=0.5)

densityplot(imp.data_mean, ~ TEAM_BASERUN_CS)

data <- complete(imp.data_mean)
head(data)
##   TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1          39           1445             194              39
## 2          70           1339             219              22
## 3          86           1377             232              35
## 4          70           1387             209              38
## 5          82           1297             186              27
## 6          75           1279             200              36
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1              13             143             842             136
## 2             190             685            1075              37
## 3             137             602             917              46
## 4              96             451             922              43
## 5             102             472             920              49
## 6              92             443             973             107
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1        52.80386               53            9364               84
## 2        28.00000               45            1347              191
## 3        27.00000               48            1377              137
## 4        30.00000               64            1396               97
## 5        39.00000               62            1297              102
## 6        59.00000               49            1279               92
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              927             5456            1011              113
## 2              689             1082             193              155
## 3              602              917             175              153
## 4              454              928             164              156
## 5              472              920             138              168
## 6              443              973             123              149
data$TEAM_BATTING_HBP <- TEAM_BATTING_HBP_value

imp.data_mean <- mice(data, 
                           m=92, 
                           defaultMethod='mean',
                           printFlag=FALSE)

xyplot(imp.data_mean, TARGET_WINS ~ TEAM_BASERUN_CS,cex=0.5)

#densityplot(imp.data_mean, ~ TEAM_BASERUN_CS)

data <- complete(imp.data_mean)
head(data)
##   TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1          39           1445             194              39
## 2          70           1339             219              22
## 3          86           1377             232              35
## 4          70           1387             209              38
## 5          82           1297             186              27
## 6          75           1279             200              36
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1              13             143             842             136
## 2             190             685            1075              37
## 3             137             602             917              46
## 4              96             451             922              43
## 5             102             472             920              49
## 6              92             443             973             107
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1        52.80386         59.35602            9364               84
## 2        28.00000         59.35602            1347              191
## 3        27.00000         59.35602            1377              137
## 4        30.00000         59.35602            1396               97
## 5        39.00000         59.35602            1297              102
## 6        59.00000         59.35602            1279               92
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              927             5456            1011              113
## 2              689             1082             193              155
## 3              602              917             175              153
## 4              454              928             164              156
## 5              472              920             138              168
## 6              443              973             123              149
#Confirm no NAs
count_nas(data)
##    variable_name_column number_missing_column percentage
## 1        TEAM_BATTING_H                     0          0
## 2       TEAM_BATTING_2B                     0          0
## 3       TEAM_BATTING_3B                     0          0
## 4       TEAM_BATTING_HR                     0          0
## 5       TEAM_BATTING_BB                     0          0
## 6       TEAM_BATTING_SO                     0          0
## 7       TEAM_BASERUN_SB                     0          0
## 8       TEAM_BASERUN_CS                     0          0
## 9      TEAM_BATTING_HBP                     0          0
## 10      TEAM_PITCHING_H                     0          0
## 11     TEAM_PITCHING_HR                     0          0
## 12     TEAM_PITCHING_BB                     0          0
## 13     TEAM_PITCHING_SO                     0          0
## 14      TEAM_FIELDING_E                     0          0
## 15     TEAM_FIELDING_DP                     0          0
# histograms and density lines
par(mfrow=c(2,2))
colnames <- dimnames(data)[[2]]
for (i in 2:ncol(data)) {
    n<-max(data[i])
    hist(data[,i], xlim=c(0, n), breaks=seq(0, n, 0.01*n), main=names(data)[i], probability=TRUE, col="gray", border="white")
    d <- density(data[,i])
    lines(d, col="red")
}

# Create separate boxplots for each attribute
par(mfrow=c(1,4))

for(i in 1:ncol(data)) {
    boxplot(data[,i], main=names(data)[i])
}

#verify lineriarity
par(mfrow=c(1,2))
colnames <- dimnames(data)[[2]]
for (i in 2:ncol(data)) {
    plot(data[,1]~data[,i],main=names(data)[i])
    reg_line <- lm(data[,1]~data[,i])
    abline(reg_line,col="red")
}

#verify normal distribution
par(mfrow=c(2,3))

colnames <- dimnames(data)[[2]]
for (i in 2:ncol(data)) {
    qqnorm(data[,i],main=names(data)[i])
    qqline(data[,i])
    
}

#verify multicollinearity
par(mfrow=c(1,1))

corrplot(cor(data), type = "upper", method = "number", tl.cex = 0.5, tl.col="black",number.cex = .5)

#review data
head(data)
##   TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1          39           1445             194              39
## 2          70           1339             219              22
## 3          86           1377             232              35
## 4          70           1387             209              38
## 5          82           1297             186              27
## 6          75           1279             200              36
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1              13             143             842             136
## 2             190             685            1075              37
## 3             137             602             917              46
## 4              96             451             922              43
## 5             102             472             920              49
## 6              92             443             973             107
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1        52.80386         59.35602            9364               84
## 2        28.00000         59.35602            1347              191
## 3        27.00000         59.35602            1377              137
## 4        30.00000         59.35602            1396               97
## 5        39.00000         59.35602            1297              102
## 6        59.00000         59.35602            1279               92
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              927             5456            1011              113
## 2              689             1082             193              155
## 3              602              917             175              153
## 4              454              928             164              156
## 5              472              920             138              168
## 6              443              973             123              149
#confirm no NAs
count_nas(data)
##    variable_name_column number_missing_column percentage
## 1        TEAM_BATTING_H                     0          0
## 2       TEAM_BATTING_2B                     0          0
## 3       TEAM_BATTING_3B                     0          0
## 4       TEAM_BATTING_HR                     0          0
## 5       TEAM_BATTING_BB                     0          0
## 6       TEAM_BATTING_SO                     0          0
## 7       TEAM_BASERUN_SB                     0          0
## 8       TEAM_BASERUN_CS                     0          0
## 9      TEAM_BATTING_HBP                     0          0
## 10      TEAM_PITCHING_H                     0          0
## 11     TEAM_PITCHING_HR                     0          0
## 12     TEAM_PITCHING_BB                     0          0
## 13     TEAM_PITCHING_SO                     0          0
## 14      TEAM_FIELDING_E                     0          0
## 15     TEAM_FIELDING_DP                     0          0
#replace variables that don't satisfy assumptions with their logs
data_final <- data %>% mutate(TEAM_BATTING_3B=ifelse(log(TEAM_BATTING_3B)>0,log(TEAM_BATTING_3B),NA),TEAM_BATTING_3B=ifelse(log(TEAM_BATTING_3B)>0,log(TEAM_BATTING_3B),NA),TEAM_BATTING_HR = ifelse( log(TEAM_BATTING_HR)>0,log(TEAM_BATTING_HR),NA),TEAM_PITCHING_SO = ifelse(log(TEAM_PITCHING_SO)>0,log(TEAM_PITCHING_SO),NA),TEAM_BASERUN_SB=ifelse(log(TEAM_BASERUN_SB)>0,log(TEAM_BASERUN_SB),NA),TEAM_BASERUN_CS = ifelse(log(TEAM_BASERUN_CS)>0,log(TEAM_BASERUN_CS),NA),TEAM_BATTING_HBP = ifelse(log(TEAM_BATTING_HBP)>0,log(TEAM_BATTING_HBP),NA),TEAM_PITCHING_H = ifelse(log(TEAM_PITCHING_H)>0,log(TEAM_PITCHING_H),NA),TEAM_FIELDING_E = ifelse(log(TEAM_FIELDING_E)>0,log(TEAM_FIELDING_E),NA),TEAM_PITCHING_HR = ifelse(log(TEAM_PITCHING_HR)>0,log(TEAM_PITCHING_HR),NA),TEAM_PITCHING_BB = ifelse(log(TEAM_PITCHING_BB)>0,log(TEAM_PITCHING_BB),NA),TEAM_PITCHING_SO = ifelse(log(TEAM_PITCHING_SO)>0,log(TEAM_PITCHING_SO),NA),TEAM_FIELDING_DP = ifelse(log(TEAM_FIELDING_DP)>0,log(TEAM_FIELDING_DP),NA))  

#review data
head(data_final)
##   TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1          39           1445             194        1.298436
## 2          70           1339             219        1.128508
## 3          86           1377             232        1.268453
## 4          70           1387             209        1.291320
## 5          82           1297             186        1.192660
## 6          75           1279             200        1.276345
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1        2.564949             143             842        4.912655
## 2        5.247024             685            1075        3.610918
## 3        4.919981             602             917        3.828641
## 4        4.564348             451             922        3.761200
## 5        4.624973             472             920        3.891820
## 6        4.521789             443             973        4.672829
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1        3.966584         4.083554        9.144628         4.430817
## 2        3.332205         4.083554        7.205635         5.252273
## 3        3.295837         4.083554        7.227662         4.919981
## 4        3.401197         4.083554        7.241366         4.574711
## 5        3.663562         4.083554        7.167809         4.624973
## 6        4.077537         4.083554        7.153834         4.521789
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1         6.831954         2.152282        6.918695         4.727388
## 2         6.535241         1.943989        5.262690         5.043425
## 3         6.400257         1.920022        5.164786         5.030438
## 4         6.118097         1.921768        5.099866         5.049856
## 5         6.156979         1.920501        4.927254         5.123964
## 6         6.093570         1.928674        4.812184         5.003946
#dealing with Nans that were created by log
for (i in 2:ncol(data_final)){
  data_final[i][is.na(data_final[i]) ] <- NA
  data_final[i][which(data_final[i]==Inf)] <- NA
}

data_final <- mice(data_final, m=4, method='cart', printFlag=FALSE)
data_final <- complete(data_final)


model.null = lm(TARGET_WINS ~ 1,data=data_final)
model.full = lm(TARGET_WINS ~ .,data=data_final)  
     
x <- step(model.null,
     scope = list(upper=model.full),
             direction = "both",
             data = data_final)
## Start:  AIC=12550.76
## TARGET_WINS ~ 1
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_BATTING_H    1     85318 479178 12180
## + TEAM_BATTING_2B   1     47181 517315 12354
## + TEAM_BATTING_BB   1     30530 533966 12426
## + TEAM_BATTING_HR   1     15422 549075 12490
## + TEAM_PITCHING_BB  1     14964 549533 12492
## + TEAM_PITCHING_HR  1     14016 550481 12496
## + TEAM_FIELDING_E   1     12846 551650 12500
## + TEAM_BASERUN_SB   1      8357 556139 12519
## + TEAM_BATTING_3B   1      6776 557720 12525
## + TEAM_PITCHING_SO  1      5656 558841 12530
## <none>                          564496 12551
## + TEAM_BATTING_SO   1       358 564138 12551
## + TEAM_BATTING_HBP  1       180 564316 12552
## + TEAM_FIELDING_DP  1        88 564408 12552
## + TEAM_BASERUN_CS   1        74 564422 12552
## + TEAM_PITCHING_H   1        66 564430 12552
## 
## Step:  AIC=12179.81
## TARGET_WINS ~ TEAM_BATTING_H
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_BATTING_BB   1     38578 440601 11991
## + TEAM_FIELDING_E   1     35701 443478 12006
## + TEAM_PITCHING_H   1     35085 444093 12009
## + TEAM_BATTING_HR   1     15007 464172 12109
## + TEAM_BATTING_SO   1     14144 465034 12114
## + TEAM_PITCHING_BB  1     10258 468920 12133
## + TEAM_PITCHING_HR  1      8083 471095 12143
## + TEAM_BASERUN_SB   1      4304 474875 12161
## + TEAM_BATTING_2B   1      4082 475097 12162
## + TEAM_PITCHING_SO  1      3332 475847 12166
## <none>                          479178 12180
## + TEAM_BATTING_3B   1       400 478778 12180
## + TEAM_BATTING_HBP  1       228 478951 12181
## + TEAM_FIELDING_DP  1        34 479145 12182
## + TEAM_BASERUN_CS   1         3 479175 12182
## - TEAM_BATTING_H    1     85318 564496 12551
## 
## Step:  AIC=11990.78
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_BASERUN_SB   1     14643 425958 11916
## + TEAM_FIELDING_DP  1     11083 429518 11935
## + TEAM_PITCHING_H   1      7667 432934 11953
## + TEAM_FIELDING_E   1      7571 433029 11953
## + TEAM_PITCHING_BB  1      4102 436499 11972
## + TEAM_BATTING_SO   1      2373 438228 11980
## + TEAM_PITCHING_SO  1      1375 439226 11986
## + TEAM_BATTING_3B   1      1230 439371 11986
## + TEAM_BASERUN_CS   1       711 439890 11989
## <none>                          440601 11991
## + TEAM_BATTING_HBP  1       242 440359 11992
## + TEAM_BATTING_HR   1       139 440462 11992
## + TEAM_BATTING_2B   1        52 440548 11992
## + TEAM_PITCHING_HR  1         4 440596 11993
## - TEAM_BATTING_BB   1     38578 479178 12180
## - TEAM_BATTING_H    1     93365 533966 12426
## 
## Step:  AIC=11915.85
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_FIELDING_E   1     27097 398861 11768
## + TEAM_PITCHING_H   1     10308 415651 11862
## + TEAM_PITCHING_BB  1      8575 417383 11872
## + TEAM_BATTING_HR   1      5476 420483 11888
## + TEAM_BATTING_SO   1      3376 422582 11900
## + TEAM_PITCHING_HR  1      3006 422952 11902
## + TEAM_FIELDING_DP  1      2697 423261 11903
## + TEAM_PITCHING_SO  1      1012 424946 11912
## + TEAM_BASERUN_CS   1       865 425093 11913
## <none>                          425958 11916
## + TEAM_BATTING_HBP  1       220 425738 11917
## + TEAM_BATTING_2B   1       189 425769 11917
## + TEAM_BATTING_3B   1        44 425914 11918
## - TEAM_BASERUN_SB   1     14643 440601 11991
## - TEAM_BATTING_BB   1     48917 474875 12161
## - TEAM_BATTING_H    1     87575 513534 12339
## 
## Step:  AIC=11768.25
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_FIELDING_DP  1     16088 382773 11676
## + TEAM_BATTING_3B   1      7233 391628 11729
## + TEAM_BATTING_2B   1      4497 394363 11744
## + TEAM_BASERUN_CS   1      2836 396025 11754
## + TEAM_BATTING_SO   1      2551 396310 11756
## + TEAM_PITCHING_BB  1      1614 397247 11761
## + TEAM_BATTING_HR   1      1241 397620 11763
## + TEAM_PITCHING_HR  1       977 397884 11765
## + TEAM_PITCHING_SO  1       797 398064 11766
## + TEAM_PITCHING_H   1       740 398121 11766
## + TEAM_BATTING_HBP  1       508 398353 11767
## <none>                          398861 11768
## - TEAM_BATTING_BB   1      7316 406177 11808
## - TEAM_FIELDING_E   1     27097 425958 11916
## - TEAM_BASERUN_SB   1     34169 433029 11953
## - TEAM_BATTING_H    1    106965 505826 12307
## 
## Step:  AIC=11676.55
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_BATTING_3B   1      7533 375239 11633
## + TEAM_BATTING_2B   1      4020 378753 11654
## + TEAM_BATTING_SO   1      3193 379580 11660
## + TEAM_BASERUN_CS   1      1470 381303 11670
## + TEAM_PITCHING_BB  1       880 381893 11673
## + TEAM_PITCHING_SO  1       573 382199 11675
## + TEAM_PITCHING_H   1       516 382256 11676
## + TEAM_BATTING_HBP  1       471 382301 11676
## <none>                          382773 11676
## + TEAM_BATTING_HR   1       314 382458 11677
## + TEAM_PITCHING_HR  1       199 382573 11677
## - TEAM_BATTING_BB   1     11459 394231 11742
## - TEAM_FIELDING_DP  1     16088 398861 11768
## - TEAM_BASERUN_SB   1     17795 400568 11778
## - TEAM_FIELDING_E   1     40488 423261 11903
## - TEAM_BATTING_H    1    115808 498581 12276
## 
## Step:  AIC=11633.31
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_BASERUN_CS   1      2917 372322 11618
## + TEAM_BATTING_2B   1      2458 372782 11620
## + TEAM_BATTING_SO   1       678 374561 11631
## + TEAM_BATTING_HBP  1       611 374629 11632
## + TEAM_PITCHING_HR  1       585 374654 11632
## + TEAM_PITCHING_BB  1       337 374902 11633
## <none>                          375239 11633
## + TEAM_BATTING_HR   1       184 375055 11634
## + TEAM_PITCHING_SO  1        68 375172 11635
## + TEAM_PITCHING_H   1        42 375198 11635
## - TEAM_BATTING_3B   1      7533 382773 11676
## - TEAM_BATTING_BB   1      8921 384160 11685
## - TEAM_BASERUN_SB   1     15473 390712 11723
## - TEAM_FIELDING_DP  1     16389 391628 11729
## - TEAM_FIELDING_E   1     47755 422994 11904
## - TEAM_BATTING_H    1     94807 470046 12144
## 
## Step:  AIC=11617.55
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_BATTING_2B   1      2642 369680 11603
## + TEAM_BATTING_SO   1      1624 370698 11610
## + TEAM_BATTING_HBP  1       676 371646 11615
## + TEAM_PITCHING_BB  1       437 371885 11617
## <none>                          372322 11618
## + TEAM_PITCHING_HR  1       217 372105 11618
## + TEAM_PITCHING_SO  1        27 372295 11619
## + TEAM_BATTING_HR   1        23 372300 11619
## + TEAM_PITCHING_H   1        12 372310 11620
## - TEAM_BASERUN_CS   1      2917 375239 11633
## - TEAM_BATTING_BB   1      7465 379788 11661
## - TEAM_BATTING_3B   1      8981 381303 11670
## - TEAM_FIELDING_DP  1     14561 386883 11703
## - TEAM_BASERUN_SB   1     18327 390649 11725
## - TEAM_FIELDING_E   1     50294 422616 11904
## - TEAM_BATTING_H    1     93208 465531 12124
## 
## Step:  AIC=11603.34
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_2B
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_BATTING_SO   1       872 368808 11600
## + TEAM_BATTING_HBP  1       629 369051 11602
## + TEAM_PITCHING_HR  1       510 369170 11602
## <none>                          369680 11603
## + TEAM_PITCHING_BB  1       222 369458 11604
## + TEAM_BATTING_HR   1       135 369545 11604
## + TEAM_PITCHING_SO  1        94 369586 11605
## + TEAM_PITCHING_H   1        87 369592 11605
## - TEAM_BATTING_2B   1      2642 372322 11618
## - TEAM_BASERUN_CS   1      3102 372782 11620
## - TEAM_BATTING_3B   1      7284 376964 11646
## - TEAM_BATTING_BB   1      7701 377381 11648
## - TEAM_FIELDING_DP  1     14105 383785 11687
## - TEAM_BASERUN_SB   1     19504 389184 11718
## - TEAM_FIELDING_E   1     52506 422186 11904
## - TEAM_BATTING_H    1     64604 434284 11968
## 
## Step:  AIC=11599.96
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_2B + TEAM_BATTING_SO
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_PITCHING_SO  1      3191 365617 11582
## + TEAM_PITCHING_HR  1      1271 367537 11594
## + TEAM_BATTING_HBP  1       642 368166 11598
## + TEAM_BATTING_HR   1       602 368206 11598
## <none>                          368808 11600
## + TEAM_PITCHING_BB  1       201 368607 11601
## + TEAM_PITCHING_H   1        49 368759 11602
## - TEAM_BATTING_SO   1       872 369680 11603
## - TEAM_BATTING_2B   1      1890 370698 11610
## - TEAM_BASERUN_CS   1      3741 372549 11621
## - TEAM_BATTING_3B   1      5133 373940 11629
## - TEAM_BATTING_BB   1      7530 376338 11644
## - TEAM_FIELDING_DP  1     14271 383079 11684
## - TEAM_BASERUN_SB   1     19925 388733 11718
## - TEAM_FIELDING_E   1     48836 417644 11881
## - TEAM_BATTING_H    1     50541 419349 11890
## 
## Step:  AIC=11582.18
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_2B + TEAM_BATTING_SO + TEAM_PITCHING_SO
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_PITCHING_BB  1      5368 360249 11550
## + TEAM_PITCHING_H   1      2006 363611 11572
## + TEAM_BATTING_HBP  1       689 364928 11580
## + TEAM_BATTING_HR   1       443 365173 11581
## + TEAM_PITCHING_HR  1       416 365201 11582
## <none>                          365617 11582
## - TEAM_BATTING_2B   1      2778 368395 11597
## - TEAM_BASERUN_CS   1      3130 368747 11600
## - TEAM_PITCHING_SO  1      3191 368808 11600
## - TEAM_BATTING_SO   1      3969 369586 11605
## - TEAM_BATTING_3B   1      5752 371369 11616
## - TEAM_BATTING_BB   1      9027 374644 11636
## - TEAM_FIELDING_DP  1     15955 381572 11677
## - TEAM_BASERUN_SB   1     18554 384171 11693
## - TEAM_FIELDING_E   1     50816 416433 11876
## - TEAM_BATTING_H    1     53680 419297 11892
## 
## Step:  AIC=11550.52
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_2B + TEAM_BATTING_SO + TEAM_PITCHING_SO + TEAM_PITCHING_BB
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_BATTING_HBP  1       738 359510 11548
## + TEAM_PITCHING_HR  1       495 359754 11549
## <none>                          360249 11550
## + TEAM_BATTING_HR   1       267 359982 11551
## + TEAM_PITCHING_H   1       136 360113 11552
## - TEAM_BASERUN_CS   1      2849 363098 11566
## - TEAM_BATTING_2B   1      2901 363150 11567
## - TEAM_BATTING_3B   1      5197 365446 11581
## - TEAM_PITCHING_BB  1      5368 365617 11582
## - TEAM_PITCHING_SO  1      8358 368607 11601
## - TEAM_BATTING_SO   1      8892 369141 11604
## - TEAM_BATTING_BB   1     12290 372539 11625
## - TEAM_FIELDING_DP  1     16231 376480 11649
## - TEAM_BASERUN_SB   1     18854 379103 11665
## - TEAM_FIELDING_E   1     47377 407626 11830
## - TEAM_BATTING_H    1     58491 418739 11891
## 
## Step:  AIC=11547.85
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_2B + TEAM_BATTING_SO + TEAM_PITCHING_SO + TEAM_PITCHING_BB + 
##     TEAM_BATTING_HBP
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_PITCHING_HR  1       504 359006 11547
## <none>                          359510 11548
## + TEAM_BATTING_HR   1       273 359237 11548
## + TEAM_PITCHING_H   1       128 359383 11549
## - TEAM_BATTING_HBP  1       738 360249 11550
## - TEAM_BATTING_2B   1      2853 362364 11564
## - TEAM_BASERUN_CS   1      2913 362424 11564
## - TEAM_BATTING_3B   1      5332 364843 11579
## - TEAM_PITCHING_BB  1      5417 364928 11580
## - TEAM_PITCHING_SO  1      8456 367966 11599
## - TEAM_BATTING_SO   1      9002 368512 11602
## - TEAM_BATTING_BB   1     12260 371770 11622
## - TEAM_FIELDING_DP  1     16193 375704 11646
## - TEAM_BASERUN_SB   1     19014 378524 11663
## - TEAM_FIELDING_E   1     47831 407341 11830
## - TEAM_BATTING_H    1     58433 417943 11889
## 
## Step:  AIC=11546.65
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_2B + TEAM_BATTING_SO + TEAM_PITCHING_SO + TEAM_PITCHING_BB + 
##     TEAM_BATTING_HBP + TEAM_PITCHING_HR
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_BATTING_HR   1       446 358560 11546
## <none>                          359006 11547
## + TEAM_PITCHING_H   1       257 358749 11547
## - TEAM_PITCHING_HR  1       504 359510 11548
## - TEAM_BATTING_HBP  1       748 359754 11549
## - TEAM_BASERUN_CS   1      2728 361734 11562
## - TEAM_BATTING_2B   1      2899 361905 11563
## - TEAM_PITCHING_BB  1      5497 364503 11579
## - TEAM_BATTING_3B   1      5831 364837 11581
## - TEAM_PITCHING_SO  1      7368 366375 11591
## - TEAM_BATTING_SO   1      9064 368070 11601
## - TEAM_BATTING_BB   1     11897 370903 11619
## - TEAM_FIELDING_DP  1     16695 375701 11648
## - TEAM_BASERUN_SB   1     19479 378485 11665
## - TEAM_FIELDING_E   1     44704 403710 11812
## - TEAM_BATTING_H    1     44865 403871 11813
## 
## Step:  AIC=11545.83
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_2B + TEAM_BATTING_SO + TEAM_PITCHING_SO + TEAM_PITCHING_BB + 
##     TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_BATTING_HR
## 
##                    Df Sum of Sq    RSS   AIC
## + TEAM_PITCHING_H   1      1476 357084 11538
## <none>                          358560 11546
## - TEAM_BATTING_HR   1       446 359006 11547
## - TEAM_PITCHING_HR  1       677 359237 11548
## - TEAM_BATTING_HBP  1       755 359315 11549
## - TEAM_BASERUN_CS   1      2695 361255 11561
## - TEAM_BATTING_2B   1      2735 361295 11561
## - TEAM_PITCHING_SO  1      5138 363699 11576
## - TEAM_PITCHING_BB  1      5916 364476 11581
## - TEAM_BATTING_3B   1      6273 364834 11583
## - TEAM_BATTING_SO   1      6452 365012 11584
## - TEAM_BATTING_BB   1     11593 370153 11616
## - TEAM_FIELDING_DP  1     15937 374497 11643
## - TEAM_BASERUN_SB   1     19855 378415 11666
## - TEAM_BATTING_H    1     39964 398524 11784
## - TEAM_FIELDING_E   1     43142 401702 11802
## 
## Step:  AIC=11538.44
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_2B + TEAM_BATTING_SO + TEAM_PITCHING_SO + TEAM_PITCHING_BB + 
##     TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_BATTING_HR + TEAM_PITCHING_H
## 
##                    Df Sum of Sq    RSS   AIC
## <none>                          357084 11538
## - TEAM_BATTING_HBP  1       733 357817 11541
## - TEAM_PITCHING_H   1      1476 358560 11546
## - TEAM_BATTING_HR   1      1665 358749 11547
## - TEAM_PITCHING_HR  1      2012 359096 11549
## - TEAM_BATTING_2B   1      2437 359521 11552
## - TEAM_BASERUN_CS   1      2733 359817 11554
## - TEAM_PITCHING_BB  1      3400 360484 11558
## - TEAM_BATTING_3B   1      4711 361795 11566
## - TEAM_PITCHING_SO  1      5882 362966 11574
## - TEAM_BATTING_BB   1      7370 364454 11583
## - TEAM_BATTING_SO   1      7585 364670 11584
## - TEAM_FIELDING_DP  1     16461 373545 11639
## - TEAM_BASERUN_SB   1     18908 375993 11654
## - TEAM_BATTING_H    1     38248 395332 11768
## - TEAM_FIELDING_E   1     39343 396427 11774
#build final model
final_model <- lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
    TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
    TEAM_BATTING_2B + TEAM_PITCHING_BB + TEAM_PITCHING_H + TEAM_BATTING_HBP + 
    TEAM_PITCHING_HR + TEAM_BATTING_HR + TEAM_BATTING_SO + TEAM_PITCHING_SO, data = data_final)

#verify linearity
plot(final_model)

#summary statistics
summary(final_model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + 
##     TEAM_BASERUN_SB + TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + 
##     TEAM_BASERUN_CS + TEAM_BATTING_2B + TEAM_PITCHING_BB + TEAM_PITCHING_H + 
##     TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_BATTING_HR + TEAM_BATTING_SO + 
##     TEAM_PITCHING_SO, data = data_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.657  -8.132  -0.096   8.044  49.910 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       36.589803  35.888552   1.020 0.308056    
## TEAM_BATTING_H     0.061233   0.003936  15.559  < 2e-16 ***
## TEAM_BATTING_BB    0.046931   0.006872   6.830 1.09e-11 ***
## TEAM_BASERUN_SB    6.602591   0.603558  10.939  < 2e-16 ***
## TEAM_FIELDING_E  -17.009771   1.077948 -15.780  < 2e-16 ***
## TEAM_FIELDING_DP -18.385002   1.801216 -10.207  < 2e-16 ***
## TEAM_BATTING_3B   18.089543   3.312764   5.461 5.27e-08 ***
## TEAM_BASERUN_CS   -3.996066   0.960769  -4.159 3.31e-05 ***
## TEAM_BATTING_2B   -0.034629   0.008818  -3.927 8.86e-05 ***
## TEAM_PITCHING_BB -12.690867   2.735800  -4.639 3.70e-06 ***
## TEAM_PITCHING_H   -9.785024   3.201368  -3.057 0.002265 ** 
## TEAM_BATTING_HBP   8.751096   4.063441   2.154 0.031376 *  
## TEAM_PITCHING_HR  12.128606   3.398826   3.568 0.000367 ***
## TEAM_BATTING_HR  -10.830901   3.336164  -3.247 0.001185 ** 
## TEAM_BATTING_SO   -0.029374   0.004239  -6.929 5.51e-12 ***
## TEAM_PITCHING_SO 111.406518  18.259114   6.101 1.23e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.57 on 2260 degrees of freedom
## Multiple R-squared:  0.3674, Adjusted R-squared:  0.3632 
## F-statistic: 87.51 on 15 and 2260 DF,  p-value: < 2.2e-16
#verify nearly normal residuals
par(mfrow=c(1,2))
hist(final_model$residuals)
qqnorm(final_model$residuals)
qqline(final_model$residuals) 

#verify constant variability.
plot(jitter(final_model$residuals,100) ~ jitter(data_final$TARGET_WINS,100)) 
abline(h = 0, lty = 3)  # adds a horizontal dashed line at y = 0

reduced_model <- lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
    TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + TEAM_BATTING_2B + TEAM_PITCHING_BB, data = data_final)

anova(reduced_model,final_model)
## Analysis of Variance Table
## 
## Model 1: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_2B + TEAM_PITCHING_BB
## Model 2: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_2B + TEAM_PITCHING_BB + TEAM_PITCHING_H + TEAM_BATTING_HBP + 
##     TEAM_PITCHING_HR + TEAM_BATTING_HR + TEAM_BATTING_SO + TEAM_PITCHING_SO
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   2266 369458                                  
## 2   2260 357084  6     12373 13.052 1.449e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(final_model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + 
##     TEAM_BASERUN_SB + TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + 
##     TEAM_BASERUN_CS + TEAM_BATTING_2B + TEAM_PITCHING_BB + TEAM_PITCHING_H + 
##     TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_BATTING_HR + TEAM_BATTING_SO + 
##     TEAM_PITCHING_SO, data = data_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.657  -8.132  -0.096   8.044  49.910 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       36.589803  35.888552   1.020 0.308056    
## TEAM_BATTING_H     0.061233   0.003936  15.559  < 2e-16 ***
## TEAM_BATTING_BB    0.046931   0.006872   6.830 1.09e-11 ***
## TEAM_BASERUN_SB    6.602591   0.603558  10.939  < 2e-16 ***
## TEAM_FIELDING_E  -17.009771   1.077948 -15.780  < 2e-16 ***
## TEAM_FIELDING_DP -18.385002   1.801216 -10.207  < 2e-16 ***
## TEAM_BATTING_3B   18.089543   3.312764   5.461 5.27e-08 ***
## TEAM_BASERUN_CS   -3.996066   0.960769  -4.159 3.31e-05 ***
## TEAM_BATTING_2B   -0.034629   0.008818  -3.927 8.86e-05 ***
## TEAM_PITCHING_BB -12.690867   2.735800  -4.639 3.70e-06 ***
## TEAM_PITCHING_H   -9.785024   3.201368  -3.057 0.002265 ** 
## TEAM_BATTING_HBP   8.751096   4.063441   2.154 0.031376 *  
## TEAM_PITCHING_HR  12.128606   3.398826   3.568 0.000367 ***
## TEAM_BATTING_HR  -10.830901   3.336164  -3.247 0.001185 ** 
## TEAM_BATTING_SO   -0.029374   0.004239  -6.929 5.51e-12 ***
## TEAM_PITCHING_SO 111.406518  18.259114   6.101 1.23e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.57 on 2260 degrees of freedom
## Multiple R-squared:  0.3674, Adjusted R-squared:  0.3632 
## F-statistic: 87.51 on 15 and 2260 DF,  p-value: < 2.2e-16
#read evaluation dataset
data_eval <- read.csv(file="https://cdn-stage.fedweb.org/fed-2/2/moneyball-evaluation-data.csv",stringsAsFactors=T, header=T)
#count missing values
count_nas(data_eval)
##    variable_name_column number_missing_column percentage
## 1      TEAM_BATTING_HBP                   240         93
## 2       TEAM_BASERUN_CS                    87         34
## 3      TEAM_FIELDING_DP                    31         12
## 4       TEAM_BATTING_SO                    18          7
## 5      TEAM_PITCHING_SO                    18          7
## 6       TEAM_BASERUN_SB                    13          5
## 7        TEAM_BATTING_H                     0          0
## 8       TEAM_BATTING_2B                     0          0
## 9       TEAM_BATTING_3B                     0          0
## 10      TEAM_BATTING_HR                     0          0
## 11      TEAM_BATTING_BB                     0          0
## 12      TEAM_PITCHING_H                     0          0
## 13     TEAM_PITCHING_HR                     0          0
## 14     TEAM_PITCHING_BB                     0          0
## 15      TEAM_FIELDING_E                     0          0
#replace certain variables with their logs
data_eval <- data_eval %>% mutate(TEAM_BATTING_3B=ifelse(log(TEAM_BATTING_3B)>0,log(TEAM_BATTING_3B),NA),TEAM_BATTING_3B=ifelse(log(TEAM_BATTING_3B)>0,log(TEAM_BATTING_3B),NA),TEAM_BATTING_HR = ifelse( log(TEAM_BATTING_HR)>0,log(TEAM_BATTING_HR),NA),TEAM_PITCHING_SO = ifelse(log(TEAM_PITCHING_SO)>0,log(TEAM_PITCHING_SO),NA),TEAM_BASERUN_SB=ifelse(log(TEAM_BASERUN_SB)>0,log(TEAM_BASERUN_SB),NA),TEAM_BASERUN_CS = ifelse(log(TEAM_BASERUN_CS)>0,log(TEAM_BASERUN_CS),NA),TEAM_BATTING_HBP = ifelse(log(TEAM_BATTING_HBP)>0,log(TEAM_BATTING_HBP),NA),TEAM_PITCHING_H = ifelse(log(TEAM_PITCHING_H)>0,log(TEAM_PITCHING_H),NA),TEAM_FIELDING_E = ifelse(log(TEAM_FIELDING_E)>0,log(TEAM_FIELDING_E),NA),TEAM_PITCHING_HR = ifelse(log(TEAM_PITCHING_HR)>0,log(TEAM_PITCHING_HR),NA),TEAM_PITCHING_BB = ifelse(log(TEAM_PITCHING_BB)>0,log(TEAM_PITCHING_BB),NA),TEAM_PITCHING_SO = ifelse(log(TEAM_PITCHING_SO)>0,log(TEAM_PITCHING_SO),NA),TEAM_FIELDING_DP = ifelse(log(TEAM_FIELDING_DP)>0,log(TEAM_FIELDING_DP),NA))  

#impute missing values
imp.data <- mice(data_eval, m=4, method='cart', printFlag=FALSE)
data_eval  <- complete(imp.data)

#count missing values
count_nas(data_eval)
##    variable_name_column number_missing_column percentage
## 1        TEAM_BATTING_H                     0          0
## 2       TEAM_BATTING_2B                     0          0
## 3       TEAM_BATTING_3B                     0          0
## 4       TEAM_BATTING_HR                     0          0
## 5       TEAM_BATTING_BB                     0          0
## 6       TEAM_BATTING_SO                     0          0
## 7       TEAM_BASERUN_SB                     0          0
## 8       TEAM_BASERUN_CS                     0          0
## 9      TEAM_BATTING_HBP                     0          0
## 10      TEAM_PITCHING_H                     0          0
## 11     TEAM_PITCHING_HR                     0          0
## 12     TEAM_PITCHING_BB                     0          0
## 13     TEAM_PITCHING_SO                     0          0
## 14      TEAM_FIELDING_E                     0          0
## 15     TEAM_FIELDING_DP                     0          0
#merge imputed values
data <- complete(imp.data)
head(data)
##   INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 1     9           1209             170        1.251765        4.418841
## 2    10           1221             151        1.214110        4.477337
## 3    14           1395             183        1.214110        4.532599
## 4    47           1539             309        1.214110        5.068904
## 5    60           1445             203        1.439718        1.609438
## 6    63           1431             236        1.378840        2.302585
##   TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## 1             447            1080        4.127134        3.912023
## 2             516             929        3.988984        3.663562
## 3             509             816        4.077537        3.850148
## 4             486             914        4.997212        4.043051
## 5              95             416        5.023881        4.330733
## 6             215             377        5.697093        4.477337
##   TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## 1         4.204693        7.097549         4.418841         6.102559
## 2         3.970292        7.107425         4.477337         6.246107
## 3         3.951244        7.240650         4.532599         6.232448
## 4         3.737670        7.338888         5.068904         6.186209
## 5         4.143135        8.269245         2.639057         5.549076
## 6         4.204693        7.934872         2.995732         6.040255
##   TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1         1.943724        4.941642         5.049856
## 2         1.921926        4.905275         5.099866
## 3         1.902766        5.049856         5.030438
## 4         1.919541        4.820282         5.036953
## 5         1.949299        6.423247         4.867534
## 6         1.887256        6.349139         4.653960
TARGET_WINS_pred<-predict(final_model, newdata=data_eval)

pred_data<-cbind(TARGET_WINS_pred,data_eval) 
pred_data$TARGET_WINS_pred <- round(pred_data$TARGET_WINS_pred,0)


pred_data <- pred_data %>% select(INDEX,TARGET_WINS_pred)
head(pred_data)
##   INDEX TARGET_WINS_pred
## 1     9               64
## 2    10               65
## 3    14               73
## 4    47               83
## 5    60               70
## 6    63               70
write.csv(pred_data,file="moneyball-prediction.csv")