# read the training data set (the file name in the URL says "training")
data <- read.csv(
  file = "https://cdn-stage.fedweb.org/fed-2/2/moneyball-training-data.csv",
  stringsAsFactors = TRUE,  # spelled out: T is an ordinary, reassignable variable
  header = TRUE
)
# display the first six rows
head(data)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 1 39 1445 194 39
## 2 2 70 1339 219 22
## 3 3 86 1377 232 35
## 4 4 70 1387 209 38
## 5 5 82 1297 186 27
## 6 6 75 1279 200 36
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 13 143 842 NA
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 NA NA 9364 84
## 2 28 NA 1347 191
## 3 27 NA 1377 137
## 4 30 NA 1396 97
## 5 39 NA 1297 102
## 6 59 NA 1279 92
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 NA
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
# find dimensions (number of rows, number of columns)
dim(data)
## [1] 2276 17
# chart of missing values; data[-1] leaves out the first column (INDEX)
aggr(
  data[-1],
  prop = TRUE,     # TRUE/FALSE spelled out: T and F can be reassigned
  numbers = TRUE,
  cex.axis = 0.4,
  ylab = c("Proportion of missingness", "Missingness Pattern"),
  labels = names(data[-1])
)

#' Count missing values per column.
#'
#' Builds one row per column of `data` with the number of `NA` entries and
#' that number as a whole-number percentage of the rows, sorted from the
#' highest percentage to the lowest (ties keep their column order).
#'
#' A column named "INDEX" is treated as a row identifier and left out. The
#' identifier is skipped by name rather than by position, so a data frame that
#' no longer carries INDEX still gets its first column reported.
#'
#' @param data A data frame.
#' @return A data frame with the columns `variable_name_column`,
#'   `number_missing_column` and `percentage`; zero rows when there is no
#'   column to report.
count_nas <- function(data) {
  # positions of the columns to report (everything except the identifier)
  reported <- which(names(data) != "INDEX")
  variable_name_column <- names(data)[reported]
  # vapply instead of growing vectors with c() inside a loop
  number_missing_column <- vapply(
    reported,
    function(position) sum(is.na(data[[position]])),
    integer(1)
  )
  missing_table <- data.frame(variable_name_column, number_missing_column)
  missing_table$percentage <- round(number_missing_column * 100 / nrow(data), 0)
  # order() leaves ties in their original order, like a stable descending sort
  missing_table <- missing_table[order(-missing_table$percentage), , drop = FALSE]
  rownames(missing_table) <- NULL
  missing_table
}
# count NAs per column of the raw training data; the table printed below is
# sorted by the percentage of missing values, highest first
count_nas(data)
## variable_name_column number_missing_column percentage
## 1 TEAM_BATTING_HBP 2085 92
## 2 TEAM_BASERUN_CS 772 34
## 3 TEAM_FIELDING_DP 286 13
## 4 TEAM_BASERUN_SB 131 6
## 5 TEAM_BATTING_SO 102 4
## 6 TEAM_PITCHING_SO 102 4
## 7 TARGET_WINS 0 0
## 8 TEAM_BATTING_H 0 0
## 9 TEAM_BATTING_2B 0 0
## 10 TEAM_BATTING_3B 0 0
## 11 TEAM_BATTING_HR 0 0
## 12 TEAM_BATTING_BB 0 0
## 13 TEAM_PITCHING_H 0 0
## 14 TEAM_PITCHING_HR 0 0
## 15 TEAM_PITCHING_BB 0 0
## 16 TEAM_FIELDING_E 0 0
# keep untouched copies of four incomplete columns; further down each one is
# put back into `data` and imputed again in its own pass
TEAM_BASERUN_SB_value <- data[["TEAM_BASERUN_SB"]]
TEAM_FIELDING_DP_value <- data[["TEAM_FIELDING_DP"]]
TEAM_BASERUN_CS_value <- data[["TEAM_BASERUN_CS"]]
TEAM_BATTING_HBP_value <- data[["TEAM_BATTING_HBP"]]
# impute missing data process: INDEX is removed first so that it is not part
# of the data handed to mice()
exclude <- c("INDEX")
include <- setdiff(names(data), exclude)
data <- data[include]
imp.data <- mice(
  data,
  m = 4,
  method = "cart",
  printFlag = FALSE
)
xyplot(
  imp.data,
  TARGET_WINS ~ TEAM_BATTING_SO,
  main = "Imputed TEAM_BATTING_SO",
  cex = 0.5
)

# compare with the original graph (`data` still holds its NAs at this point)
plot(
  TARGET_WINS ~ TEAM_BATTING_SO,
  data = data,
  main = "Original TEAM_BATTING_SO"
)

densityplot(imp.data, ~ TEAM_PITCHING_SO)

xyplot(
  imp.data,
  TARGET_WINS ~ TEAM_BATTING_SO,
  jitter.data = TRUE,
  cex = 0.5
)

densityplot(imp.data, ~ TEAM_BATTING_SO)

# take the completed data set from the imputation object; the rows printed
# below show every column filled in, not only the two strike-out columns
data <- complete(imp.data)
head(data)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 39 1445 194 39
## 2 70 1339 219 22
## 3 86 1377 232 35
## 4 70 1387 209 38
## 5 82 1297 186 27
## 6 75 1279 200 36
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 13 143 842 226
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 66 53 9364 84
## 2 28 45 1347 191
## 3 27 48 1377 137
## 4 30 64 1396 97
## 5 39 62 1297 102
## 6 59 49 1279 92
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 113
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
# put the saved, incomplete TEAM_BASERUN_SB back and impute again
data[["TEAM_BASERUN_SB"]] <- TEAM_BASERUN_SB_value
imp.data <- mice(
  data,
  m = 6,
  method = "cart",
  printFlag = FALSE
)
xyplot(
  imp.data,
  TARGET_WINS ~ TEAM_BASERUN_SB,
  jitter.data = TRUE,
  cex = 0.5
)

densityplot(
  imp.data,
  ~ TEAM_BASERUN_SB,
  jitter.data = TRUE,
  cex = 0.5
)

# replace the data set with the completed version that now carries the
# imputed TEAM_BASERUN_SB values
data <- complete(imp.data)
head(data)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 39 1445 194 39
## 2 70 1339 219 22
## 3 86 1377 232 35
## 4 70 1387 209 38
## 5 82 1297 186 27
## 6 75 1279 200 36
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 13 143 842 136
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 66 53 9364 84
## 2 28 45 1347 191
## 3 27 48 1377 137
## 4 30 64 1396 97
## 5 39 62 1297 102
## 6 59 49 1279 92
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 113
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
# put the saved, incomplete TEAM_FIELDING_DP back and impute again
data$TEAM_FIELDING_DP <- TEAM_FIELDING_DP_value
imp.data <- mice(data, m = 13, method = "cart", printFlag = FALSE)
xyplot(imp.data, TARGET_WINS ~ TEAM_FIELDING_DP, main = "Imputed TEAM_FIELDING_DP", cex = 0.5)

# compare with the original graph; the x variable now matches the title
# (the plot used to draw TEAM_BATTING_SO under a TEAM_FIELDING_DP heading)
plot(TARGET_WINS ~ TEAM_FIELDING_DP, data, main = "Original TEAM_FIELDING_DP")

densityplot(imp.data, ~ TEAM_FIELDING_DP)

# replace the data set with the completed version that now carries the
# imputed TEAM_FIELDING_DP values
data <- complete(imp.data)
head(data)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 39 1445 194 39
## 2 70 1339 219 22
## 3 86 1377 232 35
## 4 70 1387 209 38
## 5 82 1297 186 27
## 6 75 1279 200 36
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 13 143 842 136
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 66 53 9364 84
## 2 28 45 1347 191
## 3 27 48 1377 137
## 4 30 64 1396 97
## 5 39 62 1297 102
## 6 59 49 1279 92
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 113
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
# put the saved, incomplete TEAM_BASERUN_CS back
data[["TEAM_BASERUN_CS"]] <- TEAM_BASERUN_CS_value
# first attempt: "cart" imputation, only inspected through the two plots
imp.data <- mice(data, m = 34, method = "cart", printFlag = FALSE)
xyplot(
  imp.data,
  TARGET_WINS ~ TEAM_BASERUN_CS,
  cex = 0.5
)

densityplot(imp.data, ~ TEAM_BASERUN_CS)

# second attempt: "mean" passed as defaultMethod
imp.data_mean <- mice(data, m = 34, defaultMethod = "mean", printFlag = FALSE)
xyplot(
  imp.data_mean,
  TARGET_WINS ~ TEAM_BASERUN_CS,
  cex = 0.5
)

densityplot(imp.data_mean, ~ TEAM_BASERUN_CS)

# the mean-based imputation is the one written back into the data set
data <- complete(imp.data_mean)
head(data)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 39 1445 194 39
## 2 70 1339 219 22
## 3 86 1377 232 35
## 4 70 1387 209 38
## 5 82 1297 186 27
## 6 75 1279 200 36
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 13 143 842 136
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 52.80386 53 9364 84
## 2 28.00000 45 1347 191
## 3 27.00000 48 1377 137
## 4 30.00000 64 1396 97
## 5 39.00000 62 1297 102
## 6 59.00000 49 1279 92
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 113
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
# put the saved, incomplete TEAM_BATTING_HBP back and impute it with "mean"
# passed as defaultMethod
data$TEAM_BATTING_HBP <- TEAM_BATTING_HBP_value
imp.data_mean <- mice(
  data,
  m = 92,
  defaultMethod = "mean",
  printFlag = FALSE
)
# show the variable imputed in this step; the plot used to show
# TEAM_BASERUN_CS, carried over from the previous step
xyplot(imp.data_mean, TARGET_WINS ~ TEAM_BATTING_HBP, cex = 0.5)

#densityplot(imp.data_mean, ~ TEAM_BATTING_HBP)
data <- complete(imp.data_mean)
head(data)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 39 1445 194 39
## 2 70 1339 219 22
## 3 86 1377 232 35
## 4 70 1387 209 38
## 5 82 1297 186 27
## 6 75 1279 200 36
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 13 143 842 136
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 52.80386 59.35602 9364 84
## 2 28.00000 59.35602 1347 191
## 3 27.00000 59.35602 1377 137
## 4 30.00000 59.35602 1396 97
## 5 39.00000 59.35602 1297 102
## 6 59.00000 59.35602 1279 92
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 113
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
# Confirm no NAs: after the imputation passes above every reported column
# should show 0 missing values
count_nas(data)
## variable_name_column number_missing_column percentage
## 1 TEAM_BATTING_H 0 0
## 2 TEAM_BATTING_2B 0 0
## 3 TEAM_BATTING_3B 0 0
## 4 TEAM_BATTING_HR 0 0
## 5 TEAM_BATTING_BB 0 0
## 6 TEAM_BATTING_SO 0 0
## 7 TEAM_BASERUN_SB 0 0
## 8 TEAM_BASERUN_CS 0 0
## 9 TEAM_BATTING_HBP 0 0
## 10 TEAM_PITCHING_H 0 0
## 11 TEAM_PITCHING_HR 0 0
## 12 TEAM_PITCHING_BB 0 0
## 13 TEAM_PITCHING_SO 0 0
## 14 TEAM_FIELDING_E 0 0
## 15 TEAM_FIELDING_DP 0 0
# histograms and density lines, four panels per page
par(mfrow = c(2, 2))
colnames <- dimnames(data)[[2]]  # not read by this loop; assignment kept as is
# columns 2..ncol; seq_len()[-1] is empty for a one-column data frame, where
# 2:ncol(data) would count down to 1 instead
for (i in seq_len(ncol(data))[-1]) {
  n <- max(data[[i]])  # upper end of the x axis and of the break points
  hist(
    data[, i],
    xlim = c(0, n),
    breaks = seq(0, n, 0.01 * n),  # 100 equal-width bins from 0 to the maximum
    main = names(data)[i],
    xlab = names(data)[i],         # variable name instead of "data[, i]"
    probability = TRUE,            # density scale, so the density line fits
    col = "gray",
    border = "white"
  )
  d <- density(data[, i])
  lines(d, col = "red")
}



# Create separate boxplots for each attribute, four panels per row
par(mfrow = c(1, 4))

# seq_along() yields no iterations for a data frame without columns, where
# 1:ncol(data) would produce c(1, 0)
for (i in seq_along(data)) {
  boxplot(data[, i], main = names(data)[i])
}




# verify linearity: scatter plot of the first column against every other
# column, with the simple regression line drawn on top
par(mfrow = c(1, 2))
colnames <- dimnames(data)[[2]]  # not read by this loop; assignment kept as is
# seq_len()[-1] is empty for a one-column data frame, where 2:ncol(data)
# would count down to 1 instead
for (i in seq_len(ncol(data))[-1]) {
  plot(
    data[, 1] ~ data[, i],
    main = names(data)[i],
    xlab = names(data)[i],  # variable names instead of "data[, i]" / "data[, 1]"
    ylab = names(data)[1]
  )
  reg_line <- lm(data[, 1] ~ data[, i])
  abline(reg_line, col = "red")
}







# verify normal distribution: normal QQ plot with reference line for every
# column after the first, six panels per page
par(mfrow = c(2, 3))

colnames <- dimnames(data)[[2]]  # not read by this loop; assignment kept as is
# seq_len()[-1] is empty for a one-column data frame, where 2:ncol(data)
# would count down to 1 instead
for (i in seq_len(ncol(data))[-1]) {
  qqnorm(data[, i], main = names(data)[i])
  qqline(data[, i])
}


# verify multicollinearity: back to a single panel, then draw the upper
# triangle of the correlation matrix as numbers
par(mfrow = c(1, 1))

corrplot(
  cor(data),
  type = "upper",
  method = "number",
  tl.cex = 0.5,
  tl.col = "black",
  number.cex = 0.5
)

# review data
head(data)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 39 1445 194 39
## 2 70 1339 219 22
## 3 86 1377 232 35
## 4 70 1387 209 38
## 5 82 1297 186 27
## 6 75 1279 200 36
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 13 143 842 136
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 52.80386 59.35602 9364 84
## 2 28.00000 59.35602 1347 191
## 3 27.00000 59.35602 1377 137
## 4 30.00000 59.35602 1396 97
## 5 39.00000 59.35602 1297 102
## 6 59.00000 59.35602 1279 92
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 113
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
# confirm no NAs: second check, made right before the log transformation
count_nas(data)
## variable_name_column number_missing_column percentage
## 1 TEAM_BATTING_H 0 0
## 2 TEAM_BATTING_2B 0 0
## 3 TEAM_BATTING_3B 0 0
## 4 TEAM_BATTING_HR 0 0
## 5 TEAM_BATTING_BB 0 0
## 6 TEAM_BATTING_SO 0 0
## 7 TEAM_BASERUN_SB 0 0
## 8 TEAM_BASERUN_CS 0 0
## 9 TEAM_BATTING_HBP 0 0
## 10 TEAM_PITCHING_H 0 0
## 11 TEAM_PITCHING_HR 0 0
## 12 TEAM_PITCHING_BB 0 0
## 13 TEAM_PITCHING_SO 0 0
## 14 TEAM_FIELDING_E 0 0
## 15 TEAM_FIELDING_DP 0 0
# replace variables that don't satisfy assumptions with their logs
data_final <- local({
  # log(v) where that log is strictly positive, NA everywhere else
  log_or_na <- function(v) ifelse(log(v) > 0, log(v), NA)

  logged_once <- c(
    "TEAM_BATTING_HR", "TEAM_BASERUN_SB", "TEAM_BASERUN_CS",
    "TEAM_BATTING_HBP", "TEAM_PITCHING_H", "TEAM_FIELDING_E",
    "TEAM_PITCHING_HR", "TEAM_PITCHING_BB", "TEAM_FIELDING_DP"
  )
  # NOTE(review): these two columns are listed twice in the transformation, so
  # they end up as log(log(x)); this is kept exactly as it was because the
  # evaluation data is transformed the same way - confirm whether a single
  # log was intended and, if so, change both places together
  logged_twice <- c("TEAM_BATTING_3B", "TEAM_PITCHING_SO")

  transformed <- data
  first_pass <- c(logged_twice, logged_once)
  transformed[first_pass] <- lapply(transformed[first_pass], log_or_na)
  transformed[logged_twice] <- lapply(transformed[logged_twice], log_or_na)
  transformed
})
# review data
head(data_final)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 39 1445 194 1.298436
## 2 70 1339 219 1.128508
## 3 86 1377 232 1.268453
## 4 70 1387 209 1.291320
## 5 82 1297 186 1.192660
## 6 75 1279 200 1.276345
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 2.564949 143 842 4.912655
## 2 5.247024 685 1075 3.610918
## 3 4.919981 602 917 3.828641
## 4 4.564348 451 922 3.761200
## 5 4.624973 472 920 3.891820
## 6 4.521789 443 973 4.672829
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 3.966584 4.083554 9.144628 4.430817
## 2 3.332205 4.083554 7.205635 5.252273
## 3 3.295837 4.083554 7.227662 4.919981
## 4 3.401197 4.083554 7.241366 4.574711
## 5 3.663562 4.083554 7.167809 4.624973
## 6 4.077537 4.083554 7.153834 4.521789
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 6.831954 2.152282 6.918695 4.727388
## 2 6.535241 1.943989 5.262690 5.043425
## 3 6.400257 1.920022 5.164786 5.030438
## 4 6.118097 1.921768 5.099866 5.049856
## 5 6.156979 1.920501 4.927254 5.123964
## 6 6.093570 1.928674 4.812184 5.003946
# dealing with the non-finite values (NaN, Inf, -Inf) that log can create:
# turn them into NA so that they are imputed below
for (i in seq_len(ncol(data_final))[-1]) {
  column <- data_final[[i]]
  # Work on the column vector with a logical mask. Indexing the one-column
  # data frame with which() positions addresses columns, not rows, so
  # infinite values were never replaced that way.
  if (is.numeric(column)) {
    column[!is.finite(column)] <- NA
    data_final[[i]] <- column
  }
}
# impute the NAs left by the transformation; complete() is called with its
# defaults on the imputation object
data_final <- complete(
  mice(data_final, m = 4, method = "cart", printFlag = FALSE)
)
# intercept-only model: starting point of the stepwise search
model.null <- lm(TARGET_WINS ~ 1, data = data_final)
# model with every other column as predictor: upper end of the search scope
model.full <- lm(TARGET_WINS ~ ., data = data_final)
# stepwise search in both directions; the trace is printed below
x <- step(
  model.null,
  scope = list(upper = model.full),
  direction = "both",
  data = data_final
)
## Start: AIC=12550.76
## TARGET_WINS ~ 1
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_H 1 85318 479178 12180
## + TEAM_BATTING_2B 1 47181 517315 12354
## + TEAM_BATTING_BB 1 30530 533966 12426
## + TEAM_BATTING_HR 1 15422 549075 12490
## + TEAM_PITCHING_BB 1 14964 549533 12492
## + TEAM_PITCHING_HR 1 14016 550481 12496
## + TEAM_FIELDING_E 1 12846 551650 12500
## + TEAM_BASERUN_SB 1 8357 556139 12519
## + TEAM_BATTING_3B 1 6776 557720 12525
## + TEAM_PITCHING_SO 1 5656 558841 12530
## <none> 564496 12551
## + TEAM_BATTING_SO 1 358 564138 12551
## + TEAM_BATTING_HBP 1 180 564316 12552
## + TEAM_FIELDING_DP 1 88 564408 12552
## + TEAM_BASERUN_CS 1 74 564422 12552
## + TEAM_PITCHING_H 1 66 564430 12552
##
## Step: AIC=12179.81
## TARGET_WINS ~ TEAM_BATTING_H
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_BB 1 38578 440601 11991
## + TEAM_FIELDING_E 1 35701 443478 12006
## + TEAM_PITCHING_H 1 35085 444093 12009
## + TEAM_BATTING_HR 1 15007 464172 12109
## + TEAM_BATTING_SO 1 14144 465034 12114
## + TEAM_PITCHING_BB 1 10258 468920 12133
## + TEAM_PITCHING_HR 1 8083 471095 12143
## + TEAM_BASERUN_SB 1 4304 474875 12161
## + TEAM_BATTING_2B 1 4082 475097 12162
## + TEAM_PITCHING_SO 1 3332 475847 12166
## <none> 479178 12180
## + TEAM_BATTING_3B 1 400 478778 12180
## + TEAM_BATTING_HBP 1 228 478951 12181
## + TEAM_FIELDING_DP 1 34 479145 12182
## + TEAM_BASERUN_CS 1 3 479175 12182
## - TEAM_BATTING_H 1 85318 564496 12551
##
## Step: AIC=11990.78
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB
##
## Df Sum of Sq RSS AIC
## + TEAM_BASERUN_SB 1 14643 425958 11916
## + TEAM_FIELDING_DP 1 11083 429518 11935
## + TEAM_PITCHING_H 1 7667 432934 11953
## + TEAM_FIELDING_E 1 7571 433029 11953
## + TEAM_PITCHING_BB 1 4102 436499 11972
## + TEAM_BATTING_SO 1 2373 438228 11980
## + TEAM_PITCHING_SO 1 1375 439226 11986
## + TEAM_BATTING_3B 1 1230 439371 11986
## + TEAM_BASERUN_CS 1 711 439890 11989
## <none> 440601 11991
## + TEAM_BATTING_HBP 1 242 440359 11992
## + TEAM_BATTING_HR 1 139 440462 11992
## + TEAM_BATTING_2B 1 52 440548 11992
## + TEAM_PITCHING_HR 1 4 440596 11993
## - TEAM_BATTING_BB 1 38578 479178 12180
## - TEAM_BATTING_H 1 93365 533966 12426
##
## Step: AIC=11915.85
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB
##
## Df Sum of Sq RSS AIC
## + TEAM_FIELDING_E 1 27097 398861 11768
## + TEAM_PITCHING_H 1 10308 415651 11862
## + TEAM_PITCHING_BB 1 8575 417383 11872
## + TEAM_BATTING_HR 1 5476 420483 11888
## + TEAM_BATTING_SO 1 3376 422582 11900
## + TEAM_PITCHING_HR 1 3006 422952 11902
## + TEAM_FIELDING_DP 1 2697 423261 11903
## + TEAM_PITCHING_SO 1 1012 424946 11912
## + TEAM_BASERUN_CS 1 865 425093 11913
## <none> 425958 11916
## + TEAM_BATTING_HBP 1 220 425738 11917
## + TEAM_BATTING_2B 1 189 425769 11917
## + TEAM_BATTING_3B 1 44 425914 11918
## - TEAM_BASERUN_SB 1 14643 440601 11991
## - TEAM_BATTING_BB 1 48917 474875 12161
## - TEAM_BATTING_H 1 87575 513534 12339
##
## Step: AIC=11768.25
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_FIELDING_E
##
## Df Sum of Sq RSS AIC
## + TEAM_FIELDING_DP 1 16088 382773 11676
## + TEAM_BATTING_3B 1 7233 391628 11729
## + TEAM_BATTING_2B 1 4497 394363 11744
## + TEAM_BASERUN_CS 1 2836 396025 11754
## + TEAM_BATTING_SO 1 2551 396310 11756
## + TEAM_PITCHING_BB 1 1614 397247 11761
## + TEAM_BATTING_HR 1 1241 397620 11763
## + TEAM_PITCHING_HR 1 977 397884 11765
## + TEAM_PITCHING_SO 1 797 398064 11766
## + TEAM_PITCHING_H 1 740 398121 11766
## + TEAM_BATTING_HBP 1 508 398353 11767
## <none> 398861 11768
## - TEAM_BATTING_BB 1 7316 406177 11808
## - TEAM_FIELDING_E 1 27097 425958 11916
## - TEAM_BASERUN_SB 1 34169 433029 11953
## - TEAM_BATTING_H 1 106965 505826 12307
##
## Step: AIC=11676.55
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_3B 1 7533 375239 11633
## + TEAM_BATTING_2B 1 4020 378753 11654
## + TEAM_BATTING_SO 1 3193 379580 11660
## + TEAM_BASERUN_CS 1 1470 381303 11670
## + TEAM_PITCHING_BB 1 880 381893 11673
## + TEAM_PITCHING_SO 1 573 382199 11675
## + TEAM_PITCHING_H 1 516 382256 11676
## + TEAM_BATTING_HBP 1 471 382301 11676
## <none> 382773 11676
## + TEAM_BATTING_HR 1 314 382458 11677
## + TEAM_PITCHING_HR 1 199 382573 11677
## - TEAM_BATTING_BB 1 11459 394231 11742
## - TEAM_FIELDING_DP 1 16088 398861 11768
## - TEAM_BASERUN_SB 1 17795 400568 11778
## - TEAM_FIELDING_E 1 40488 423261 11903
## - TEAM_BATTING_H 1 115808 498581 12276
##
## Step: AIC=11633.31
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B
##
## Df Sum of Sq RSS AIC
## + TEAM_BASERUN_CS 1 2917 372322 11618
## + TEAM_BATTING_2B 1 2458 372782 11620
## + TEAM_BATTING_SO 1 678 374561 11631
## + TEAM_BATTING_HBP 1 611 374629 11632
## + TEAM_PITCHING_HR 1 585 374654 11632
## + TEAM_PITCHING_BB 1 337 374902 11633
## <none> 375239 11633
## + TEAM_BATTING_HR 1 184 375055 11634
## + TEAM_PITCHING_SO 1 68 375172 11635
## + TEAM_PITCHING_H 1 42 375198 11635
## - TEAM_BATTING_3B 1 7533 382773 11676
## - TEAM_BATTING_BB 1 8921 384160 11685
## - TEAM_BASERUN_SB 1 15473 390712 11723
## - TEAM_FIELDING_DP 1 16389 391628 11729
## - TEAM_FIELDING_E 1 47755 422994 11904
## - TEAM_BATTING_H 1 94807 470046 12144
##
## Step: AIC=11617.55
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_2B 1 2642 369680 11603
## + TEAM_BATTING_SO 1 1624 370698 11610
## + TEAM_BATTING_HBP 1 676 371646 11615
## + TEAM_PITCHING_BB 1 437 371885 11617
## <none> 372322 11618
## + TEAM_PITCHING_HR 1 217 372105 11618
## + TEAM_PITCHING_SO 1 27 372295 11619
## + TEAM_BATTING_HR 1 23 372300 11619
## + TEAM_PITCHING_H 1 12 372310 11620
## - TEAM_BASERUN_CS 1 2917 375239 11633
## - TEAM_BATTING_BB 1 7465 379788 11661
## - TEAM_BATTING_3B 1 8981 381303 11670
## - TEAM_FIELDING_DP 1 14561 386883 11703
## - TEAM_BASERUN_SB 1 18327 390649 11725
## - TEAM_FIELDING_E 1 50294 422616 11904
## - TEAM_BATTING_H 1 93208 465531 12124
##
## Step: AIC=11603.34
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_2B
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_SO 1 872 368808 11600
## + TEAM_BATTING_HBP 1 629 369051 11602
## + TEAM_PITCHING_HR 1 510 369170 11602
## <none> 369680 11603
## + TEAM_PITCHING_BB 1 222 369458 11604
## + TEAM_BATTING_HR 1 135 369545 11604
## + TEAM_PITCHING_SO 1 94 369586 11605
## + TEAM_PITCHING_H 1 87 369592 11605
## - TEAM_BATTING_2B 1 2642 372322 11618
## - TEAM_BASERUN_CS 1 3102 372782 11620
## - TEAM_BATTING_3B 1 7284 376964 11646
## - TEAM_BATTING_BB 1 7701 377381 11648
## - TEAM_FIELDING_DP 1 14105 383785 11687
## - TEAM_BASERUN_SB 1 19504 389184 11718
## - TEAM_FIELDING_E 1 52506 422186 11904
## - TEAM_BATTING_H 1 64604 434284 11968
##
## Step: AIC=11599.96
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_2B + TEAM_BATTING_SO
##
## Df Sum of Sq RSS AIC
## + TEAM_PITCHING_SO 1 3191 365617 11582
## + TEAM_PITCHING_HR 1 1271 367537 11594
## + TEAM_BATTING_HBP 1 642 368166 11598
## + TEAM_BATTING_HR 1 602 368206 11598
## <none> 368808 11600
## + TEAM_PITCHING_BB 1 201 368607 11601
## + TEAM_PITCHING_H 1 49 368759 11602
## - TEAM_BATTING_SO 1 872 369680 11603
## - TEAM_BATTING_2B 1 1890 370698 11610
## - TEAM_BASERUN_CS 1 3741 372549 11621
## - TEAM_BATTING_3B 1 5133 373940 11629
## - TEAM_BATTING_BB 1 7530 376338 11644
## - TEAM_FIELDING_DP 1 14271 383079 11684
## - TEAM_BASERUN_SB 1 19925 388733 11718
## - TEAM_FIELDING_E 1 48836 417644 11881
## - TEAM_BATTING_H 1 50541 419349 11890
##
## Step: AIC=11582.18
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_2B + TEAM_BATTING_SO + TEAM_PITCHING_SO
##
## Df Sum of Sq RSS AIC
## + TEAM_PITCHING_BB 1 5368 360249 11550
## + TEAM_PITCHING_H 1 2006 363611 11572
## + TEAM_BATTING_HBP 1 689 364928 11580
## + TEAM_BATTING_HR 1 443 365173 11581
## + TEAM_PITCHING_HR 1 416 365201 11582
## <none> 365617 11582
## - TEAM_BATTING_2B 1 2778 368395 11597
## - TEAM_BASERUN_CS 1 3130 368747 11600
## - TEAM_PITCHING_SO 1 3191 368808 11600
## - TEAM_BATTING_SO 1 3969 369586 11605
## - TEAM_BATTING_3B 1 5752 371369 11616
## - TEAM_BATTING_BB 1 9027 374644 11636
## - TEAM_FIELDING_DP 1 15955 381572 11677
## - TEAM_BASERUN_SB 1 18554 384171 11693
## - TEAM_FIELDING_E 1 50816 416433 11876
## - TEAM_BATTING_H 1 53680 419297 11892
##
## Step: AIC=11550.52
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_2B + TEAM_BATTING_SO + TEAM_PITCHING_SO + TEAM_PITCHING_BB
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_HBP 1 738 359510 11548
## + TEAM_PITCHING_HR 1 495 359754 11549
## <none> 360249 11550
## + TEAM_BATTING_HR 1 267 359982 11551
## + TEAM_PITCHING_H 1 136 360113 11552
## - TEAM_BASERUN_CS 1 2849 363098 11566
## - TEAM_BATTING_2B 1 2901 363150 11567
## - TEAM_BATTING_3B 1 5197 365446 11581
## - TEAM_PITCHING_BB 1 5368 365617 11582
## - TEAM_PITCHING_SO 1 8358 368607 11601
## - TEAM_BATTING_SO 1 8892 369141 11604
## - TEAM_BATTING_BB 1 12290 372539 11625
## - TEAM_FIELDING_DP 1 16231 376480 11649
## - TEAM_BASERUN_SB 1 18854 379103 11665
## - TEAM_FIELDING_E 1 47377 407626 11830
## - TEAM_BATTING_H 1 58491 418739 11891
##
## Step: AIC=11547.85
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_2B + TEAM_BATTING_SO + TEAM_PITCHING_SO + TEAM_PITCHING_BB +
## TEAM_BATTING_HBP
##
## Df Sum of Sq RSS AIC
## + TEAM_PITCHING_HR 1 504 359006 11547
## <none> 359510 11548
## + TEAM_BATTING_HR 1 273 359237 11548
## + TEAM_PITCHING_H 1 128 359383 11549
## - TEAM_BATTING_HBP 1 738 360249 11550
## - TEAM_BATTING_2B 1 2853 362364 11564
## - TEAM_BASERUN_CS 1 2913 362424 11564
## - TEAM_BATTING_3B 1 5332 364843 11579
## - TEAM_PITCHING_BB 1 5417 364928 11580
## - TEAM_PITCHING_SO 1 8456 367966 11599
## - TEAM_BATTING_SO 1 9002 368512 11602
## - TEAM_BATTING_BB 1 12260 371770 11622
## - TEAM_FIELDING_DP 1 16193 375704 11646
## - TEAM_BASERUN_SB 1 19014 378524 11663
## - TEAM_FIELDING_E 1 47831 407341 11830
## - TEAM_BATTING_H 1 58433 417943 11889
##
## Step: AIC=11546.65
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_2B + TEAM_BATTING_SO + TEAM_PITCHING_SO + TEAM_PITCHING_BB +
## TEAM_BATTING_HBP + TEAM_PITCHING_HR
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_HR 1 446 358560 11546
## <none> 359006 11547
## + TEAM_PITCHING_H 1 257 358749 11547
## - TEAM_PITCHING_HR 1 504 359510 11548
## - TEAM_BATTING_HBP 1 748 359754 11549
## - TEAM_BASERUN_CS 1 2728 361734 11562
## - TEAM_BATTING_2B 1 2899 361905 11563
## - TEAM_PITCHING_BB 1 5497 364503 11579
## - TEAM_BATTING_3B 1 5831 364837 11581
## - TEAM_PITCHING_SO 1 7368 366375 11591
## - TEAM_BATTING_SO 1 9064 368070 11601
## - TEAM_BATTING_BB 1 11897 370903 11619
## - TEAM_FIELDING_DP 1 16695 375701 11648
## - TEAM_BASERUN_SB 1 19479 378485 11665
## - TEAM_FIELDING_E 1 44704 403710 11812
## - TEAM_BATTING_H 1 44865 403871 11813
##
## Step: AIC=11545.83
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_2B + TEAM_BATTING_SO + TEAM_PITCHING_SO + TEAM_PITCHING_BB +
## TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_BATTING_HR
##
## Df Sum of Sq RSS AIC
## + TEAM_PITCHING_H 1 1476 357084 11538
## <none> 358560 11546
## - TEAM_BATTING_HR 1 446 359006 11547
## - TEAM_PITCHING_HR 1 677 359237 11548
## - TEAM_BATTING_HBP 1 755 359315 11549
## - TEAM_BASERUN_CS 1 2695 361255 11561
## - TEAM_BATTING_2B 1 2735 361295 11561
## - TEAM_PITCHING_SO 1 5138 363699 11576
## - TEAM_PITCHING_BB 1 5916 364476 11581
## - TEAM_BATTING_3B 1 6273 364834 11583
## - TEAM_BATTING_SO 1 6452 365012 11584
## - TEAM_BATTING_BB 1 11593 370153 11616
## - TEAM_FIELDING_DP 1 15937 374497 11643
## - TEAM_BASERUN_SB 1 19855 378415 11666
## - TEAM_BATTING_H 1 39964 398524 11784
## - TEAM_FIELDING_E 1 43142 401702 11802
##
## Step: AIC=11538.44
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_2B + TEAM_BATTING_SO + TEAM_PITCHING_SO + TEAM_PITCHING_BB +
## TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_BATTING_HR + TEAM_PITCHING_H
##
## Df Sum of Sq RSS AIC
## <none> 357084 11538
## - TEAM_BATTING_HBP 1 733 357817 11541
## - TEAM_PITCHING_H 1 1476 358560 11546
## - TEAM_BATTING_HR 1 1665 358749 11547
## - TEAM_PITCHING_HR 1 2012 359096 11549
## - TEAM_BATTING_2B 1 2437 359521 11552
## - TEAM_BASERUN_CS 1 2733 359817 11554
## - TEAM_PITCHING_BB 1 3400 360484 11558
## - TEAM_BATTING_3B 1 4711 361795 11566
## - TEAM_PITCHING_SO 1 5882 362966 11574
## - TEAM_BATTING_BB 1 7370 364454 11583
## - TEAM_BATTING_SO 1 7585 364670 11584
## - TEAM_FIELDING_DP 1 16461 373545 11639
## - TEAM_BASERUN_SB 1 18908 375993 11654
## - TEAM_BATTING_H 1 38248 395332 11768
## - TEAM_FIELDING_E 1 39343 396427 11774
# build final model: the terms are listed in the same order as before so the
# coefficient table keeps its row order
final_model <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_H +
    TEAM_BATTING_BB +
    TEAM_BASERUN_SB +
    TEAM_FIELDING_E +
    TEAM_FIELDING_DP +
    TEAM_BATTING_3B +
    TEAM_BASERUN_CS +
    TEAM_BATTING_2B +
    TEAM_PITCHING_BB +
    TEAM_PITCHING_H +
    TEAM_BATTING_HBP +
    TEAM_PITCHING_HR +
    TEAM_BATTING_HR +
    TEAM_BATTING_SO +
    TEAM_PITCHING_SO,
  data = data_final
)
# verify linearity with the diagnostic plots of the fitted model
plot(final_model)




# summary statistics of final_model: residual quantiles, coefficient table,
# residual standard error, R-squared and F-statistic (printed below)
summary(final_model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB +
## TEAM_BASERUN_SB + TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B +
## TEAM_BASERUN_CS + TEAM_BATTING_2B + TEAM_PITCHING_BB + TEAM_PITCHING_H +
## TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_BATTING_HR + TEAM_BATTING_SO +
## TEAM_PITCHING_SO, data = data_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.657 -8.132 -0.096 8.044 49.910
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.589803 35.888552 1.020 0.308056
## TEAM_BATTING_H 0.061233 0.003936 15.559 < 2e-16 ***
## TEAM_BATTING_BB 0.046931 0.006872 6.830 1.09e-11 ***
## TEAM_BASERUN_SB 6.602591 0.603558 10.939 < 2e-16 ***
## TEAM_FIELDING_E -17.009771 1.077948 -15.780 < 2e-16 ***
## TEAM_FIELDING_DP -18.385002 1.801216 -10.207 < 2e-16 ***
## TEAM_BATTING_3B 18.089543 3.312764 5.461 5.27e-08 ***
## TEAM_BASERUN_CS -3.996066 0.960769 -4.159 3.31e-05 ***
## TEAM_BATTING_2B -0.034629 0.008818 -3.927 8.86e-05 ***
## TEAM_PITCHING_BB -12.690867 2.735800 -4.639 3.70e-06 ***
## TEAM_PITCHING_H -9.785024 3.201368 -3.057 0.002265 **
## TEAM_BATTING_HBP 8.751096 4.063441 2.154 0.031376 *
## TEAM_PITCHING_HR 12.128606 3.398826 3.568 0.000367 ***
## TEAM_BATTING_HR -10.830901 3.336164 -3.247 0.001185 **
## TEAM_BATTING_SO -0.029374 0.004239 -6.929 5.51e-12 ***
## TEAM_PITCHING_SO 111.406518 18.259114 6.101 1.23e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.57 on 2260 degrees of freedom
## Multiple R-squared: 0.3674, Adjusted R-squared: 0.3632
## F-statistic: 87.51 on 15 and 2260 DF, p-value: < 2.2e-16
# verify nearly normal residuals: histogram and normal QQ plot side by side
par(mfrow = c(1, 2))
hist(final_model$residuals)
qqnorm(final_model$residuals)
qqline(final_model$residuals)

# verify constant variability: residuals against FITTED values. Residuals are
# correlated with the observed response by construction, so plotting them
# against TARGET_WINS shows a trend even when the variance is constant.
plot(
  fitted(final_model),
  resid(final_model),
  xlab = "Fitted values",
  ylab = "Residuals"
)
abline(h = 0, lty = 3) # adds a horizontal dashed line at y = 0
# nested comparison: the first nine predictors only, against the full model
reduced_model <- lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
  TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + TEAM_BATTING_2B + TEAM_PITCHING_BB, data = data_final)
anova(reduced_model, final_model)
## Analysis of Variance Table
##
## Model 1: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_2B + TEAM_PITCHING_BB
## Model 2: TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_2B + TEAM_PITCHING_BB + TEAM_PITCHING_H + TEAM_BATTING_HBP +
## TEAM_PITCHING_HR + TEAM_BATTING_HR + TEAM_BATTING_SO + TEAM_PITCHING_SO
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2266 369458
## 2 2260 357084 6 12373 13.052 1.449e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# summary of final_model printed again after the anova comparison; same call
# as the earlier summary(final_model)
summary(final_model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB +
## TEAM_BASERUN_SB + TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_BATTING_3B +
## TEAM_BASERUN_CS + TEAM_BATTING_2B + TEAM_PITCHING_BB + TEAM_PITCHING_H +
## TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_BATTING_HR + TEAM_BATTING_SO +
## TEAM_PITCHING_SO, data = data_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.657 -8.132 -0.096 8.044 49.910
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.589803 35.888552 1.020 0.308056
## TEAM_BATTING_H 0.061233 0.003936 15.559 < 2e-16 ***
## TEAM_BATTING_BB 0.046931 0.006872 6.830 1.09e-11 ***
## TEAM_BASERUN_SB 6.602591 0.603558 10.939 < 2e-16 ***
## TEAM_FIELDING_E -17.009771 1.077948 -15.780 < 2e-16 ***
## TEAM_FIELDING_DP -18.385002 1.801216 -10.207 < 2e-16 ***
## TEAM_BATTING_3B 18.089543 3.312764 5.461 5.27e-08 ***
## TEAM_BASERUN_CS -3.996066 0.960769 -4.159 3.31e-05 ***
## TEAM_BATTING_2B -0.034629 0.008818 -3.927 8.86e-05 ***
## TEAM_PITCHING_BB -12.690867 2.735800 -4.639 3.70e-06 ***
## TEAM_PITCHING_H -9.785024 3.201368 -3.057 0.002265 **
## TEAM_BATTING_HBP 8.751096 4.063441 2.154 0.031376 *
## TEAM_PITCHING_HR 12.128606 3.398826 3.568 0.000367 ***
## TEAM_BATTING_HR -10.830901 3.336164 -3.247 0.001185 **
## TEAM_BATTING_SO -0.029374 0.004239 -6.929 5.51e-12 ***
## TEAM_PITCHING_SO 111.406518 18.259114 6.101 1.23e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.57 on 2260 degrees of freedom
## Multiple R-squared: 0.3674, Adjusted R-squared: 0.3632
## F-statistic: 87.51 on 15 and 2260 DF, p-value: < 2.2e-16
#read evaluation dataset
#TRUE is spelled out because T is an ordinary variable that can be reassigned
data_eval <- read.csv(
  file = "https://cdn-stage.fedweb.org/fed-2/2/moneyball-evaluation-data.csv",
  stringsAsFactors = TRUE,
  header = TRUE
)
#count missing values per column (count_nas starts at column 2, so the first column is not reported)
count_nas(data_eval)
## variable_name_column number_missing_column percentage
## 1 TEAM_BATTING_HBP 240 93
## 2 TEAM_BASERUN_CS 87 34
## 3 TEAM_FIELDING_DP 31 12
## 4 TEAM_BATTING_SO 18 7
## 5 TEAM_PITCHING_SO 18 7
## 6 TEAM_BASERUN_SB 13 5
## 7 TEAM_BATTING_H 0 0
## 8 TEAM_BATTING_2B 0 0
## 9 TEAM_BATTING_3B 0 0
## 10 TEAM_BATTING_HR 0 0
## 11 TEAM_BATTING_BB 0 0
## 12 TEAM_PITCHING_H 0 0
## 13 TEAM_PITCHING_HR 0 0
## 14 TEAM_PITCHING_BB 0 0
## 15 TEAM_FIELDING_E 0 0
#replace certain variables with their logs
#a log that is not greater than 0 (or cannot be computed) is turned into NA
positive_log <- function(x) ifelse(log(x) > 0, log(x), NA)
#TEAM_BATTING_3B and TEAM_PITCHING_SO go through the transform twice, i.e. they
#end up as log(log(x))
#NOTE(review): presumably this mirrors how the data behind final_model was
#prepared - confirm before changing it, predictions depend on matching scales
data_eval <- data_eval %>%
  mutate(
    TEAM_BATTING_3B = positive_log(positive_log(TEAM_BATTING_3B)),
    TEAM_BATTING_HR = positive_log(TEAM_BATTING_HR),
    TEAM_PITCHING_SO = positive_log(positive_log(TEAM_PITCHING_SO)),
    TEAM_BASERUN_SB = positive_log(TEAM_BASERUN_SB),
    TEAM_BASERUN_CS = positive_log(TEAM_BASERUN_CS),
    TEAM_BATTING_HBP = positive_log(TEAM_BATTING_HBP),
    TEAM_PITCHING_H = positive_log(TEAM_PITCHING_H),
    TEAM_FIELDING_E = positive_log(TEAM_FIELDING_E),
    TEAM_PITCHING_HR = positive_log(TEAM_PITCHING_HR),
    TEAM_PITCHING_BB = positive_log(TEAM_PITCHING_BB),
    TEAM_FIELDING_DP = positive_log(TEAM_FIELDING_DP)
  )
#impute missing values: mice builds 4 imputations with the "cart" method, without progress output
imp.data <- mice(
  data = data_eval,
  m = 4,
  method = "cart",
  printFlag = FALSE
)
#replace data_eval with a completed data set (complete() is called with its defaults)
data_eval <- complete(imp.data)
#count missing values again to check what is left after imputation
count_nas(data_eval)
## variable_name_column number_missing_column percentage
## 1 TEAM_BATTING_H 0 0
## 2 TEAM_BATTING_2B 0 0
## 3 TEAM_BATTING_3B 0 0
## 4 TEAM_BATTING_HR 0 0
## 5 TEAM_BATTING_BB 0 0
## 6 TEAM_BATTING_SO 0 0
## 7 TEAM_BASERUN_SB 0 0
## 8 TEAM_BASERUN_CS 0 0
## 9 TEAM_BATTING_HBP 0 0
## 10 TEAM_PITCHING_H 0 0
## 11 TEAM_PITCHING_HR 0 0
## 12 TEAM_PITCHING_BB 0 0
## 13 TEAM_PITCHING_SO 0 0
## 14 TEAM_FIELDING_E 0 0
## 15 TEAM_FIELDING_DP 0 0
#show the first rows of the imputed evaluation data
#data_eval already holds complete(imp.data); it is inspected directly so that
#the training data frame `data` is not overwritten with evaluation rows
head(data_eval)
## INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 1 9 1209 170 1.251765 4.418841
## 2 10 1221 151 1.214110 4.477337
## 3 14 1395 183 1.214110 4.532599
## 4 47 1539 309 1.214110 5.068904
## 5 60 1445 203 1.439718 1.609438
## 6 63 1431 236 1.378840 2.302585
## TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## 1 447 1080 4.127134 3.912023
## 2 516 929 3.988984 3.663562
## 3 509 816 4.077537 3.850148
## 4 486 914 4.997212 4.043051
## 5 95 416 5.023881 4.330733
## 6 215 377 5.697093 4.477337
## TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## 1 4.204693 7.097549 4.418841 6.102559
## 2 3.970292 7.107425 4.477337 6.246107
## 3 3.951244 7.240650 4.532599 6.232448
## 4 3.737670 7.338888 5.068904 6.186209
## 5 4.143135 8.269245 2.639057 5.549076
## 6 4.204693 7.934872 2.995732 6.040255
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 1.943724 4.941642 5.049856
## 2 1.921926 4.905275 5.099866
## 3 1.902766 5.049856 5.030438
## 4 1.919541 4.820282 5.036953
## 5 1.949299 6.423247 4.867534
## 6 1.887256 6.349139 4.653960
#predict TARGET_WINS for the evaluation data with the final model
TARGET_WINS_pred <- predict(final_model, newdata = data_eval)
#attach the predictions, round them to whole numbers and keep only INDEX and the prediction
pred_data <- cbind(TARGET_WINS_pred, data_eval) %>%
  mutate(TARGET_WINS_pred = round(TARGET_WINS_pred, 0)) %>%
  select(INDEX, TARGET_WINS_pred)
#display six first predictions
head(pred_data)
## INDEX TARGET_WINS_pred
## 1 9 64
## 2 10 65
## 3 14 73
## 4 47 83
## 5 60 70
## 6 63 70
#save the predictions to a csv file
#row.names = FALSE keeps the data frame's row numbers from being written as an
#extra unnamed first column next to INDEX
write.csv(pred_data, file = "moneyball-prediction.csv", row.names = FALSE)