library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.2
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'readr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## Warning: package 'stringr' was built under R version 3.6.2
## Warning: package 'forcats' was built under R version 3.6.2
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(broom)
## Warning: package 'broom' was built under R version 3.6.2
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.6.2
## Loading required package: lattice
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.6.2
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(tidyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.2
## corrplot 0.84 loaded
library(mice)
## Warning: package 'mice' was built under R version 3.6.2
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
## Warning: package 'caret' was built under R version 3.6.2
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
# read data
# The data directory defaults to the original project folder but can be
# overridden with the MONEYBALL_DATA_DIR environment variable. Building the
# paths with file.path() avoids changing the session's working directory
# with setwd().
dataDir <- Sys.getenv(
  "MONEYBALL_DATA_DIR",
  unset = "~/school_of_professional_studies/621. Business Analytics and Data Mining/hw1"
)
dfTrain <- read.csv(file.path(dataDir, "moneyball-training-data.csv"))
dfEval <- read.csv(file.path(dataDir, "moneyball-evaluation-data.csv"))
# preview the first rows of the training data, then its dimensions
# (number of rows, number of columns)
head(dfTrain)
dim(dfTrain)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 1 39 1445 194 39
## 2 2 70 1339 219 22
## 3 3 86 1377 232 35
## 4 4 70 1387 209 38
## 5 5 82 1297 186 27
## 6 6 75 1279 200 36
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 13 143 842 NA
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 NA NA 9364 84
## 2 28 NA 1347 191
## 3 27 NA 1377 137
## 4 30 NA 1396 97
## 5 39 NA 1297 102
## 6 59 NA 1279 92
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 NA
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
## [1] 2276 17
# same preview for the evaluation data: first rows, then dimensions
head(dfEval)
dim(dfEval)
## INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 1 9 1209 170 33 83
## 2 10 1221 151 29 88
## 3 14 1395 183 29 93
## 4 47 1539 309 29 159
## 5 60 1445 203 68 5
## 6 63 1431 236 53 10
## TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## 1 447 1080 62 50
## 2 516 929 54 39
## 3 509 816 59 47
## 4 486 914 148 57
## 5 95 416 NA NA
## 6 215 377 NA NA
## TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## 1 NA 1209 83 447
## 2 NA 1221 88 516
## 3 NA 1395 93 509
## 4 42 1539 159 486
## 5 NA 3902 14 257
## 6 NA 2793 20 420
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 1080 140 156
## 2 929 135 164
## 3 816 156 153
## 4 914 124 154
## 5 1123 616 130
## 6 736 572 105
## [1] 259 16
# what is the difference between the two data.frames?
# setdiff() lists the training columns that are absent from the evaluation set.
# The empty case is handled explicitly: paste0() treats a zero-length vector
# as "", which would otherwise print a message without any column name.
missingInEval <- setdiff(names(dfTrain), names(dfEval))
if (length(missingInEval) > 0) {
  paste0(missingInEval, " is not found in the evaluation data set")
} else {
  "All training columns are found in the evaluation data set"
}
## [1] "TARGET_WINS is not found in the evaluation data set"
# data structure: type and first few values of every column
dfTrain %>% str()
## 'data.frame': 2276 obs. of 17 variables:
## $ INDEX : int 1 2 3 4 5 6 7 8 11 12 ...
## $ TARGET_WINS : int 39 70 86 70 82 75 80 85 86 76 ...
## $ TEAM_BATTING_H : int 1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
## $ TEAM_BATTING_2B : int 194 219 232 209 186 200 179 171 197 213 ...
## $ TEAM_BATTING_3B : int 39 22 35 38 27 36 54 37 40 18 ...
## $ TEAM_BATTING_HR : int 13 190 137 96 102 92 122 115 114 96 ...
## $ TEAM_BATTING_BB : int 143 685 602 451 472 443 525 456 447 441 ...
## $ TEAM_BATTING_SO : int 842 1075 917 922 920 973 1062 1027 922 827 ...
## $ TEAM_BASERUN_SB : int NA 37 46 43 49 107 80 40 69 72 ...
## $ TEAM_BASERUN_CS : int NA 28 27 30 39 59 54 36 27 34 ...
## $ TEAM_BATTING_HBP: int NA NA NA NA NA NA NA NA NA NA ...
## $ TEAM_PITCHING_H : int 9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
## $ TEAM_PITCHING_HR: int 84 191 137 97 102 92 122 116 114 96 ...
## $ TEAM_PITCHING_BB: int 927 689 602 454 472 443 525 459 447 441 ...
## $ TEAM_PITCHING_SO: int 5456 1082 917 928 920 973 1062 1033 922 827 ...
## $ TEAM_FIELDING_E : int 1011 193 175 164 138 123 136 112 127 131 ...
## $ TEAM_FIELDING_DP: int NA 155 153 156 168 149 186 136 169 159 ...
# descriptive statistics: min, quartiles, mean, max and NA count per column
summary(object = dfTrain)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## Min. : 1.0 Min. : 0.00 Min. : 891 Min. : 69.0
## 1st Qu.: 630.8 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0
## Median :1270.5 Median : 82.00 Median :1454 Median :238.0
## Mean :1268.5 Mean : 80.79 Mean :1469 Mean :241.2
## 3rd Qu.:1915.5 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0
## Max. :2535.0 Max. :146.00 Max. :2554 Max. :458.0
##
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0
## Median : 47.00 Median :102.00 Median :512.0 Median : 750.0
## Mean : 55.25 Mean : 99.61 Mean :501.6 Mean : 735.6
## 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0
## Max. :223.00 Max. :264.00 Max. :878.0 Max. :1399.0
## NA's :102
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## Min. : 0.0 Min. : 0.0 Min. :29.00 Min. : 1137
## 1st Qu.: 66.0 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419
## Median :101.0 Median : 49.0 Median :58.00 Median : 1518
## Mean :124.8 Mean : 52.8 Mean :59.36 Mean : 1779
## 3rd Qu.:156.0 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682
## Max. :697.0 Max. :201.0 Max. :95.00 Max. :30132
## NA's :131 NA's :772 NA's :2085
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 65.0
## 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0
## Median :107.0 Median : 536.5 Median : 813.5 Median : 159.0
## Mean :105.7 Mean : 553.0 Mean : 817.7 Mean : 246.5
## 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2
## Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0
## NA's :102
## TEAM_FIELDING_DP
## Min. : 52.0
## 1st Qu.:131.0
## Median :149.0
## Mean :146.4
## 3rd Qu.:164.0
## Max. :228.0
## NA's :286
# number of distinct values in each variable
# length(unique(x)) counts the distinct values of a column (NA counts as one
# value). The earlier unique(length(x)) only returned the row count, which is
# why the output recorded below shows 2276 for every column; re-run to refresh.
# vapply() pins the result to one integer per column.
vapply(dfTrain, function(x) length(unique(x)), integer(1))
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## 2276 2276 2276 2276
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## 2276 2276 2276 2276
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## 2276 2276 2276 2276
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## 2276 2276 2276 2276
## TEAM_FIELDING_DP
## 2276
# any missing: count the NA values in each column
dfTrain %>%
  is.na() %>%
  colSums()
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## 0 0 0 0
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## 0 0 0 102
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## 131 772 2085 0
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## 0 0 102 0
## TEAM_FIELDING_DP
## 286
### Visualization of data distribution, spread, outliers and correlations among variables ###
# boxplot
# Reshape to long format (one row per INDEX/variable pair) so that every
# variable gets its own box; INDEX is an identifier and is kept out of the plot.
# Only one geom_boxplot() layer is drawn (the earlier version stacked two
# identical layers), with outliers highlighted in red. ggplot2 drops the NA
# values with a warning.
dfTrain %>%
  tidyr::pivot_longer(cols = -INDEX, names_to = "key", values_to = "value") %>%
  ggplot(aes(x = key, y = value, fill = key)) +
  geom_boxplot(outlier.colour = "red") +
  theme(legend.position = "none",
        axis.title.y = element_blank()) +
  coord_flip()
## Warning: Removed 3478 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3478 rows containing non-finite values (stat_boxplot).
# corrplot
# Keep only the rows without any missing value, drop the INDEX identifier and
# plot the correlation matrix as numbers. TEAM_BATTING_HBP alone is NA in 2085
# of the 2276 rows, so only a small subset of the rows feeds this plot.
completeRows <- dfTrain[complete.cases(dfTrain), ]
corMatrix <- cor(dplyr::select(completeRows, -INDEX))
corrplot(corMatrix, method = "number")
Among the rows with complete data, TARGET_WINS has a moderate positive correlation with TEAM_BATTING_H (r = 0.47), TEAM_BATTING_BB (r = 0.47), TEAM_PITCHING_H (r = 0.47), and TEAM_PITCHING_BB (r = 0.47).
# let's look at the number of missing values per column again
dfTrain %>%
  is.na() %>%
  colSums()
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## 0 0 0 0
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## 0 0 0 102
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## 131 772 2085 0
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## 0 0 102 0
## TEAM_FIELDING_DP
## 286
# let's impute missing values using the mice package
#   m = 1          : create a single imputed data set
#   maxit = 10     : number of iterations used to impute the missing values
#   method = "pmm" : predictive mean matching, chosen for all our numeric variables
#   seed = 1234    : fixed seed passed to mice
# NOTE(review): the imputation runs on the whole of dfTrain (INDEX and
# TARGET_WINS included) before the train/test split further down - confirm
# this is intended.
dfImpute <- mice::mice(
  data = dfTrain,
  m = 1,
  maxit = 10,
  method = "pmm",
  seed = 1234
)
##
## iter imp variable
## 1 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 6 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 7 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 8 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 9 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 10 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## Warning: Number of logged events: 10
# extract the completed (imputed) data set and inspect its structure
dfClean <- dfImpute %>% mice::complete()
dfClean %>% str()
## 'data.frame': 2276 obs. of 17 variables:
## $ INDEX : int 1 2 3 4 5 6 7 8 11 12 ...
## $ TARGET_WINS : int 39 70 86 70 82 75 80 85 86 76 ...
## $ TEAM_BATTING_H : int 1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
## $ TEAM_BATTING_2B : int 194 219 232 209 186 200 179 171 197 213 ...
## $ TEAM_BATTING_3B : int 39 22 35 38 27 36 54 37 40 18 ...
## $ TEAM_BATTING_HR : int 13 190 137 96 102 92 122 115 114 96 ...
## $ TEAM_BATTING_BB : int 143 685 602 451 472 443 525 456 447 441 ...
## $ TEAM_BATTING_SO : int 842 1075 917 922 920 973 1062 1027 922 827 ...
## $ TEAM_BASERUN_SB : int 251 37 46 43 49 107 80 40 69 72 ...
## $ TEAM_BASERUN_CS : int 160 28 27 30 39 59 54 36 27 34 ...
## $ TEAM_BATTING_HBP: int 51 57 58 52 44 45 55 54 51 42 ...
## $ TEAM_PITCHING_H : int 9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
## $ TEAM_PITCHING_HR: int 84 191 137 97 102 92 122 116 114 96 ...
## $ TEAM_PITCHING_BB: int 927 689 602 454 472 443 525 459 447 441 ...
## $ TEAM_PITCHING_SO: int 5456 1082 917 928 920 973 1062 1033 922 827 ...
## $ TEAM_FIELDING_E : int 1011 193 175 164 138 123 136 112 127 131 ...
## $ TEAM_FIELDING_DP: int 219 155 153 156 168 149 186 136 169 159 ...
# check again: every column should now report zero missing values
dfClean %>%
  is.na() %>%
  colSums()
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## 0 0 0 0
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## 0 0 0 0
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## 0 0 0 0
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## 0 0 0 0
## TEAM_FIELDING_DP
## 0
# split into train (70% of the rows) and test (the remaining rows)
set.seed(1234)
# floor() makes explicit the truncation that sample() applied to the
# fractional size; seq_len() stays correct for an empty data frame, where
# 1:nrow() would yield c(1, 0)
nTrain <- floor(nrow(dfClean) * 0.7)
index <- sample(seq_len(nrow(dfClean)), size = nTrain)
train <- dfClean[index, ]
test <- dfClean[-index, ]
# build full model
# regress TARGET_WINS on every other column of the training split, with the
# INDEX column removed first
fullModel <- lm(
  formula = TARGET_WINS ~ .,
  data = dplyr::select(train, -INDEX)
)
fullModel %>% broom::glance()
## # A tibble: 1 x 11
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.377 0.371 12.3 63.5 9.88e-150 16 -6254. 12542. 12633.
## # ... with 2 more variables: deviance <dbl>, df.residual <int>
# build full model - sqrt transformation for taking care of outliers
# DV holds the names of the predictor columns (everything except INDEX and
# TARGET_WINS); the name is kept because later steps refer to it.
DV <- setdiff(names(dfClean), c("INDEX", "TARGET_WINS"))
fullModelSqrt <- lm(
  TARGET_WINS ~ .,
  data = train %>%
    dplyr::select(-INDEX) %>%
    # pass the column names as a character vector: vars(DV) is ambiguous
    # because it would select a column named DV if one existed
    dplyr::mutate_at(DV, sqrt)
)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(DV)` instead of `DV` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# one-row fit summary (R-squared, AIC, ...) of the sqrt-transformed full model
fullModelSqrt %>% broom::glance()
## # A tibble: 1 x 11
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.385 0.379 12.2 65.8 2.62e-154 16 -6243. 12520. 12612.
## # ... with 2 more variables: deviance <dbl>, df.residual <int>
# stepwise regression - backward elimination
# No scope is supplied, so stepAIC() can only drop terms from fullModel; the
# direction is spelled out instead of relying on the default, which is
# "backward" (not "both") when scope is missing.
step <- MASS::stepAIC(fullModel, direction = "backward", trace = FALSE)
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Final Model:
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 1577 239744.5 8019.230
## 2 - TEAM_BASERUN_CS 1 9.360252 1578 239753.8 8017.292
## 3 - TEAM_PITCHING_HR 1 68.955644 1579 239822.8 8015.750
## 4 - TEAM_BATTING_HBP 1 131.239698 1580 239954.0 8014.621
# 2 degree of interactions - full model
# scope = ~ .^2 lets the stepwise search consider every two-way interaction
# between the terms of fullModel
step2 <- MASS::stepAIC(object = fullModel, scope = ~ .^2, trace = FALSE)
step2[["anova"]]
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Final Model:
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP +
## TEAM_BATTING_BB:TEAM_FIELDING_DP + TEAM_BATTING_H:TEAM_BATTING_BB +
## TEAM_BATTING_2B:TEAM_FIELDING_DP + TEAM_BATTING_BB:TEAM_FIELDING_E +
## TEAM_BATTING_3B:TEAM_BATTING_SO + TEAM_BATTING_BB:TEAM_BASERUN_SB +
## TEAM_PITCHING_HR:TEAM_PITCHING_SO + TEAM_BATTING_HR:TEAM_BASERUN_SB +
## TEAM_BATTING_HR:TEAM_BATTING_BB + TEAM_BATTING_SO:TEAM_PITCHING_BB +
## TEAM_BATTING_BB:TEAM_PITCHING_SO + TEAM_BATTING_H:TEAM_PITCHING_H +
## TEAM_BATTING_2B:TEAM_BATTING_HR + TEAM_BATTING_BB:TEAM_BASERUN_CS +
## TEAM_BATTING_HBP:TEAM_PITCHING_H + TEAM_BASERUN_CS:TEAM_PITCHING_HR +
## TEAM_BATTING_HBP:TEAM_FIELDING_E + TEAM_BATTING_2B:TEAM_PITCHING_BB +
## TEAM_BATTING_H:TEAM_FIELDING_E + TEAM_BATTING_H:TEAM_BATTING_SO +
## TEAM_BATTING_3B:TEAM_BASERUN_SB + TEAM_BATTING_HR:TEAM_BASERUN_CS +
## TEAM_BASERUN_CS:TEAM_PITCHING_BB + TEAM_BATTING_BB:TEAM_BATTING_SO +
## TEAM_BATTING_H:TEAM_PITCHING_BB + TEAM_BATTING_2B:TEAM_BATTING_SO +
## TEAM_BATTING_2B:TEAM_PITCHING_SO + TEAM_BATTING_2B:TEAM_BATTING_BB +
## TEAM_BATTING_3B:TEAM_PITCHING_SO + TEAM_PITCHING_H:TEAM_PITCHING_HR +
## TEAM_BATTING_H:TEAM_PITCHING_SO + TEAM_BATTING_SO:TEAM_PITCHING_H +
## TEAM_BATTING_2B:TEAM_PITCHING_HR + TEAM_BATTING_HR:TEAM_FIELDING_E +
## TEAM_BATTING_2B:TEAM_BASERUN_SB + TEAM_PITCHING_SO:TEAM_FIELDING_E +
## TEAM_PITCHING_BB:TEAM_FIELDING_E + TEAM_BASERUN_SB:TEAM_FIELDING_DP +
## TEAM_PITCHING_HR:TEAM_FIELDING_E
##
##
## Step Df Deviance Resid. Df Resid. Dev
## 1 1577 239744.5
## 2 + TEAM_BATTING_BB:TEAM_FIELDING_DP 1 15004.269138 1576 224740.2
## 3 + TEAM_BATTING_H:TEAM_BATTING_BB 1 5070.688320 1575 219669.5
## 4 + TEAM_BATTING_2B:TEAM_FIELDING_DP 1 3033.652228 1574 216635.8
## 5 + TEAM_BATTING_BB:TEAM_FIELDING_E 1 2116.761370 1573 214519.1
## 6 + TEAM_BATTING_3B:TEAM_BATTING_SO 1 2512.142515 1572 212006.9
## 7 + TEAM_BATTING_BB:TEAM_BASERUN_SB 1 2519.492435 1571 209487.5
## 8 + TEAM_PITCHING_HR:TEAM_PITCHING_SO 1 3023.933100 1570 206463.5
## 9 + TEAM_BATTING_HR:TEAM_BASERUN_SB 1 3440.618163 1569 203022.9
## 10 + TEAM_BATTING_HR:TEAM_BATTING_BB 1 1610.728931 1568 201412.2
## 11 + TEAM_BATTING_SO:TEAM_PITCHING_BB 1 1638.727143 1567 199773.4
## 12 + TEAM_BATTING_BB:TEAM_PITCHING_SO 1 2452.343130 1566 197321.1
## 13 + TEAM_BASERUN_SB:TEAM_BATTING_HBP 1 1315.504382 1565 196005.6
## 14 + TEAM_BATTING_H:TEAM_PITCHING_H 1 1054.715640 1564 194950.9
## 15 + TEAM_BATTING_2B:TEAM_BATTING_HR 1 788.531985 1563 194162.3
## 16 + TEAM_BASERUN_SB:TEAM_BASERUN_CS 1 641.631898 1562 193520.7
## 17 + TEAM_BATTING_BB:TEAM_BASERUN_CS 1 639.258891 1561 192881.5
## 18 + TEAM_PITCHING_BB:TEAM_FIELDING_E 1 538.975644 1560 192342.5
## 19 + TEAM_BATTING_HBP:TEAM_PITCHING_H 1 708.613055 1559 191633.9
## 20 + TEAM_BASERUN_CS:TEAM_PITCHING_HR 1 474.472323 1558 191159.4
## 21 + TEAM_BATTING_H:TEAM_FIELDING_DP 1 414.611034 1557 190744.8
## 22 + TEAM_BATTING_HBP:TEAM_FIELDING_E 1 457.467627 1556 190287.3
## 23 + TEAM_BATTING_2B:TEAM_PITCHING_BB 1 391.884725 1555 189895.4
## 24 + TEAM_BATTING_SO:TEAM_FIELDING_E 1 363.858099 1554 189531.6
## 25 + TEAM_BATTING_H:TEAM_FIELDING_E 1 710.391280 1553 188821.2
## 26 + TEAM_BATTING_H:TEAM_BATTING_SO 1 479.053977 1552 188342.1
## 27 + TEAM_BATTING_3B:TEAM_BASERUN_SB 1 528.972852 1551 187813.2
## 28 + TEAM_BATTING_2B:TEAM_FIELDING_E 1 377.949029 1550 187435.2
## 29 + TEAM_BATTING_HR:TEAM_BASERUN_CS 1 385.148586 1549 187050.1
## 30 + TEAM_BASERUN_CS:TEAM_PITCHING_BB 1 1612.853299 1548 185437.2
## 31 + TEAM_BATTING_BB:TEAM_BATTING_SO 1 495.978066 1547 184941.2
## 32 - TEAM_PITCHING_BB:TEAM_FIELDING_E 1 29.990816 1548 184971.2
## 33 + TEAM_BATTING_H:TEAM_PITCHING_BB 1 457.305156 1547 184513.9
## 34 + TEAM_BATTING_2B:TEAM_BATTING_SO 1 327.289613 1546 184186.6
## 35 + TEAM_BATTING_2B:TEAM_PITCHING_SO 1 1026.235684 1545 183160.4
## 36 + TEAM_BATTING_2B:TEAM_BATTING_BB 1 947.061018 1544 182213.3
## 37 + TEAM_BATTING_3B:TEAM_PITCHING_SO 1 623.930903 1543 181589.4
## 38 + TEAM_PITCHING_H:TEAM_PITCHING_HR 1 409.808122 1542 181179.6
## 39 + TEAM_BATTING_H:TEAM_PITCHING_SO 1 657.657058 1541 180521.9
## 40 + TEAM_BATTING_SO:TEAM_PITCHING_H 1 406.557562 1540 180115.4
## 41 + TEAM_BATTING_2B:TEAM_PITCHING_HR 1 394.359244 1539 179721.0
## 42 - TEAM_BASERUN_SB:TEAM_BATTING_HBP 1 108.827484 1540 179829.8
## 43 + TEAM_BATTING_HR:TEAM_FIELDING_E 1 334.334825 1539 179495.5
## 44 + TEAM_BATTING_2B:TEAM_BASERUN_SB 1 382.703284 1538 179112.8
## 45 - TEAM_BATTING_2B:TEAM_FIELDING_E 1 1.965609 1539 179114.8
## 46 + TEAM_PITCHING_SO:TEAM_FIELDING_E 1 312.059711 1538 178802.7
## 47 - TEAM_BATTING_SO:TEAM_FIELDING_E 1 24.601189 1539 178827.3
## 48 + TEAM_PITCHING_BB:TEAM_FIELDING_E 1 376.199962 1538 178451.1
## 49 + TEAM_BASERUN_SB:TEAM_FIELDING_DP 1 387.224502 1537 178063.9
## 50 - TEAM_BASERUN_SB:TEAM_BASERUN_CS 1 33.996087 1538 178097.9
## 51 - TEAM_BATTING_H:TEAM_FIELDING_DP 1 94.697723 1539 178192.6
## 52 + TEAM_PITCHING_HR:TEAM_FIELDING_E 1 224.807668 1538 177967.8
## AIC
## 1 8019.230
## 2 7918.276
## 3 7883.923
## 4 7863.770
## 5 7850.128
## 6 7833.363
## 7 7816.318
## 8 7795.156
## 9 7770.386
## 10 7759.697
## 11 7748.683
## 12 7731.007
## 13 7722.351
## 14 7715.756
## 15 7711.299
## 16 7708.026
## 17 7704.756
## 18 7702.298
## 19 7698.418
## 20 7696.469
## 21 7695.010
## 22 7693.185
## 23 7691.901
## 24 7690.846
## 25 7686.864
## 26 7684.817
## 27 7682.337
## 28 7681.128
## 29 7679.851
## 30 7668.056
## 31 7665.790
## 32 7664.048
## 33 7662.105
## 34 7661.276
## 35 7654.376
## 36 7648.118
## 37 7644.654
## 38 7643.054
## 39 7639.262
## 40 7637.670
## 41 7636.178
## 42 7635.143
## 43 7634.178
## 44 7632.778
## 45 7630.796
## 46 7630.018
## 47 7628.237
## 48 7626.882
## 49 7625.422
## 50 7623.726
## 51 7622.573
## 52 7622.562
# 2 degree of interactions - sqrt transformation
# same two-way-interaction search, started from the sqrt-transformed model
step2.1 <- MASS::stepAIC(object = fullModelSqrt, scope = ~ .^2, trace = FALSE)
step2.1[["anova"]]
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Final Model:
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP +
## TEAM_BATTING_BB:TEAM_FIELDING_DP + TEAM_BASERUN_SB:TEAM_PITCHING_HR +
## TEAM_BATTING_SO:TEAM_BASERUN_SB + TEAM_BATTING_2B:TEAM_FIELDING_DP +
## TEAM_BATTING_HBP:TEAM_PITCHING_H + TEAM_BATTING_H:TEAM_FIELDING_E +
## TEAM_BATTING_HR:TEAM_PITCHING_H + TEAM_PITCHING_H:TEAM_PITCHING_SO +
## TEAM_BATTING_2B:TEAM_BATTING_HR + TEAM_BATTING_3B:TEAM_PITCHING_SO +
## TEAM_BATTING_H:TEAM_BATTING_HR + TEAM_BASERUN_SB:TEAM_PITCHING_H +
## TEAM_PITCHING_SO:TEAM_FIELDING_E + TEAM_BATTING_BB:TEAM_FIELDING_E +
## TEAM_BATTING_SO:TEAM_PITCHING_SO + TEAM_BATTING_3B:TEAM_PITCHING_H +
## TEAM_BATTING_SO:TEAM_PITCHING_H + TEAM_PITCHING_H:TEAM_PITCHING_BB +
## TEAM_BATTING_H:TEAM_BATTING_3B + TEAM_BATTING_HR:TEAM_BASERUN_SB +
## TEAM_BATTING_BB:TEAM_BASERUN_SB + TEAM_BATTING_H:TEAM_PITCHING_SO +
## TEAM_BASERUN_CS:TEAM_PITCHING_BB + TEAM_BATTING_3B:TEAM_FIELDING_E +
## TEAM_BATTING_3B:TEAM_BASERUN_SB + TEAM_BATTING_HBP:TEAM_FIELDING_E +
## TEAM_FIELDING_E:TEAM_FIELDING_DP + TEAM_BASERUN_SB:TEAM_PITCHING_BB +
## TEAM_BATTING_SO:TEAM_FIELDING_E + TEAM_PITCHING_BB:TEAM_FIELDING_E +
## TEAM_BATTING_HR:TEAM_BATTING_BB + TEAM_BATTING_2B:TEAM_BATTING_HBP +
## TEAM_BATTING_2B:TEAM_BASERUN_SB + TEAM_BASERUN_SB:TEAM_PITCHING_SO +
## TEAM_PITCHING_HR:TEAM_FIELDING_DP + TEAM_BASERUN_SB:TEAM_BASERUN_CS +
## TEAM_BASERUN_CS:TEAM_PITCHING_HR + TEAM_BASERUN_SB:TEAM_FIELDING_DP
##
##
## Step Df Deviance Resid. Df Resid. Dev
## 1 1577 236519.2
## 2 + TEAM_BATTING_BB:TEAM_FIELDING_DP 1 11235.510742 1576 225283.7
## 3 + TEAM_BATTING_2B:TEAM_FIELDING_E 1 5837.131422 1575 219446.5
## 4 + TEAM_BASERUN_SB:TEAM_PITCHING_HR 1 5548.780379 1574 213897.8
## 5 + TEAM_BATTING_SO:TEAM_BASERUN_SB 1 3539.064939 1573 210358.7
## 6 + TEAM_BASERUN_SB:TEAM_BATTING_HBP 1 2335.849121 1572 208022.8
## 7 + TEAM_BATTING_2B:TEAM_FIELDING_DP 1 1723.783607 1571 206299.1
## 8 + TEAM_BATTING_HBP:TEAM_PITCHING_H 1 1170.555810 1570 205128.5
## 9 + TEAM_BASERUN_SB:TEAM_FIELDING_DP 1 1230.911801 1569 203897.6
## 10 + TEAM_BATTING_H:TEAM_FIELDING_E 1 847.175424 1568 203050.4
## 11 + TEAM_BATTING_HR:TEAM_PITCHING_H 1 1528.480283 1567 201521.9
## 12 + TEAM_PITCHING_H:TEAM_PITCHING_SO 1 1082.786411 1566 200439.1
## 13 + TEAM_BATTING_2B:TEAM_BATTING_HR 1 1259.863720 1565 199179.3
## 14 + TEAM_BATTING_3B:TEAM_PITCHING_BB 1 801.537395 1564 198377.7
## 15 + TEAM_BATTING_3B:TEAM_PITCHING_SO 1 1244.096517 1563 197133.7
## 16 + TEAM_BATTING_H:TEAM_PITCHING_H 1 952.543878 1562 196181.1
## 17 + TEAM_BATTING_H:TEAM_BATTING_HR 1 1401.821856 1561 194779.3
## 18 + TEAM_BASERUN_SB:TEAM_PITCHING_H 1 1353.568725 1560 193425.7
## 19 + TEAM_PITCHING_SO:TEAM_FIELDING_E 1 660.014384 1559 192765.7
## 20 + TEAM_BATTING_BB:TEAM_FIELDING_E 1 713.269933 1558 192052.4
## 21 + TEAM_BATTING_3B:TEAM_BATTING_HBP 1 630.409805 1557 191422.0
## 22 + TEAM_BATTING_SO:TEAM_PITCHING_SO 1 488.222477 1556 190933.8
## 23 + TEAM_BATTING_H:TEAM_BATTING_SO 1 864.248744 1555 190069.6
## 24 + TEAM_BATTING_3B:TEAM_PITCHING_H 1 837.988380 1554 189231.6
## 25 + TEAM_BATTING_SO:TEAM_PITCHING_H 1 983.134553 1553 188248.4
## 26 + TEAM_PITCHING_H:TEAM_PITCHING_BB 1 1090.365465 1552 187158.1
## 27 + TEAM_BATTING_H:TEAM_BATTING_3B 1 1296.354267 1551 185861.7
## 28 + TEAM_BATTING_HR:TEAM_BASERUN_SB 1 624.190113 1550 185237.5
## 29 + TEAM_BATTING_BB:TEAM_BASERUN_SB 1 613.068132 1549 184624.5
## 30 + TEAM_BATTING_H:TEAM_PITCHING_SO 1 652.318536 1548 183972.1
## 31 + TEAM_BASERUN_CS:TEAM_PITCHING_BB 1 838.582157 1547 183133.6
## 32 + TEAM_BASERUN_CS:TEAM_PITCHING_SO 1 710.824867 1546 182422.7
## 33 - TEAM_BATTING_H:TEAM_PITCHING_H 1 7.397843 1547 182430.1
## 34 + TEAM_BATTING_3B:TEAM_FIELDING_E 1 434.563535 1546 181995.6
## 35 + TEAM_BATTING_3B:TEAM_BASERUN_SB 1 503.525743 1545 181492.0
## 36 + TEAM_BATTING_HBP:TEAM_FIELDING_E 1 580.620713 1544 180911.4
## 37 + TEAM_FIELDING_E:TEAM_FIELDING_DP 1 500.384918 1543 180411.0
## 38 + TEAM_BASERUN_SB:TEAM_PITCHING_BB 1 470.736901 1542 179940.3
## 39 + TEAM_BATTING_SO:TEAM_FIELDING_E 1 451.902701 1541 179488.4
## 40 + TEAM_PITCHING_BB:TEAM_FIELDING_E 1 1661.374759 1540 177827.0
## 41 + TEAM_BATTING_HR:TEAM_BATTING_BB 1 524.255734 1539 177302.8
## 42 - TEAM_BATTING_3B:TEAM_PITCHING_BB 1 20.202348 1540 177323.0
## 43 - TEAM_BASERUN_SB:TEAM_FIELDING_DP 1 46.616343 1541 177369.6
## 44 - TEAM_BATTING_H:TEAM_BATTING_SO 1 107.380324 1542 177477.0
## 45 + TEAM_BATTING_2B:TEAM_BATTING_HBP 1 364.215651 1541 177112.7
## 46 - TEAM_BATTING_3B:TEAM_BATTING_HBP 1 112.450583 1542 177225.2
## 47 + TEAM_BATTING_2B:TEAM_BASERUN_SB 1 272.853465 1541 176952.3
## 48 - TEAM_BATTING_2B:TEAM_FIELDING_E 1 76.780363 1542 177029.1
## 49 + TEAM_BASERUN_SB:TEAM_PITCHING_SO 1 352.938525 1541 176676.2
## 50 + TEAM_PITCHING_HR:TEAM_FIELDING_DP 1 338.905567 1540 176337.3
## 51 + TEAM_BASERUN_SB:TEAM_BASERUN_CS 1 310.593928 1539 176026.7
## 52 + TEAM_BASERUN_CS:TEAM_PITCHING_HR 1 460.875181 1538 175565.8
## 53 - TEAM_BASERUN_SB:TEAM_BATTING_HBP 1 160.838823 1539 175726.6
## 54 - TEAM_BASERUN_CS:TEAM_PITCHING_SO 1 146.814584 1540 175873.5
## 55 + TEAM_BASERUN_SB:TEAM_FIELDING_DP 1 270.911471 1539 175602.5
## AIC
## 1 7997.654
## 2 7922.124
## 3 7882.305
## 4 7843.507
## 5 7818.930
## 6 7803.142
## 7 7791.887
## 8 7784.822
## 9 7777.234
## 10 7772.602
## 11 7762.565
## 12 7755.982
## 13 7747.938
## 14 7743.515
## 15 7735.493
## 16 7729.777
## 17 7720.353
## 18 7711.244
## 19 7707.799
## 20 7703.894
## 21 7700.656
## 22 7698.588
## 23 7693.361
## 24 7688.322
## 25 7682.025
## 26 7674.771
## 27 7665.699
## 28 7662.340
## 29 7659.059
## 30 7655.420
## 31 7650.142
## 32 7645.947
## 33 7644.012
## 34 7642.213
## 35 7639.799
## 36 7636.695
## 37 7634.283
## 38 7632.121
## 39 7630.115
## 40 7617.301
## 41 7614.598
## 42 7612.779
## 43 7611.198
## 44 7610.162
## 45 7608.890
## 46 7607.901
## 47 7607.446
## 48 7606.137
## 49 7604.958
## 50 7603.900
## 51 7603.091
## 52 7600.915
## 53 7600.374
## 54 7599.704
## 55 7599.248
# final model (based on the sqrt transformation)
# The formula is the "Final Model" reported by the stepwise search on the
# sqrt-transformed data: all main effects plus the selected two-way
# interactions. It is refitted on the training split with INDEX removed and
# the predictor columns listed in DV square-rooted.
finalModelSqrt <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
    TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR +
    TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP +
    TEAM_BATTING_BB:TEAM_FIELDING_DP + TEAM_BASERUN_SB:TEAM_PITCHING_HR +
    TEAM_BATTING_SO:TEAM_BASERUN_SB + TEAM_BATTING_2B:TEAM_FIELDING_DP +
    TEAM_BATTING_HBP:TEAM_PITCHING_H + TEAM_BATTING_H:TEAM_FIELDING_E +
    TEAM_BATTING_HR:TEAM_PITCHING_H + TEAM_PITCHING_H:TEAM_PITCHING_SO +
    TEAM_BATTING_2B:TEAM_BATTING_HR + TEAM_BATTING_3B:TEAM_PITCHING_SO +
    TEAM_BATTING_H:TEAM_BATTING_HR + TEAM_BASERUN_SB:TEAM_PITCHING_H +
    TEAM_PITCHING_SO:TEAM_FIELDING_E + TEAM_BATTING_BB:TEAM_FIELDING_E +
    TEAM_BATTING_SO:TEAM_PITCHING_SO + TEAM_BATTING_3B:TEAM_PITCHING_H +
    TEAM_BATTING_SO:TEAM_PITCHING_H + TEAM_PITCHING_H:TEAM_PITCHING_BB +
    TEAM_BATTING_H:TEAM_BATTING_3B + TEAM_BATTING_HR:TEAM_BASERUN_SB +
    TEAM_BATTING_BB:TEAM_BASERUN_SB + TEAM_BATTING_H:TEAM_PITCHING_SO +
    TEAM_BASERUN_CS:TEAM_PITCHING_BB + TEAM_BATTING_3B:TEAM_FIELDING_E +
    TEAM_BATTING_3B:TEAM_BASERUN_SB + TEAM_BATTING_HBP:TEAM_FIELDING_E +
    TEAM_FIELDING_E:TEAM_FIELDING_DP + TEAM_BASERUN_SB:TEAM_PITCHING_BB +
    TEAM_BATTING_SO:TEAM_FIELDING_E + TEAM_PITCHING_BB:TEAM_FIELDING_E +
    TEAM_BATTING_HR:TEAM_BATTING_BB + TEAM_BATTING_2B:TEAM_BATTING_HBP +
    TEAM_BATTING_2B:TEAM_BASERUN_SB + TEAM_BASERUN_SB:TEAM_PITCHING_SO +
    TEAM_PITCHING_HR:TEAM_FIELDING_DP + TEAM_BASERUN_SB:TEAM_BASERUN_CS +
    TEAM_BASERUN_CS:TEAM_PITCHING_HR + TEAM_BASERUN_SB:TEAM_FIELDING_DP,
  data = train %>%
    dplyr::select(-INDEX) %>%
    # character-vector selection: vars(DV) is ambiguous because it would pick
    # a column named DV if one existed
    dplyr::mutate_at(DV, sqrt)
)
broom::glance(finalModelSqrt)
## # A tibble: 1 x 11
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.543 0.528 10.7 34.6 1.49e-221 54 -6006. 12122. 12418.
## # ... with 2 more variables: deviance <dbl>, df.residual <int>
# coefficient table of the final model, smallest p-values first
finalModelSqrt %>%
  broom::tidy() %>%
  arrange(p.value)
## # A tibble: 54 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 TEAM_PITCHING_H:TEAM_PITCHING_BB -0.171 0.0218 -7.84 8.55e-15
## 2 TEAM_PITCHING_HR -68.1 9.38 -7.26 6.30e-13
## 3 TEAM_BATTING_BB:TEAM_FIELDING_E -0.860 0.119 -7.24 7.22e-13
## 4 TEAM_PITCHING_H:TEAM_PITCHING_SO 0.0863 0.0137 6.28 4.44e-10
## 5 TEAM_BATTING_3B:TEAM_PITCHING_H -0.226 0.0366 -6.18 7.95e-10
## 6 TEAM_BATTING_HR 64.3 10.6 6.07 1.58e- 9
## 7 TEAM_FIELDING_E -16.0 2.68 -5.96 3.06e- 9
## 8 TEAM_BATTING_HR:TEAM_PITCHING_H 1.25 0.217 5.73 1.20e- 8
## 9 TEAM_BATTING_H:TEAM_PITCHING_SO 0.360 0.0633 5.69 1.48e- 8
## 10 TEAM_BATTING_SO:TEAM_FIELDING_E 0.562 0.0990 5.68 1.62e- 8
## # ... with 44 more rows
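The final model (sqrt-transformed predictors plus the two-way interactions selected by the stepwise search) fits the training data considerably better than the untransformed full model: R-squared rises from 0.377 (fullModel) to 0.543 (finalModelSqrt), and adjusted R-squared from 0.371 to 0.528.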
# diagnostic
# draw the lm diagnostic plots in a 2 x 2 grid, then restore the previous
# graphics settings so that later base plots are not forced into the grid
oldPar <- par(mfrow = c(2, 2))
plot(finalModelSqrt)
par(oldPar)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
# plot predicted outcomes against actuals, with prediction intervals
# The test split gets the same preparation as the training data (INDEX
# removed, predictor columns in DV square-rooted). predict() is run once; its
# "fit" column supplies the point predictions and "lwr"/"upr" the interval
# bounds.
testSqrt <- test %>%
  dplyr::select(-INDEX) %>%
  dplyr::mutate_at(DV, sqrt)
test.pred <- predict(finalModelSqrt,
                     newdata = testSqrt,
                     interval = "prediction")
# predicted wins are rounded to whole numbers
testFinal <- test %>%
  dplyr::mutate(predictedOutcome = round(test.pred[, "fit"]))
# the unrounded fit is dropped; only the rounded prediction is kept
testCombined <- cbind(testFinal, test.pred) %>%
  dplyr::select(-fit)
test.subset <- testCombined %>%
  dplyr::select(target_wins = TARGET_WINS,
                predicted_wins = predictedOutcome,
                lwr,
                upr) %>%
  # keep only the rows with a positive predicted win count
  dplyr::filter(predicted_wins > 0)
p <- test.subset %>%
  ggplot(aes(target_wins, predicted_wins)) +
  geom_point()
# red = lower bound, green = upper bound, line = perfect prediction (y = x)
p + geom_point(aes(y = lwr), col = "red") +
  geom_point(aes(y = upr), col = "green") +
  geom_abline(intercept = 0, slope = 1)
# what is the correlation between predicted and actuals?
cor.test(
  x = test.subset$target_wins,
  y = test.subset$predicted_wins
)
##
## Pearson's product-moment correlation
##
## data: test.subset$target_wins and test.subset$predicted_wins
## t = 18.067, df = 681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5162526 0.6178620
## sample estimates:
## cor
## 0.5692268
# what is the RMSE of this final model (sqrt transformed)?
# root mean squared error between the actual and the predicted wins
caret::RMSE(
  obs = test.subset$target_wins,
  pred = test.subset$predicted_wins
)
## [1] 13.73734
# finally, let's apply it to our evaluation set
# start by counting the missing values in each evaluation column
dfEval %>%
  is.na() %>%
  colSums()
## INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 0 0 0 0
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 0 0 18 13
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 87 240 0 0
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 0 18 0 31
# let's impute missing values using the mice package
#   m = 1          : create a single imputed data set
#   maxit = 10     : number of iterations used to impute the missing values
#   method = "pmm" : predictive mean matching, chosen for all our numeric variables
#   seed = 1234    : fixed seed passed to mice
dfImpute2 <- mice::mice(
  data = dfEval,
  m = 1,
  maxit = 10,
  method = "pmm",
  seed = 1234
)
##
## iter imp variable
## 1 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 6 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 7 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 8 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 9 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 10 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## Warning: Number of logged events: 20
# score the evaluation set with the final model
# The imputed evaluation data gets the same preparation as the training data
# (INDEX removed, predictor columns in DV square-rooted). The rounded
# prediction is stored on the original dfEval, which still contains its NAs.
dfEvalClean <- mice::complete(dfImpute2)
dfEval$evalPredictedOutcome <- round(predict(
  finalModelSqrt,
  newdata = dfEvalClean %>%
    dplyr::select(-INDEX) %>%
    # character-vector selection: vars(DV) is ambiguous because it would pick
    # a column named DV if one existed
    dplyr::mutate_at(DV, sqrt),
  type = "response"
))
head(dfEval)
## INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 1 9 1209 170 33 83
## 2 10 1221 151 29 88
## 3 14 1395 183 29 93
## 4 47 1539 309 29 159
## 5 60 1445 203 68 5
## 6 63 1431 236 53 10
## TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## 1 447 1080 62 50
## 2 516 929 54 39
## 3 509 816 59 47
## 4 486 914 148 57
## 5 95 416 NA NA
## 6 215 377 NA NA
## TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## 1 NA 1209 83 447
## 2 NA 1221 88 516
## 3 NA 1395 93 509
## 4 42 1539 159 486
## 5 NA 3902 14 257
## 6 NA 2793 20 420
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP evalPredictedOutcome
## 1 1080 140 156 60
## 2 929 135 164 66
## 3 816 156 153 70
## 4 914 124 154 78
## 5 1123 616 130 53
## 6 736 572 105 74