Team Members
Nathan
Raymir
Kayo
# Packages this analysis depends on. visdat is listed because
# visdat::vis_dat() is called right after the data is imported; without it,
# a machine that lacks visdat would fail at that call.
packages <- c(
  "dplyr", "stargazer", "ggplot2", "Amelia", "ggcorrplot",
  "lmtest", "e1071", "MASS", "visdat"
)
# Install whichever packages are missing (single call, over HTTPS)
missing_pkgs <- setdiff(packages, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(
    missing_pkgs,
    repos = "https://cran.rstudio.com/",
    dependencies = TRUE
  )
}
# Attach every package. library() stops with an error when a package is
# unavailable, so no follow-up require() call is needed.
for (pkg in packages) {
  library(pkg, character.only = TRUE)
}
rm(packages, missing_pkgs, pkg) # Clean up the helper variables only
# The blanket rm(list = ls()) was dropped: it wiped every object in the
# caller's workspace, and nothing below relies on an empty environment.
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 919517 49.2 1758918 94 1393315 74.5
## Vcells 1607334 12.3 8388608 64 2780740 21.3
# Start from a clean display: close all graphics devices, clear the console
graphics.off()
cat("\f")
# Import the training data and visualize it
moneyball <- read.csv(file = "moneyball-training-data.csv")
visdat::vis_dat(moneyball)
# Strip every "TEAM_" occurrence from the column names so that the model
# formulas are shorter and easier to read
names(moneyball) <- gsub(
  pattern = "TEAM_",
  replacement = "",
  x = names(moneyball)
)
# Map the missing values, then keep only the rows with no NA in any column
Amelia::missmap(moneyball)
moneyball <- na.omit(object = moneyball)
# Preview the first rows of the cleaned data
head(moneyball)
## INDEX TARGET_WINS BATTING_H BATTING_2B BATTING_3B BATTING_HR BATTING_BB
## 38 41 82 1574 309 34 236 608
## 39 42 75 1447 275 26 158 494
## 40 43 99 1603 333 32 152 462
## 41 44 77 1473 276 33 150 476
## 42 45 92 1603 272 37 162 450
## 43 46 95 1520 278 30 147 447
## BATTING_SO BASERUN_SB BASERUN_CS BATTING_HBP PITCHING_H PITCHING_HR
## 38 1024 93 52 47 1574 236
## 39 1001 116 52 77 1447 158
## 40 805 117 51 74 1603 152
## 41 838 129 61 56 1473 150
## 42 942 143 46 73 1603 162
## 43 848 161 57 29 1520 147
## PITCHING_BB PITCHING_SO FIELDING_E FIELDING_DP
## 38 608 1024 134 184
## 39 494 1001 103 142
## 40 462 805 87 151
## 41 476 838 105 138
## 42 450 942 90 126
## 43 447 848 87 139
# Summary statistics for every column, printed as text with zero decimals
stargazer(
  moneyball,
  type = "text",
  digits = 0
)
##
## ==========================================
## Statistic N Mean St. Dev. Min Max
## ------------------------------------------
## INDEX 191 1,384 765 41 2,534
## TARGET_WINS 191 81 12 43 116
## BATTING_H 191 1,479 76 1,308 1,667
## BATTING_2B 191 297 26 201 373
## BATTING_3B 191 31 9 12 61
## BATTING_HR 191 178 32 116 260
## BATTING_BB 191 543 75 365 775
## BATTING_SO 191 1,051 104 805 1,399
## BASERUN_SB 191 91 30 31 177
## BASERUN_CS 191 40 12 12 74
## BATTING_HBP 191 59 13 29 95
## PITCHING_H 191 1,480 76 1,312 1,667
## PITCHING_HR 191 178 32 116 260
## PITCHING_BB 191 544 75 367 775
## PITCHING_SO 191 1,052 104 805 1,399
## FIELDING_E 191 107 17 65 145
## FIELDING_DP 191 152 18 113 204
## ------------------------------------------
# Create a linear regression model that predicts target wins from pitching
# and batting home runs
moneyball_pos_model <- lm(TARGET_WINS ~ PITCHING_HR + BATTING_HR, data = moneyball)
# Summarize the fitted model. The previous call summarized the raw data
# frame (summary(moneyball)) instead, so the model fit was never reported.
# NOTE(review): the captured output printed after this call was produced by
# the old summary(moneyball) call; re-run the script to refresh it.
summary(moneyball_pos_model)
## INDEX TARGET_WINS BATTING_H BATTING_2B
## Min. : 41.0 Min. : 43.00 Min. :1308 Min. :201.0
## 1st Qu.: 824.5 1st Qu.: 71.50 1st Qu.:1426 1st Qu.:279.5
## Median :1380.0 Median : 82.00 Median :1477 Median :296.0
## Mean :1383.6 Mean : 80.93 Mean :1479 Mean :297.2
## 3rd Qu.:2092.0 3rd Qu.: 90.00 3rd Qu.:1524 3rd Qu.:312.5
## Max. :2534.0 Max. :116.00 Max. :1667 Max. :373.0
## BATTING_3B BATTING_HR BATTING_BB BATTING_SO
## Min. :12.00 Min. :116.0 Min. :365.0 Min. : 805
## 1st Qu.:24.00 1st Qu.:152.5 1st Qu.:492.0 1st Qu.: 982
## Median :29.00 Median :175.0 Median :535.0 Median :1050
## Mean :30.74 Mean :178.1 Mean :543.3 Mean :1051
## 3rd Qu.:36.00 3rd Qu.:199.5 3rd Qu.:595.0 3rd Qu.:1107
## Max. :61.00 Max. :260.0 Max. :775.0 Max. :1399
## BASERUN_SB BASERUN_CS BATTING_HBP PITCHING_H
## Min. : 31.00 Min. :12.00 Min. :29.00 Min. :1312
## 1st Qu.: 67.50 1st Qu.:32.00 1st Qu.:50.50 1st Qu.:1430
## Median : 87.00 Median :38.00 Median :58.00 Median :1480
## Mean : 90.91 Mean :39.94 Mean :59.36 Mean :1480
## 3rd Qu.:110.00 3rd Qu.:48.00 3rd Qu.:67.00 3rd Qu.:1526
## Max. :177.00 Max. :74.00 Max. :95.00 Max. :1667
## PITCHING_HR PITCHING_BB PITCHING_SO FIELDING_E FIELDING_DP
## Min. :116.0 Min. :367.0 Min. : 805 Min. : 65.0 Min. :113.0
## 1st Qu.:152.5 1st Qu.:492.0 1st Qu.: 982 1st Qu.: 95.0 1st Qu.:139.0
## Median :175.0 Median :537.0 Median :1052 Median :106.0 Median :152.0
## Mean :178.2 Mean :543.7 Mean :1052 Mean :107.1 Mean :152.3
## 3rd Qu.:200.0 3rd Qu.:595.0 3rd Qu.:1108 3rd Qu.:118.0 3rd Qu.:165.0
## Max. :260.0 Max. :775.0 Max. :1399 Max. :145.0 Max. :204.0
# Draw the plots for the fitted home-run model
plot(moneyball_pos_model)
# List the column names of the cleaned data
colnames(moneyball)
## [1] "INDEX" "TARGET_WINS" "BATTING_H" "BATTING_2B" "BATTING_3B"
## [6] "BATTING_HR" "BATTING_BB" "BATTING_SO" "BASERUN_SB" "BASERUN_CS"
## [11] "BATTING_HBP" "PITCHING_H" "PITCHING_HR" "PITCHING_BB" "PITCHING_SO"
## [16] "FIELDING_E" "FIELDING_DP"
# Attach the home-run model's predictions to the data and keep only the
# columns needed to compare actual TARGET_WINS with the predictions
moneyball_pos_predictions <- dplyr::select(
  dplyr::mutate(
    moneyball,
    predictions = predict(moneyball_pos_model, newdata = moneyball)
  ),
  INDEX, TARGET_WINS, PITCHING_HR, BATTING_HR, predictions
)
# Line graphs of actual (dark red) vs predicted (dark blue) target wins:
# first against batting home runs, then against pitching home runs
ggplot(data = moneyball_pos_predictions, mapping = aes(x = BATTING_HR)) +
  geom_line(mapping = aes(y = TARGET_WINS), color = "darkred") +
  geom_line(mapping = aes(y = predictions), color = "darkblue") +
  labs(
    title = "Actual vs Predicted Target Wins",
    x = "Team Batting Homeruns",
    y = "Target Wins"
  )
ggplot(data = moneyball_pos_predictions, mapping = aes(x = PITCHING_HR)) +
  geom_line(mapping = aes(y = TARGET_WINS), color = "darkred") +
  geom_line(mapping = aes(y = predictions), color = "darkblue") +
  labs(
    title = "Actual vs Predicted Target Wins",
    x = "Team Pitching Homeruns",
    y = "Target Wins"
  )
# Linear model: target wins explained by fielding errors and hits allowed
moneyball_neg_model <- lm(
  formula = TARGET_WINS ~ FIELDING_E + PITCHING_H,
  data = moneyball
)
# Report the coefficients and fit statistics of that model
summary(moneyball_neg_model)
##
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_E + PITCHING_H, data = moneyball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.0697 -7.2835 0.7463 6.8837 25.5026
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.80236 16.85662 0.522 0.602
## FIELDING_E -0.20873 0.04595 -4.543 9.91e-06 ***
## PITCHING_H 0.06384 0.01008 6.332 1.74e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.2 on 188 degrees of freedom
## Multiple R-squared: 0.299, Adjusted R-squared: 0.2916
## F-statistic: 40.1 on 2 and 188 DF, p-value: 3.14e-15
# Model diagnostics for the fielding-errors / hits-allowed model
plot(moneyball_neg_model)
# Predict target wins from fielding errors and pitching hits (negative
# theoretical effects) and keep only the columns needed for the comparison
moneyball_neg_predictions <- dplyr::select(
  dplyr::mutate(
    moneyball,
    predictions = predict(moneyball_neg_model, newdata = moneyball)
  ),
  INDEX, TARGET_WINS, FIELDING_E, PITCHING_H, predictions
)
# Actual (dark red) vs predicted (dark blue) wins against fielding errors
ggplot(data = moneyball_neg_predictions, mapping = aes(x = FIELDING_E)) +
  geom_line(mapping = aes(y = TARGET_WINS), color = "darkred") +
  geom_line(mapping = aes(y = predictions), color = "darkblue") +
  labs(
    title = "Actual vs Predicted Target Wins",
    x = "Team Fielding Errors",
    y = "Target Wins"
  )
# Correlation matrix of columns 2 through 17 (every column besides INDEX)
moneyball_cor <- cor(moneyball[, 2:17])
# Lower-triangle correlogram with the coefficients printed in each square
# NOTE(review): the x-axis text size is 1 while the y-axis uses 12 — confirm
# that such small x labels are intended
ggcorrplot(
  corr = moneyball_cor,
  method = "square",
  type = "lower",
  hc.order = TRUE,
  lab = TRUE,
  lab_size = 2,
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlation Matrix",
  ggtheme = ggplot2::theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1, size = 1),
      axis.text.y = element_text(size = 12),
      plot.margin = unit(rep(1, 4), "cm") # 1 cm margin on every side
    )
)
# The matrix is only needed for the plot above
rm(moneyball_cor)
# Histogram of the frequency of team batting home runs
ggplot(data = moneyball, mapping = aes(x = BATTING_HR)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
BATTING_HR and PITCHING_HR have a
significant positive impact on TARGET_WINS, indicating that
teams with higher home run statistics tend to win more games.
In the actual vs. predicted plots, the moneyball_pos_predictions
line is compared against the TARGET_WINS line. The
prediction line increases steadily, and the actual
TARGET_WINS values follow the same trend, with higher
wins on average. On average, when we look at the relationship between the two negatively impacting variables (fielding errors and pitching hits) and wins, we see that the predicted target wins are significantly lower than the actual target wins.
When you compare the moneyball_neg_predictions line to the
TARGET_WINS line, you get an inconsistent prediction line
that is downward sloping.
Overall, the final point of both lines is significantly lower than the first point. This negative relationship demonstrates the negative effect of fielding errors and pitching hits on the team’s overall wins.
The correlation matrix reveals that BATTING_HR and
PITCHING_HR have the highest positive correlation with
TARGET_WINS, while BATTING_H,
PITCHING_H, and FIELDING_E have a
significantly lower or negative correlation with
TARGET_WINS. This demonstrates the contrast between the
two groups.
The residual plots for the linear regressions of both the positive
and negative correlation models suggest some heteroscedasticity, which is
apparent in the wider spread of the residuals on the y-axis. However,
eyeballing isn’t enough: when you use the Breusch-Pagan (BP) test to test
for homoscedasticity, both p-values are greater than .05, which
means there isn’t enough evidence to reject the null hypothesis
of homoscedasticity for either model.
# Breusch-Pagan test: is heteroscedasticity present in the home-run model?
lmtest::bptest(moneyball_pos_model)
##
## studentized Breusch-Pagan test
##
## data: moneyball_pos_model
## BP = 0.93579, df = 2, p-value = 0.6263
# Same Breusch-Pagan test for the fielding-errors / hits-allowed model
lmtest::bptest(moneyball_neg_model)
##
## studentized Breusch-Pagan test
##
## data: moneyball_neg_model
## BP = 5.0905, df = 2, p-value = 0.07845
The distribution of BATTING_HR is slightly right-skewed. This
implies that team batting home runs have a longer tail toward higher
values, but not excessively.
# Skewness of team batting home runs
e1071::skewness(moneyball[["BATTING_HR"]])
## [1] 0.2980673
# Kurtosis of team batting home runs
e1071::kurtosis(moneyball[["BATTING_HR"]])
## [1] -0.7172373
The negative kurtosis of BATTING_HR (about -0.72) means that outliers
are extremely infrequent, which adds to the basis for using this
distribution as a positive effect.
stargazer(moneyball_pos_predictions, type="text")
##
## ================================================
## Statistic N Mean St. Dev. Min Max
## ------------------------------------------------
## INDEX 191 1,383.592 765.240 41 2,534
## TARGET_WINS 191 80.927 12.115 43 116
## PITCHING_HR 191 178.178 32.392 116 260
## BATTING_HR 191 178.052 32.413 116 260
## predictions 191 80.927 5.118 71.092 93.851
## ------------------------------------------------
# Summary statistics for the negative-effects predictions table
stargazer(
  moneyball_neg_predictions,
  type = "text"
)
##
## ================================================
## Statistic N Mean St. Dev. Min Max
## ------------------------------------------------
## INDEX 191 1,383.592 765.240 41 2,534
## TARGET_WINS 191 80.927 12.115 43 116
## FIELDING_E 191 107.052 16.632 65 145
## PITCHING_H 191 1,479.702 75.789 1,312 1,667
## predictions 191 80.927 6.625 63.761 98.413
## ------------------------------------------------
# Drop the two small models now that they are no longer used
rm(list = c("moneyball_neg_model", "moneyball_pos_model"))
# Column names 3 through 17: the candidate predictors for the full model
colnames(moneyball)[3:17]
## [1] "BATTING_H" "BATTING_2B" "BATTING_3B" "BATTING_HR" "BATTING_BB"
## [6] "BATTING_SO" "BASERUN_SB" "BASERUN_CS" "BATTING_HBP" "PITCHING_H"
## [11] "PITCHING_HR" "PITCHING_BB" "PITCHING_SO" "FIELDING_E" "FIELDING_DP"
# Full ("kitchen sink") model: target wins regressed on all fifteen
# batting, baserunning, pitching and fielding columns
kitchen_sink_model <- lm(
  formula = TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B + BATTING_HR +
    BATTING_BB + BATTING_SO + BASERUN_SB + BASERUN_CS + BATTING_HBP +
    PITCHING_H + PITCHING_HR + PITCHING_BB + PITCHING_SO + FIELDING_E +
    FIELDING_DP,
  data = moneyball
)
# Diagnostics for the full model
plot(kitchen_sink_model)
# Backward stepwise selection by AIC, starting from the full model
best_kitchen_sink_model <- MASS::stepAIC(
  kitchen_sink_model,
  direction = "backward"
)
## Start: AIC=831.31
## TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B + BATTING_HR +
## BATTING_BB + BATTING_SO + BASERUN_SB + BASERUN_CS + BATTING_HBP +
## PITCHING_H + PITCHING_HR + PITCHING_BB + PITCHING_SO + FIELDING_E +
## FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - BATTING_SO 1 1.24 12547 829.33
## - PITCHING_SO 1 1.48 12547 829.33
## - BASERUN_CS 1 1.71 12548 829.34
## - BATTING_HR 1 15.23 12561 829.54
## - PITCHING_HR 1 15.79 12562 829.55
## - PITCHING_H 1 33.63 12580 829.82
## - BATTING_H 1 34.42 12580 829.83
## - BATTING_2B 1 54.41 12600 830.14
## - BASERUN_SB 1 95.22 12641 830.76
## - BATTING_BB 1 107.84 12654 830.95
## - PITCHING_BB 1 110.48 12656 830.99
## - BATTING_3B 1 122.16 12668 831.16
## <none> 12546 831.31
## - BATTING_HBP 1 198.21 12744 832.31
## - FIELDING_DP 1 628.49 13174 838.65
## - FIELDING_E 1 1237.79 13784 847.28
##
## Step: AIC=829.33
## TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B + BATTING_HR +
## BATTING_BB + BASERUN_SB + BASERUN_CS + BATTING_HBP + PITCHING_H +
## PITCHING_HR + PITCHING_BB + PITCHING_SO + FIELDING_E + FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - BASERUN_CS 1 1.59 12549 827.35
## - BATTING_HR 1 15.82 12563 827.57
## - PITCHING_HR 1 16.39 12564 827.58
## - BATTING_2B 1 53.47 12601 828.14
## - PITCHING_H 1 88.45 12636 828.67
## - BATTING_H 1 90.30 12637 828.70
## - BASERUN_SB 1 94.19 12641 828.76
## - BATTING_BB 1 107.95 12655 828.97
## - PITCHING_BB 1 110.60 12658 829.01
## - BATTING_3B 1 122.20 12669 829.18
## <none> 12547 829.33
## - BATTING_HBP 1 197.11 12744 830.31
## - FIELDING_DP 1 630.68 13178 836.70
## - FIELDING_E 1 1240.80 13788 845.34
## - PITCHING_SO 1 1312.89 13860 846.34
##
## Step: AIC=827.35
## TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B + BATTING_HR +
## BATTING_BB + BASERUN_SB + BATTING_HBP + PITCHING_H + PITCHING_HR +
## PITCHING_BB + PITCHING_SO + FIELDING_E + FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - BATTING_HR 1 16.06 12565 825.60
## - PITCHING_HR 1 16.64 12565 825.61
## - BATTING_2B 1 53.05 12602 826.16
## - PITCHING_H 1 90.24 12639 826.72
## - BATTING_H 1 92.13 12641 826.75
## - BATTING_BB 1 110.31 12659 827.03
## - PITCHING_BB 1 113.00 12662 827.07
## - BASERUN_SB 1 123.42 12672 827.22
## - BATTING_3B 1 129.33 12678 827.31
## <none> 12549 827.35
## - BATTING_HBP 1 197.23 12746 828.33
## - FIELDING_DP 1 635.62 13184 834.79
## - PITCHING_SO 1 1311.88 13861 844.35
## - FIELDING_E 1 1322.05 13871 844.49
##
## Step: AIC=825.6
## TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B + BATTING_BB +
## BASERUN_SB + BATTING_HBP + PITCHING_H + PITCHING_HR + PITCHING_BB +
## PITCHING_SO + FIELDING_E + FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - BATTING_2B 1 55.48 12620 824.44
## - PITCHING_H 1 89.26 12654 824.95
## - BATTING_H 1 91.97 12657 824.99
## - BATTING_BB 1 104.58 12669 825.18
## - PITCHING_BB 1 107.19 12672 825.22
## <none> 12565 825.60
## - BATTING_3B 1 137.48 12702 825.68
## - BASERUN_SB 1 146.90 12712 825.82
## - BATTING_HBP 1 200.36 12765 826.62
## - FIELDING_DP 1 628.95 13194 832.93
## - PITCHING_HR 1 853.54 13418 836.15
## - PITCHING_SO 1 1316.68 13882 842.63
## - FIELDING_E 1 1333.15 13898 842.86
##
## Step: AIC=824.44
## TARGET_WINS ~ BATTING_H + BATTING_3B + BATTING_BB + BASERUN_SB +
## BATTING_HBP + PITCHING_H + PITCHING_HR + PITCHING_BB + PITCHING_SO +
## FIELDING_E + FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - PITCHING_H 1 84.47 12705 823.71
## - BATTING_H 1 87.79 12708 823.76
## - BATTING_BB 1 98.92 12719 823.93
## - PITCHING_BB 1 101.48 12722 823.97
## - BASERUN_SB 1 109.27 12730 824.09
## <none> 12620 824.44
## - BATTING_3B 1 147.01 12767 824.65
## - BATTING_HBP 1 204.39 12825 825.51
## - FIELDING_DP 1 649.12 13269 832.02
## - PITCHING_HR 1 812.92 13433 834.36
## - PITCHING_SO 1 1262.90 13883 840.66
## - FIELDING_E 1 1379.34 14000 842.25
##
## Step: AIC=823.71
## TARGET_WINS ~ BATTING_H + BATTING_3B + BATTING_BB + BASERUN_SB +
## BATTING_HBP + PITCHING_HR + PITCHING_BB + PITCHING_SO + FIELDING_E +
## FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - BATTING_BB 1 32.85 12738 822.21
## - PITCHING_BB 1 43.42 12748 822.37
## - BASERUN_SB 1 105.16 12810 823.29
## <none> 12705 823.71
## - BATTING_3B 1 153.13 12858 824.00
## - BATTING_HBP 1 183.82 12888 824.46
## - BATTING_H 1 504.11 13209 829.15
## - FIELDING_DP 1 602.80 13308 830.57
## - PITCHING_HR 1 850.25 13555 834.09
## - PITCHING_SO 1 1259.72 13964 839.77
## - FIELDING_E 1 1419.39 14124 841.94
##
## Step: AIC=822.21
## TARGET_WINS ~ BATTING_H + BATTING_3B + BASERUN_SB + BATTING_HBP +
## PITCHING_HR + PITCHING_BB + PITCHING_SO + FIELDING_E + FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - BASERUN_SB 1 109.99 12848 821.85
## <none> 12738 822.21
## - BATTING_3B 1 156.45 12894 822.54
## - BATTING_HBP 1 186.58 12924 822.98
## - BATTING_H 1 485.67 13223 827.35
## - FIELDING_DP 1 623.19 13361 829.33
## - PITCHING_HR 1 843.83 13581 832.46
## - PITCHING_SO 1 1267.25 14005 838.32
## - FIELDING_E 1 1395.02 14133 840.06
## - PITCHING_BB 1 2364.81 15102 852.73
##
## Step: AIC=821.85
## TARGET_WINS ~ BATTING_H + BATTING_3B + BATTING_HBP + PITCHING_HR +
## PITCHING_BB + PITCHING_SO + FIELDING_E + FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - BATTING_3B 1 133.47 12981 821.82
## <none> 12848 821.85
## - BATTING_HBP 1 177.11 13025 822.46
## - BATTING_H 1 566.11 13414 828.09
## - FIELDING_DP 1 737.46 13585 830.51
## - PITCHING_HR 1 756.49 13604 830.78
## - PITCHING_SO 1 1257.91 14106 837.69
## - FIELDING_E 1 1330.40 14178 838.67
## - PITCHING_BB 1 2371.12 15219 852.20
##
## Step: AIC=821.82
## TARGET_WINS ~ BATTING_H + BATTING_HBP + PITCHING_HR + PITCHING_BB +
## PITCHING_SO + FIELDING_E + FIELDING_DP
##
## Df Sum of Sq RSS AIC
## <none> 12981 821.82
## - BATTING_HBP 1 228.70 13210 823.16
## - BATTING_H 1 449.87 13431 826.33
## - FIELDING_DP 1 813.17 13794 831.43
## - PITCHING_HR 1 990.20 13971 833.86
## - PITCHING_SO 1 1316.56 14298 838.27
## - FIELDING_E 1 1334.60 14316 838.52
## - PITCHING_BB 1 2583.00 15564 854.49