In this homework assignment, we are exploring, analyzing and modelling a data set containing approximately 2200 records. Each record represents a professional baseball team from the years 1871 to 2006 inclusive. Each record has the performance of the team for the given year, with all of the statistics adjusted to match the performance of a 162 game season.
#install.packages("mice")
#install.packages("cowplot")
library(ggplot2)
library(mice)
## Warning: package 'mice' was built under R version 3.6.2
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.1 v purrr 0.3.2
## v tidyr 0.8.3 v dplyr 0.8.1
## v readr 1.3.1 v stringr 1.4.0
## v tibble 2.1.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------------------------------- tidyverse_conflicts() --
## x tidyr::complete() masks mice::complete()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(cowplot)
## Warning: package 'cowplot' was built under R version 3.6.2
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
library(VIM)
## Warning: package 'VIM' was built under R version 3.6.2
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Load the training data; `row.names = 1` takes the row names from the first
# column of the CSV instead of keeping it as a variable.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
moneyball_df <- read.csv(
  file = "D:\\MS Data Science\\CUNY\\CUNY\\CUNY MSDS\\Spring 2020\\DATA 621\\Homework-1\\moneyball-training-data.csv",
  header = TRUE,
  row.names = 1
)
str(moneyball_df)
## 'data.frame': 2276 obs. of 16 variables:
## $ TARGET_WINS : int 39 70 86 70 82 75 80 85 86 76 ...
## $ TEAM_BATTING_H : int 1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
## $ TEAM_BATTING_2B : int 194 219 232 209 186 200 179 171 197 213 ...
## $ TEAM_BATTING_3B : int 39 22 35 38 27 36 54 37 40 18 ...
## $ TEAM_BATTING_HR : int 13 190 137 96 102 92 122 115 114 96 ...
## $ TEAM_BATTING_BB : int 143 685 602 451 472 443 525 456 447 441 ...
## $ TEAM_BATTING_SO : int 842 1075 917 922 920 973 1062 1027 922 827 ...
## $ TEAM_BASERUN_SB : int NA 37 46 43 49 107 80 40 69 72 ...
## $ TEAM_BASERUN_CS : int NA 28 27 30 39 59 54 36 27 34 ...
## $ TEAM_BATTING_HBP: int NA NA NA NA NA NA NA NA NA NA ...
## $ TEAM_PITCHING_H : int 9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
## $ TEAM_PITCHING_HR: int 84 191 137 97 102 92 122 116 114 96 ...
## $ TEAM_PITCHING_BB: int 927 689 602 454 472 443 525 459 447 441 ...
## $ TEAM_PITCHING_SO: int 5456 1082 917 928 920 973 1062 1033 922 827 ...
## $ TEAM_FIELDING_E : int 1011 193 175 164 138 123 136 112 127 131 ...
## $ TEAM_FIELDING_DP: int NA 155 153 156 168 149 186 136 169 159 ...
dim(moneyball_df)
## [1] 2276 16
colnames(moneyball_df)
## [1] "TARGET_WINS" "TEAM_BATTING_H" "TEAM_BATTING_2B"
## [4] "TEAM_BATTING_3B" "TEAM_BATTING_HR" "TEAM_BATTING_BB"
## [7] "TEAM_BATTING_SO" "TEAM_BASERUN_SB" "TEAM_BASERUN_CS"
## [10] "TEAM_BATTING_HBP" "TEAM_PITCHING_H" "TEAM_PITCHING_HR"
## [13] "TEAM_PITCHING_BB" "TEAM_PITCHING_SO" "TEAM_FIELDING_E"
## [16] "TEAM_FIELDING_DP"
summary(moneyball_df)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## Min. : 0.00 Min. : 891 Min. : 69.0 Min. : 0.00
## 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0 1st Qu.: 34.00
## Median : 82.00 Median :1454 Median :238.0 Median : 47.00
## Mean : 80.79 Mean :1469 Mean :241.2 Mean : 55.25
## 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0 3rd Qu.: 72.00
## Max. :146.00 Max. :2554 Max. :458.0 Max. :223.00
##
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0 1st Qu.: 66.0
## Median :102.00 Median :512.0 Median : 750.0 Median :101.0
## Mean : 99.61 Mean :501.6 Mean : 735.6 Mean :124.8
## 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0 3rd Qu.:156.0
## Max. :264.00 Max. :878.0 Max. :1399.0 Max. :697.0
## NA's :102 NA's :131
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## Min. : 0.0 Min. :29.00 Min. : 1137 Min. : 0.0
## 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419 1st Qu.: 50.0
## Median : 49.0 Median :58.00 Median : 1518 Median :107.0
## Mean : 52.8 Mean :59.36 Mean : 1779 Mean :105.7
## 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682 3rd Qu.:150.0
## Max. :201.0 Max. :95.00 Max. :30132 Max. :343.0
## NA's :772 NA's :2085
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## Min. : 0.0 Min. : 0.0 Min. : 65.0 Min. : 52.0
## 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0 1st Qu.:131.0
## Median : 536.5 Median : 813.5 Median : 159.0 Median :149.0
## Mean : 553.0 Mean : 817.7 Mean : 246.5 Mean :146.4
## 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2 3rd Qu.:164.0
## Max. :3645.0 Max. :19278.0 Max. :1898.0 Max. :228.0
## NA's :102 NA's :286
Missing data in columns:
### Count by features
# Number of missing values in each column, as a named integer vector.
vapply(moneyball_df, function(col) sum(is.na(col)), integer(1))
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 0 0 0 0
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 0 0 102 131
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 772 2085 0 0
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 0 102 0 286
### Percentage by features
# Percentage (0-100) of the entries of `x` that are NA.
pMiss <- function(x) {
  n_missing <- sum(is.na(x))
  n_missing / length(x) * 100
}
# Percentage of missing values in each column, as a named numeric vector.
vapply(moneyball_df, pMiss, numeric(1))
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 0.000000 0.000000 0.000000 0.000000
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 0.000000 0.000000 4.481547 5.755712
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 33.919156 91.608084 0.000000 0.000000
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 0.000000 4.481547 0.000000 12.565905
Because TEAM_BATTING_HBP is missing for 2085 of the 2276 observations (about 92% of the data), it is reasonable to remove this feature from our dataset. Before doing so, we check its relationship with the target by drawing a scatter plot of this feature against TARGET_WINS:
# Scatter plot of TARGET_WINS against TEAM_BATTING_HBP with the default
# geom_smooth() trend line layered on top.
gs6b <- ggplot(
  data = moneyball_df,
  mapping = aes(x = TEAM_BATTING_HBP, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth()
gs6b
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2085 rows containing non-finite values (stat_smooth).
## Warning: Removed 2085 rows containing missing values (geom_point).
Plotting scatter plots of the dependent variable against each of the independent variables:
### Drawing some plots
# Build one scatter plot (points plus the default geom_smooth() trend line) of
# TARGET_WINS against the moneyball_df column named by `predictor`.
scatter_vs_wins <- function(predictor) {
  ggplot(
    data = moneyball_df,
    mapping = aes(x = .data[[predictor]], y = TARGET_WINS)
  ) +
    geom_point() +
    geom_smooth() +
    labs(x = predictor) # keep the plain column name as the axis title
}

# Batting statistics
gs1 <- scatter_vs_wins("TEAM_BATTING_H")
gs2 <- scatter_vs_wins("TEAM_BATTING_2B")
gs3 <- scatter_vs_wins("TEAM_BATTING_3B")
gs4 <- scatter_vs_wins("TEAM_BATTING_HR")
gs5 <- scatter_vs_wins("TEAM_BATTING_BB")
gs6 <- scatter_vs_wins("TEAM_BATTING_HBP")
gs7 <- scatter_vs_wins("TEAM_BATTING_SO")

# Base-running statistics
gs8 <- scatter_vs_wins("TEAM_BASERUN_SB")
gs9 <- scatter_vs_wins("TEAM_BASERUN_CS")

# Fielding statistics
gs10 <- scatter_vs_wins("TEAM_FIELDING_E")
gs11 <- scatter_vs_wins("TEAM_FIELDING_DP")

# Pitching statistics
gs12 <- scatter_vs_wins("TEAM_PITCHING_BB")
gs13 <- scatter_vs_wins("TEAM_PITCHING_H")
gs14 <- scatter_vs_wins("TEAM_PITCHING_HR")
gs15 <- scatter_vs_wins("TEAM_PITCHING_SO")
plot_grid(gs1, gs2, labels = "AUTO")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
plot_grid(gs3, gs4, labels = "AUTO")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
plot_grid(gs5, gs6, labels = "AUTO")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 2085 rows containing non-finite values (stat_smooth).
## Warning: Removed 2085 rows containing missing values (geom_point).
plot_grid(gs7, gs8, labels = "AUTO")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## Warning: Removed 102 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 131 rows containing non-finite values (stat_smooth).
## Warning: Removed 131 rows containing missing values (geom_point).
plot_grid(gs9, gs10, labels = "AUTO")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 772 rows containing non-finite values (stat_smooth).
## Warning: Removed 772 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
plot_grid(gs11, gs12, labels = "AUTO")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 286 rows containing non-finite values (stat_smooth).
## Warning: Removed 286 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
plot_grid(gs13, gs14, labels = "AUTO")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
plot_grid(gs15, labels = "AUTO")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## Warning: Removed 102 rows containing missing values (geom_point).
Plotting box plots to see the data range of each numeric variable:
# Build one box plot of the moneyball_df column named by `column`, drawn on the
# y axis.
box_of <- function(column) {
  ggplot(data = moneyball_df, mapping = aes(y = .data[[column]])) +
    geom_boxplot() +
    labs(y = column) # keep the plain column name as the axis title
}

gb1 <- box_of("TARGET_WINS")
gb2 <- box_of("TEAM_BATTING_H")
plot_grid(gb1, gb2, labels = "AUTO")
gb3 <- box_of("TEAM_BATTING_2B")
gb4 <- box_of("TEAM_BATTING_3B")
plot_grid(gb3, gb4, labels = "AUTO")
gb5 <- box_of("TEAM_BATTING_HR")
gb6 <- box_of("TEAM_BATTING_BB")
plot_grid(gb5, gb6, labels = "AUTO")
gb7 <- box_of("TEAM_BATTING_HBP")
gb8 <- box_of("TEAM_BATTING_SO")
plot_grid(gb7, gb8, labels = "AUTO")
## Warning: Removed 2085 rows containing non-finite values (stat_boxplot).
## Warning: Removed 102 rows containing non-finite values (stat_boxplot).
gb9 <- box_of("TEAM_BASERUN_SB")
gb10 <- box_of("TEAM_BASERUN_CS")
plot_grid(gb9, gb10, labels = "AUTO")
## Warning: Removed 131 rows containing non-finite values (stat_boxplot).
## Warning: Removed 772 rows containing non-finite values (stat_boxplot).
gb11 <- box_of("TEAM_FIELDING_E")
gb12 <- box_of("TEAM_FIELDING_DP")
plot_grid(gb11, gb12, labels = "AUTO")
## Warning: Removed 286 rows containing non-finite values (stat_boxplot).
gb13 <- box_of("TEAM_PITCHING_BB")
gb14 <- box_of("TEAM_PITCHING_H")
plot_grid(gb13, gb14, labels = "AUTO")
gb15 <- box_of("TEAM_PITCHING_HR")
gb16 <- box_of("TEAM_PITCHING_SO")
plot_grid(gb15, gb16, labels = "AUTO")
## Warning: Removed 102 rows containing non-finite values (stat_boxplot).
Plotting the histogram for the dependent variable:
### Histograms:
# Histogram of TARGET_WINS with a dashed blue vertical line at its mean.
ggplot(data = moneyball_df, mapping = aes(x = TARGET_WINS)) +
  geom_histogram() +
  geom_vline(
    mapping = aes(xintercept = mean(TARGET_WINS)),
    colour = "blue",
    linetype = "dashed",
    size = 1
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Removing the feature TEAM_BATTING_HBP, as it is missing for about 92% of the observations:
# Modelling data set: every column of moneyball_df except TEAM_BATTING_HBP.
moneyball_df_model <- dplyr::select(moneyball_df, -TEAM_BATTING_HBP)
Plotting the missing data pattern:
# Plot the proportion of missing values per variable and the combinations in
# which they occur. The axis labels are taken from the data frame being
# plotted; the previous `names(data)` referred to the `data()` function (no
# object called `data` exists here), so it evaluated to NULL.
aggr_plot <- aggr(moneyball_df_model, col = c("navyblue", "red"),
                  numbers = TRUE, sortVars = TRUE,
                  labels = names(moneyball_df_model), cex.axis = .7, gap = 3,
                  ylab = c("Histogram of missing data", "Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## TEAM_BASERUN_CS 0.33919156
## TEAM_FIELDING_DP 0.12565905
## TEAM_BASERUN_SB 0.05755712
## TEAM_BATTING_SO 0.04481547
## TEAM_PITCHING_SO 0.04481547
## TARGET_WINS 0.00000000
## TEAM_BATTING_H 0.00000000
## TEAM_BATTING_2B 0.00000000
## TEAM_BATTING_3B 0.00000000
## TEAM_BATTING_HR 0.00000000
## TEAM_BATTING_BB 0.00000000
## TEAM_PITCHING_H 0.00000000
## TEAM_PITCHING_HR 0.00000000
## TEAM_PITCHING_BB 0.00000000
## TEAM_FIELDING_E 0.00000000
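Imputing the missing data for the other 5 features with missing values (TEAM_BATTING_SO, TEAM_BASERUN_SB, TEAM_BASERUN_CS, TEAM_PITCHING_SO and TEAM_FIELDING_DP), which we are keeping for our analysis: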
# Run mice with method "pmm" for 5 iterations, producing a single imputed data
# set (m = 1); the seed is fixed at 500.
moneyball_df_model_imputed <- mice(
  data = moneyball_df_model,
  method = "pmm",
  m = 1,
  maxit = 5,
  seed = 500
)
##
## iter imp variable
## 1 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
moneyball_df_model_imputed_df1 <- mice::complete(moneyball_df_model_imputed, 1)
Now, our new dataframe has all the values imputed.
Starting with the model preparation.
# Baseline linear model: TARGET_WINS regressed on all fourteen predictors of the
# imputed data set.
g1 <- lm(
  formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_df_model_imputed_df1
)
summary(g1)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
## data = moneyball_df_model_imputed_df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44.922 -8.263 0.178 8.174 52.280
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.4081227 5.1272681 7.491 9.75e-14 ***
## TEAM_BATTING_H 0.0414999 0.0035330 11.747 < 2e-16 ***
## TEAM_BATTING_2B -0.0183003 0.0088447 -2.069 0.03865 *
## TEAM_BATTING_3B 0.0248417 0.0163409 1.520 0.12860
## TEAM_BATTING_HR 0.0660225 0.0262241 2.518 0.01188 *
## TEAM_BATTING_BB 0.0136857 0.0055513 2.465 0.01376 *
## TEAM_BATTING_SO -0.0186740 0.0024785 -7.534 7.05e-14 ***
## TEAM_BASERUN_SB 0.0528750 0.0049419 10.699 < 2e-16 ***
## TEAM_BASERUN_CS 0.0009239 0.0099836 0.093 0.92627
## TEAM_PITCHING_H 0.0015578 0.0003815 4.084 4.59e-05 ***
## TEAM_PITCHING_HR 0.0204354 0.0232575 0.879 0.37968
## TEAM_PITCHING_BB -0.0041764 0.0039721 -1.051 0.29318
## TEAM_PITCHING_SO 0.0027716 0.0008819 3.143 0.00169 **
## TEAM_FIELDING_E -0.0427548 0.0026355 -16.223 < 2e-16 ***
## TEAM_FIELDING_DP -0.1199647 0.0125473 -9.561 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.5 on 2261 degrees of freedom
## Multiple R-squared: 0.3741, Adjusted R-squared: 0.3702
## F-statistic: 96.51 on 14 and 2261 DF, p-value: < 2.2e-16
Based on the p-values of the individual coefficients, we remove three non-significant features for now: TEAM_BATTING_3B, TEAM_BASERUN_CS and TEAM_PITCHING_HR.
# Reduced linear model: the same specification as g1 without TEAM_BATTING_3B,
# TEAM_BASERUN_CS and TEAM_PITCHING_HR.
g2 <- lm(
  formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB +
    TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_df_model_imputed_df1
)
summary(g2)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP, data = moneyball_df_model_imputed_df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.472 -8.292 0.244 8.177 51.955
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.5501936 4.9975315 7.514 8.22e-14 ***
## TEAM_BATTING_H 0.0438966 0.0032872 13.354 < 2e-16 ***
## TEAM_BATTING_2B -0.0200199 0.0087412 -2.290 0.022095 *
## TEAM_BATTING_HR 0.0828032 0.0088573 9.349 < 2e-16 ***
## TEAM_BATTING_BB 0.0117561 0.0050242 2.340 0.019375 *
## TEAM_BATTING_SO -0.0189410 0.0024231 -7.817 8.24e-15 ***
## TEAM_BASERUN_SB 0.0543727 0.0041049 13.246 < 2e-16 ***
## TEAM_PITCHING_H 0.0014493 0.0003755 3.859 0.000117 ***
## TEAM_PITCHING_BB -0.0020173 0.0034284 -0.588 0.556312
## TEAM_PITCHING_SO 0.0025479 0.0008336 3.056 0.002266 **
## TEAM_FIELDING_E -0.0425251 0.0026301 -16.168 < 2e-16 ***
## TEAM_FIELDING_DP -0.1216175 0.0123987 -9.809 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.5 on 2264 degrees of freedom
## Multiple R-squared: 0.3731, Adjusted R-squared: 0.37
## F-statistic: 122.5 on 11 and 2264 DF, p-value: < 2.2e-16