library(ggrepel)
## Loading required package: ggplot2
library(boot)
library(broom)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(broom)
library(lindia)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:boot':
##
## logit
library(psych)
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:car':
##
## logit
##
## The following object is masked from 'package:boot':
##
## logit
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(gganimate)
lahman_data = read.csv("/Users/anuragreddy/Desktop/Statistics with R/teams.csv")
lahman_data <- lahman_data |>
filter(yearID != 2020)
Note: 2020 seasonis removed due to unusual format because of covid pandemic.
numeric_lahman <- lahman_data[, sapply(lahman_data, is.numeric)]
describe(numeric_lahman)
## vars n mean sd median trimmed mad
## yearID 1 660 2010.59 6.50 2010.50 2010.50 8.15
## Rank 2 660 3.01 1.45 3.00 2.99 1.48
## G 3 660 161.96 0.30 162.00 162.00 0.00
## Ghome 4 660 80.96 0.50 81.00 81.00 0.00
## W 5 660 80.97 12.04 81.00 81.18 13.34
## L 6 660 80.97 12.02 81.00 80.78 13.34
## R 7 660 738.22 82.67 734.50 735.84 82.28
## AB 8 660 5529.27 84.39 5530.50 5529.06 80.80
## H 9 660 1427.12 88.60 1427.00 1425.96 96.37
## X2B 10 660 285.00 28.66 284.00 284.43 26.69
## X3B 11 660 28.96 9.08 29.00 28.59 8.90
## HR 12 660 174.60 36.83 172.00 173.59 38.55
## BB 13 660 521.02 69.39 519.50 519.21 71.91
## SO 14 660 1184.30 169.43 1170.00 1179.41 189.03
## SB 15 660 90.76 29.80 88.00 89.15 31.13
## CS 16 660 35.57 11.04 34.00 35.00 10.38
## HBP 17 660 58.19 14.50 56.00 57.32 13.34
## SF 18 660 43.30 8.60 42.00 42.92 8.90
## RA 19 660 738.22 89.54 732.00 736.24 93.40
## ER 20 660 679.23 83.72 675.00 677.44 86.73
## ERA 21 660 4.24 0.54 4.20 4.22 0.57
## CG 22 660 4.34 3.34 4.00 3.98 2.97
## SHO 23 660 9.50 3.96 9.00 9.30 4.45
## SV 24 660 40.77 7.31 41.00 40.66 7.41
## IPouts 25 660 4332.29 43.62 4331.00 4332.88 40.03
## HA 26 660 1427.12 98.18 1427.50 1428.18 97.11
## HRA 27 660 174.60 29.41 173.00 173.38 28.17
## BBA 28 660 521.02 64.50 519.50 519.53 64.49
## SOA 29 660 1184.30 168.81 1170.50 1176.12 177.17
## E 30 660 99.13 16.48 99.00 98.63 16.31
## DP 31 660 145.90 18.67 145.00 145.92 17.79
## FP 32 660 0.98 0.00 0.98 0.98 0.00
## attendance 33 660 2381390.55 714355.52 2352420.00 2379952.55 811743.52
## BPF 34 660 100.13 5.27 100.00 99.83 4.45
## PPF 35 660 100.14 5.20 100.00 99.84 4.45
## min max range skew kurtosis se
## yearID 2000.00 2022.00 22.00 0.07 -1.13 0.25
## Rank 1.00 6.00 5.00 0.09 -1.18 0.06
## G 161.00 163.00 2.00 -1.01 7.35 0.01
## Ghome 77.00 84.00 7.00 -1.51 23.98 0.02
## W 43.00 116.00 73.00 -0.15 -0.46 0.47
## L 46.00 119.00 73.00 0.15 -0.45 0.47
## R 513.00 978.00 465.00 0.24 -0.15 3.22
## AB 5210.00 5770.00 560.00 -0.02 0.08 3.28
## H 1147.00 1667.00 520.00 0.09 -0.21 3.45
## X2B 201.00 376.00 175.00 0.21 0.09 1.12
## X3B 5.00 61.00 56.00 0.44 0.32 0.35
## HR 91.00 307.00 216.00 0.31 -0.14 1.43
## BB 363.00 775.00 412.00 0.26 -0.17 2.70
## SO 805.00 1596.00 791.00 0.24 -0.68 6.60
## SB 19.00 200.00 181.00 0.51 0.00 1.16
## CS 10.00 74.00 64.00 0.53 0.31 0.43
## HBP 26.00 112.00 86.00 0.57 0.24 0.56
## SF 23.00 75.00 52.00 0.48 0.29 0.33
## RA 513.00 981.00 468.00 0.20 -0.35 3.49
## ER 451.00 913.00 462.00 0.19 -0.29 3.26
## ERA 2.80 5.84 3.04 0.21 -0.31 0.02
## CG 0.00 18.00 18.00 1.03 1.05 0.13
## SHO 1.00 23.00 22.00 0.49 0.18 0.15
## SV 22.00 66.00 44.00 0.15 -0.12 0.28
## IPouts 4138.00 4485.00 347.00 -0.18 0.98 1.70
## HA 1107.00 1683.00 576.00 -0.14 0.02 3.82
## HRA 96.00 305.00 209.00 0.48 0.57 1.14
## BBA 348.00 728.00 380.00 0.22 -0.03 2.51
## SOA 764.00 1687.00 923.00 0.40 -0.31 6.57
## E 54.00 145.00 91.00 0.27 -0.16 0.64
## DP 94.00 204.00 110.00 0.02 -0.07 0.73
## FP 0.98 0.99 0.02 -0.27 -0.18 0.00
## attendance 642617.00 4298655.00 3656038.00 0.04 -0.53 27806.25
## BPF 88.00 125.00 37.00 0.84 1.89 0.21
## PPF 88.00 125.00 37.00 0.90 2.19 0.20
Box_Plot <- lahman_data |>
ggplot(aes(x=lgID,y=R,fill=lgID))+
geom_boxplot()+
labs(x='League',y='Runs scored by teams',title="Distribution of Runs Scored by teams in AL and NL")+
theme_economist()
ggplotly(Box_Plot)
Interpretation: By interpreting the box plot we can assume that the average runs scored in American league is higher than that in National League.
Box_Plot <- lahman_data |>
ggplot(aes(x=lgID,y=RA,fill=lgID))+
geom_boxplot()+
labs(x='League',y='Runs allowed by teams',title="Distribution of Runs Allowed by teams in AL and NL")+
theme_economist()
ggplotly(Box_Plot)
Interpretation: By interpreting the box plot we can assume that the average runs allowed in American league is higher than that in National League
lahman_data_numeric <- lahman_data[ ,sapply(lahman_data, is.numeric)]
lahman_data_numeric <- lahman_data_numeric |>
rename(Attd=attendance)|>
select(-Rank,-G,-Ghome,-L,-BPF,-PPF,-yearID)
Corr = cor(lahman_data_numeric,method = "pearson")
correlation_df <- as.data.frame(Corr)
correlation_df['W'] |>
arrange(desc(W))
## W
## W 1.00000000
## SV 0.65976732
## R 0.56918237
## SHO 0.51138277
## Attd 0.47008704
## IPouts 0.46433974
## BB 0.45157782
## HR 0.41278435
## FP 0.34276064
## H 0.32775616
## SOA 0.31741492
## SF 0.29826846
## X2B 0.26495269
## HBP 0.17598557
## AB 0.16919051
## CG 0.14232488
## SB 0.03504478
## X3B -0.10021377
## SO -0.12092976
## CS -0.12833252
## DP -0.20487554
## E -0.33028311
## HRA -0.37964854
## BBA -0.45978457
## HA -0.53820995
## ER -0.66382178
## RA -0.67636261
## ERA -0.67704392
library(ggcorrplot)
corr <- ggcorrplot(Corr,hc.order = TRUE,
type = "lower") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Correlation Plot") +
theme_classic()
ggplotly(corr)
Interpretation: The above dataframe indicates which independent variables are closely correlated with wins. With some knowledge of baseball, I have selected the following variables as independent variables.
Variables Selected: R+RA+AB+ERA+HR+IPouts+Attd+HA+SHO+SV
library(ggplot2)
# Define custom density function
my_density <- function(data, mapping, ...) {
ggplot(data = data, mapping = mapping) +
geom_density(alpha = 0.5, fill = "cornflowerblue", ...)
}
# Define custom scatterplot function
my_scatter <- function(data, mapping, ...) {
ggplot(data = data, mapping = mapping) +
geom_point(alpha = 0.5, color = "cornflowerblue") +
geom_smooth(method = lm, se = FALSE, show.legend = FALSE, ...)
}
# Example data
data <- lahman_data_numeric |>
select(W, R, ERA, Attd, SV)
# Create scatterplot matrix
pairs <- ggpairs(data,
lower = list(continuous = my_scatter),
diag = list(continuous = my_density),
progress = FALSE)
ggplotly(pairs)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Can only have one: highlight
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Can only have one: highlight
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Can only have one: highlight
## Warning: Can only have one: highlight
Interpretation: Show casing the ggpairs plots between the Wins and few other independent variables.
Dependent Variable <- Wins
Independent Variables <- R+RA+H+ERA+HR+Attd+HA+SHO+SV
MLR_model1 <- lm(W~R+RA+ERA+HR+Attd+HA+SHO+SV+AB,data=lahman_data_numeric) |>
step(direction = 'backward', trace =0)
summary(MLR_model1)
##
## Call:
## lm(formula = W ~ R + RA + ERA + Attd + SHO + SV + AB, data = lahman_data_numeric)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8470 -2.0185 0.0385 1.9721 8.8952
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.307e+01 9.517e+00 7.677 5.96e-14 ***
## R 9.064e-02 1.873e-03 48.407 < 2e-16 ***
## RA -5.609e-02 9.292e-03 -6.037 2.63e-09 ***
## ERA -3.063e+00 1.506e+00 -2.034 0.0424 *
## Attd 3.772e-07 1.966e-07 1.919 0.0555 .
## SHO 2.032e-01 4.244e-02 4.788 2.09e-06 ***
## SV 4.098e-01 2.067e-02 19.829 < 2e-16 ***
## AB -4.371e-03 1.848e-03 -2.365 0.0183 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.123 on 652 degrees of freedom
## Multiple R-squared: 0.9334, Adjusted R-squared: 0.9327
## F-statistic: 1306 on 7 and 652 DF, p-value: < 2.2e-16
The intercept (51) is statistically significant (p < 0.05), suggesting that when all predictor variables are zero, the expected value of the response variable is significantly different from zero.
The coefficients for predictors R (Runs), RA (Runs Allowed), SHO (Shutouts), SV (Saves), ERA (Earned Runs Average), AB (At Bats) are all statistically significant (p < 0.05). For instance, a one-unit increase in R is associated with a 0.088 unit increase in the response variable, controlling for other predictors.
Overall, this model explains approximately 93.34% of the variability in the number of wins (Adjusted R-squared = 0.9327), indicating a strong fit. However, it’s important to note that while some variables like runs scored, runs allowed, shutouts, and saves appear to have significant effects on the number of wins, other variables like attendance do not show statistically significant effects in this model.
(VIF is a measure used in regression analysis to quantify how much the variance of an estimated regression coefficient increases if your predictors are correlated. High VIF values indicate high multicollinearity, which can lead to unreliable coefficient estimates.)
vif(MLR_model1)
## R RA ERA Attd SHO SV AB
## 1.618830 46.759013 44.989702 1.332814 1.905306 1.543260 1.643200
paste("Correlation between RA and ERA", cor(lahman_data$ERA,lahman_data$RA))
## [1] "Correlation between RA and ERA 0.988224732424427"
Interpretation: A rule of thumb is to keep each VIF below 10. In the above output, ERA and RA have values of 86.15 and 74.93, respectively. Technically speaking, RA and ERA measure the runs scored by the opponents, and they are highly correlated. It is better to remove either one of the variables from the independent variables list.
MLR_model2 <- lm(W~R+RA+SHO+SV+IPouts+AB,data=lahman_data_numeric) |>
step(direction = 'backward', trace =0)
summary(MLR_model2)
##
## Call:
## lm(formula = W ~ R + RA + SHO + SV + IPouts + AB, data = lahman_data_numeric)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9247 -2.1280 0.0897 2.0253 9.2901
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.691335 14.001191 0.978 0.329
## R 0.093685 0.001857 50.452 < 2e-16 ***
## RA -0.070289 0.002335 -30.102 < 2e-16 ***
## SHO 0.202505 0.041833 4.841 1.62e-06 ***
## SV 0.396055 0.020626 19.202 < 2e-16 ***
## IPouts 0.021186 0.004182 5.066 5.30e-07 ***
## AB -0.010824 0.002332 -4.641 4.19e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.079 on 653 degrees of freedom
## Multiple R-squared: 0.9352, Adjusted R-squared: 0.9346
## F-statistic: 1571 on 6 and 653 DF, p-value: < 2.2e-16
vif(MLR_model2)
## R RA SHO SV IPouts AB
## 1.638348 3.038943 1.905124 1.581730 2.313793 2.692953
Interpretation: In summary, this regression model suggests that the independent variables R, RA, SHO, SV, IPouts and AB are all significantly associated with the number of wins in baseball games. The model has a high R-squared value (0.9352), indicating that it explains a large proportion of the variance in W.
paste("BIC of Model 1",BIC(MLR_model1))
## [1] "BIC of Model 1 3426.71585363651"
paste("BIC of Model 2",BIC(MLR_model2))
## [1] "BIC of Model 2 3402.29040494585"
Interpretation: Comparing the BIC values of the two models, we observe that Model 2 has a lower BIC value (3402.3) compared to Model 1 (3426.7). According to the BIC criterion, lower values indicate a better trade-off between model fit and complexity. Therefore, based on the BIC values alone, Model 2 appears to be a preferable model as it achieves a better balance between goodness of fit and parsimony.
Diagnostic Plots
gg_diagnose(MLR_model2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
ggres <- gg_resfitted(MLR_model2) +
geom_smooth(se=FALSE)+
theme_economist()
ggplotly(ggres)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Interpretation: The points in the plot exhibit a random scatter around the horizontal line at y = 0, it suggests that the assumption of linearity is not violated. In other words, there is no discernible pattern in the residuals as the fitted values change. The residuals should be independent of each other. In other words, there is no pattern or correlation between consecutive residuals.
ggqq <- gg_qqplot(MLR_model2)+
theme_economist()
ggplotly(ggqq)
Interpretation: Normal Q-Q plot shows points closely following the diagonal line, indicating that the data (residuals) is approximately normally distributed
gg_resX(MLR_model2)
Interpretation: The plot helps to assess the assumption of linearity between the predictor variable (X) and the response variable. The points in the above plot exhibiting a random scatter around the horizontal line at y = 0, it suggests that the assumption of linearity is reasonable.
ghist <- gg_reshist(MLR_model2)+
geom_histogram(aes(fill='red'),show.legend = FALSE)+
theme_economist()
ggplotly(ghist)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Interpretation: The above histogram of residuals interprets that the data is almost normally distributed.
Reslev <- gg_resleverage(MLR_model2)+
theme_economist()
ggplotly(Reslev)
## `geom_smooth()` using formula = 'y ~ x'
Interpretation: This plot helps us in identifying the influential points in the dataset. Look for the points which have high leverage and high residual.
Cook’s Distance - Used to identify the influential points.
Cookd <- gg_cooksd(MLR_model2, threshold = 'matlab') +
theme_economist()
ggplotly(Cookd)
Interpretation: A Cook’s Distance plot is a diagnostic tool used in regression analysis to identify influential data points that may disproportionately affect the estimation of regression coefficients.
Higher values of Cook’s Distance indicate greater influence of the corresponding data point on the regression model.
Observations with Cook’s Distance values exceeding the threshold are highlighted in the plot. These points are considered potentially influential and may significantly impact the estimated regression coefficients if removed from the analysis.
High Cook’s Distance values indicate that removing the corresponding observation from the dataset could lead to substantial changes in the estimated regression coefficients.
Lets try to predict the Tampa Bay Rays 2023 season wins using the multiple linear model we built:
Before that lets try to visualize the scoring patterns and wins of the Tampa Bay Rays.
anim <- lahman_data |>
filter(franchID=='TBD' | franchID == 'NYY')|>
ggplot(aes(x=yearID,y=R,color=franchID))+
geom_line()+
geom_point()+
labs(title = "Runs Scoring pattern from 2000-2022 season",x="Season",y="Runs Scored")+
transition_reveal(yearID)+
theme_economist()
anim
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
Team | Season | At Bats | Runs Scored | Runs Allowed | Shutouts | Saves | IPouts |
---|---|---|---|---|---|---|---|
Tampa Bay Rays | 2023 | 5511 | 860 | 665 | 14 | 45 | 4326 |
Predicted Wins=13.691335+0.093685×R−0.070289×RA+0.202505×SHO+0.396055×SV+0.021186×IPouts−0.010824×AB
Predicted Wins = 96.7.
Actual Wins = 99.
The present study delves into the realm of baseball, with a focus on predicting Major League Baseball wins through the utilization of a Multiple Linear Regression model. The primary aim of this endeavor is to provide teams and managers with insights into the key variables that significantly influence game outcomes.
It has been determined that among various game-related statistics, such as Runs Scored, Runs Allowed, Shutouts, Saves, and IPouts, the latter hold significant sway in determining the outcome of matches. While Runs Scored reflects the offensive prowess of a team, Runs Allowed, Shutouts, Saves, and IPouts shed light on defensive and pitching capabilities.
The constructed model demonstrates an impressive explanatory power, accounting for 93.5% of the variance in wins, utilizing the aforementioned game-related statistics.
Moving forward, there exists ample opportunity for further exploration within the domain of baseball analytics, also known as Sabermetrics. While the current study primarily relies on box-score related statistics, the burgeoning advancements in sports technology present a wealth of possibilities for enhancing our understanding of the game and refining the accuracy and efficacy of predictions.