library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr 1.1.4 âś” readr 2.1.4
## âś” forcats 1.0.0 âś” stringr 1.5.1
## âś” ggplot2 3.4.4 âś” tibble 3.2.1
## âś” lubridate 1.9.3 âś” tidyr 1.3.0
## âś” purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(rmarkdown)
lahman_data = read.csv("/Users/anuragreddy/Desktop/Statistics with R/Lahmans Databse .csv")
Lets calculate the OBP for each team every year from 2000-2022.
OBP_DF <- lahman_data |>
group_by(franchID) |>
mutate(OBP = round((H+BB+HBP)/(AB+BB+HBP+SF),2))|>
select(yearID,franchID,OBP)|>
arrange(desc(OBP))
head(OBP_DF)
## # A tibble: 6 Ă— 3
## # Groups: franchID [6]
## yearID franchID OBP
## <int> <chr> <dbl>
## 1 2000 CLE 0.37
## 2 2007 NYY 0.37
## 3 2000 CHW 0.36
## 4 2000 OAK 0.36
## 5 2000 SEA 0.36
## 6 2000 COL 0.36
Lets create a categorical variable for OBP - to divide the teams into Poor, Average and Excellent base movers.
OBP performance criteria:
(0,0.3] -> Poor, (0.3-0.34] -> Average, (0.34-inf] -> Excelent
OBP_DF$Performance <- cut(OBP_DF$OBP,breaks = c(0,0.30,0.34,Inf),labels = c("Poor","Average","Excellent"))
head(OBP_DF)
## # A tibble: 6 Ă— 4
## # Groups: franchID [6]
## yearID franchID OBP Performance
## <int> <chr> <dbl> <fct>
## 1 2000 CLE 0.37 Excellent
## 2 2007 NYY 0.37 Excellent
## 3 2000 CHW 0.36 Excellent
## 4 2000 OAK 0.36 Excellent
## 5 2000 SEA 0.36 Excellent
## 6 2000 COL 0.36 Excellent
set.seed(123)
#Randomly select an year.
year <- sample(OBP_DF$yearID,1)
OBP_DF |>
filter(yearID==year)|>
ggplot(aes(x=OBP,y=franchID,color=Performance))+
geom_point()+
scale_x_continuous(breaks = c(0,0.3,0.34,0.4))+
labs(x="On Base Pecentage",y="Teams",title=paste("Teams performance based on OBP in",year,"season"))+
scale_colour_brewer(palette = "Dark2") +
#scale_y_discrete(expand = expansion(mult = c(-1, -1)))+
theme_classic()
Lets calculate the SLG for each team every year from 2000-2022.
SLG_DF <- lahman_data |>
group_by(franchID) |>
mutate(SLG = round((X2B+2*X3B+3*HR+H)/(AB),2))|>
select(yearID,franchID,SLG)|>
arrange(desc(SLG))
head(SLG_DF)
## # A tibble: 6 Ă— 3
## # Groups: franchID [5]
## yearID franchID SLG
## <int> <chr> <dbl>
## 1 2019 HOU 0.5
## 2 2003 BOS 0.49
## 3 2019 MIN 0.49
## 4 2019 NYY 0.49
## 5 2000 HOU 0.48
## 6 2001 COL 0.48
Lets create a categorical variable for SLG - to divide the teams into Poor, Average and Excellent power hitters.
SLG performance criteria:
(0,0.4] -> Poor Hitters.
(0.4,inf] -> Excellent Hitters.
SLG_DF$Performance <- cut(SLG_DF$SLG,breaks = c(0,0.4,Inf),labels = c("Poor","Excellent"))
head(SLG_DF)
## # A tibble: 6 Ă— 4
## # Groups: franchID [5]
## yearID franchID SLG Performance
## <int> <chr> <dbl> <fct>
## 1 2019 HOU 0.5 Excellent
## 2 2003 BOS 0.49 Excellent
## 3 2019 MIN 0.49 Excellent
## 4 2019 NYY 0.49 Excellent
## 5 2000 HOU 0.48 Excellent
## 6 2001 COL 0.48 Excellent
set.seed(234)
#Randomly select an year.
year <- sample(SLG_DF$yearID,1)
#Randomly select 10 teams
franch <- sample(SLG_DF$franchID,10,replace = FALSE)
SLG_DF |>
filter(yearID==year & franchID %in% franch)|>
ggplot(aes(x=franchID,y= SLG, fill=Performance))+
geom_bar(stat = 'identity')+
labs(x="Slugging Pecentage",y="Teams",title=paste("Teams performance based on SLG in",year,"season"))+
scale_colour_brewer(palette = "Dark2")+
facet_wrap(~Performance,scales = "free_x")+
theme_classic()
Lets calculate the OPS for each team every year from 2000-2022.
OPS_DF <- merge(OBP_DF,SLG_DF,by.x=c("yearID","franchID"),by.y=c("yearID","franchID"))
OPS_DF <- OPS_DF|>
select(yearID,franchID,OBP,SLG)|>
mutate(OPS = OBP+SLG)|>
arrange(desc(OPS))
head(OPS_DF)
## yearID franchID OBP SLG OPS
## 1 2003 BOS 0.36 0.49 0.85
## 2 2019 HOU 0.35 0.50 0.85
## 3 2000 CLE 0.37 0.47 0.84
## 4 2000 HOU 0.36 0.48 0.84
## 5 2009 NYY 0.36 0.48 0.84
## 6 2007 NYY 0.37 0.46 0.83
OPS_DF$Performance <- cut(OPS_DF$OPS,breaks = c(0,0.741,Inf),labels = c("Poor","Excellent"))
head(OPS_DF)
## yearID franchID OBP SLG OPS Performance
## 1 2003 BOS 0.36 0.49 0.85 Excellent
## 2 2019 HOU 0.35 0.50 0.85 Excellent
## 3 2000 CLE 0.37 0.47 0.84 Excellent
## 4 2000 HOU 0.36 0.48 0.84 Excellent
## 5 2009 NYY 0.36 0.48 0.84 Excellent
## 6 2007 NYY 0.37 0.46 0.83 Excellent
set.seed(345)
#Randomly select an year.
year <- sample(OPS_DF$yearID,1)
OPS_DF |>
filter(yearID==year)|>
ggplot(aes(x=OPS,y=franchID,color=Performance))+
geom_point()+
labs(x="OPS - (On Base + Slugging) %",y="Teams",title=paste("Teams performance based on OPS in",year,"season"))+
scale_colour_brewer(palette = "Dark2")+
theme_classic()
lets add winning percentage, OBP, SLG, OPS columns to our original data set and analyse which explanatory variable (OBP, SLG, OPS) is highly correlated with response variable (Winning Percentage).
2020 season is excluded because due to covid pandemic the games have been stopped.
dummy_df <- lahman_data |>
filter(yearID!=2020)|>
mutate(OBP = round((H+BB+HBP)/(AB+BB+HBP+SF),3),
SLG = round((X2B+2*X3B+3*HR+H)/(AB),3),
OPS = round((OBP+SLG),3),
`Win%` = round(100*W/(W+L),2))|>
select(`Win%` ,OBP,SLG,OPS)
head(dummy_df)
## Win% OBP SLG OPS
## 1 50.62 0.352 0.472 0.824
## 2 45.68 0.341 0.435 0.776
## 3 52.47 0.341 0.423 0.764
## 4 58.64 0.356 0.470 0.826
## 5 55.56 0.367 0.470 0.837
## 6 48.77 0.343 0.438 0.781
Correlation <- numeric()
for(i in length(dummy_df)-1){
Cor_val <-round(cor(dummy_df$`Win%`,dummy_df[i+1]),2)
Correlation <- c(Correlation, Cor_val)
}
Correlation_df <- data.frame(Explanatory_var = c("OBP","SLG","OPS"),
Response_Var = c("Win%","Win%","Win%"),
Correlation_r = Correlation)
Correlation_df
## Explanatory_var Response_Var Correlation_r
## 1 OBP Win% 0.53
## 2 SLG Win% 0.53
## 3 OPS Win% 0.53
Interpretation: After observing that all three explanatory variables (OBP, SLG, OPS) are positively correlated with the team’s winning percentage, I found this result surprising. Upon further consideration, it occurred to me that using the total number of wins as the response variable might be more appropriate. The total number of wins is a more straightforward and interpretable measure of success.
Lets visualize the correlation plot between the explanatory variables and total no.of wins
library(patchwork)
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
##
## align_plots
## The following object is masked from 'package:lubridate':
##
## stamp
lahman_wins <- lahman_data|>
filter(yearID!=2020) |>
select(W)
`Win%vsOBP` <- dummy_df |>
ggplot(aes(x = OBP, y = lahman_wins$W))+
geom_point(color='red')+
geom_smooth(method='lm',formula = y~x,color='black')+
labs(title = "Wins vs OBP",y="Wins")+
annotate("text", x=0.3, y=115, label = paste("r:", round(cor(dummy_df$OBP,lahman_wins$W),2)))+
theme_classic()
`Win%vsSLG` <- dummy_df |>
ggplot(aes(x = SLG, y = lahman_wins$W))+
geom_point(color='red')+
geom_smooth(method='lm',formula = y~x,color='black')+
labs(title = "Wins vs SLG",y="Wins")+
annotate("text", x=0.35, y=110, label = paste("r:", round(cor(dummy_df$SLG,lahman_wins$W),2)))+
theme_classic()
`Win%vsOPS` <- dummy_df |>
ggplot(aes(x = OPS, y = lahman_wins$W))+
geom_point(color='red')+
geom_smooth(method='lm',formula = y~x,color='black')+
labs(title = "Wins vs OPS",y="Wins")+
annotate("text", x=0.65, y=110, label = paste("r:", round(cor(dummy_df$OPS,lahman_wins$W),2)))+
theme_classic()
plot_grid(`Win%vsOBP`,`Win%vsSLG`,`Win%vsOPS`,ncol=2)
steps:
set.seed(345)
CF <- 0.95
n <- 10^2
bootstrap <- function (data, func=mean, n=n) {
func_values <- c(NULL)
for (i in 1:n) {
data_sample <- sample(data, size = length(data), replace = TRUE)
func_values <- c(func_values, func(data_sample))
}
return(func_values)
}
Wins_means <- bootstrap(lahman_wins$W, mean, n)
head(Wins_means)
## [1] 80.76032 80.25397 80.92540 81.00000 82.17302 81.21746
mean
True_sample_mean <- mean(lahman_wins$W)
wins_se <- sd(Wins_means)
t_star <- qt(p = (1 - CF)/2, df= n - 1, lower.tail=FALSE)
lower <- True_sample_mean - t_star*wins_se
upper <- True_sample_mean + t_star*wins_se
Wins_Conf <- c(lower, upper)
Wins_Conf
## [1] 80.00470 81.93181
Interpretation: The 95% confidence interval for true population mean for wins is between [80,81.9]. Lets interpret the above results in visualization form.
Low_val <- Wins_means - t_star*wins_se
Upp_val <- Wins_means + t_star*wins_se
Non_Conf <- (Low_val < True_sample_mean) & (True_sample_mean < Upp_val)
ggplot()+
geom_segment(aes(x=Low_val, xend = Upp_val,y=1:length(Wins_means),yend=1:length(Wins_means),color=Non_Conf))+
geom_vline(xintercept = True_sample_mean, color = "orange")+
scale_color_manual(values = c("darkgray", "red"),
breaks = c(TRUE, FALSE)) +
annotate("text",label=paste("True Mean Wins:",round(True_sample_mean,2)),x=True_sample_mean,y=n+10)+
theme_classic()
Wins_SD <- sd(lahman_wins$W)
paste(round(Wins_SD/sqrt(length(lahman_wins$W)),2),round(wins_se,2))
## [1] "0.47 0.49"