## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(rmarkdown)
lahman_data = read.csv("/Users/anuragreddy/Desktop/Statistics with R/Lahmans Databse .csv")

Lets calculate the On Base Percentage (OBP):

OBP - It measures the ability of the player or team in reaching the base safely.

Lets calculate the OBP for each team every year from 2000-2022.

OBP_DF <- lahman_data |>
  group_by(franchID) |>
  mutate(OBP = round((H+BB+HBP)/(AB+BB+HBP+SF),2))|>
  select(yearID,franchID,OBP)|>
  arrange(desc(OBP))
head(OBP_DF)

## # A tibble: 6 × 3
## # Groups:   franchID [6]
##   yearID franchID   OBP
##    <int> <chr>    <dbl>
## 1   2000 CLE       0.37
## 2   2007 NYY       0.37
## 3   2000 CHW       0.36
## 4   2000 OAK       0.36
## 5   2000 SEA       0.36
## 6   2000 COL       0.36

Lets create a categorical variable for OBP - to divide the teams into Poor, Average and Excellent base movers.

OBP performance criteria:

(0,0.3] -> Poor, (0.3-0.34] -> Average, (0.34-inf] -> Excelent

OBP_DF$Performance <- cut(OBP_DF$OBP,breaks = c(0,0.30,0.34,Inf),labels = c("Poor","Average","Excellent"))
head(OBP_DF)

## # A tibble: 6 × 4
## # Groups:   franchID [6]
##   yearID franchID   OBP Performance
##    <int> <chr>    <dbl> <fct>      
## 1   2000 CLE       0.37 Excellent  
## 2   2007 NYY       0.37 Excellent  
## 3   2000 CHW       0.36 Excellent  
## 4   2000 OAK       0.36 Excellent  
## 5   2000 SEA       0.36 Excellent  
## 6   2000 COL       0.36 Excellent

set.seed(123)
#Randomly select an year.
year <- sample(OBP_DF$yearID,1)

OBP_DF |>
  filter(yearID==year)|>
  ggplot(aes(x=OBP,y=franchID,color=Performance))+
  geom_point()+
  scale_x_continuous(breaks = c(0,0.3,0.34,0.4))+
  labs(x="On Base Pecentage",y="Teams",title=paste("Teams performance based on OBP in",year,"season"))+
  scale_colour_brewer(palette = "Dark2") +
  #scale_y_discrete(expand = expansion(mult = c(-1, -1)))+
  theme_classic()

Interpretation: In year 2011 Boston Red Sox is the only team to have OBP more than 0.34 (Excellent base movers category). Majority of the teams have fallen (0.30,0.34] Average category.

Lets calculate the Slugging Percentage (SLG):

SLG - It measures a player’s or team’s power and ability to hit for extra bases.

Lets calculate the SLG for each team every year from 2000-2022.

SLG_DF <- lahman_data |>
  group_by(franchID) |>
  mutate(SLG = round((X2B+2*X3B+3*HR+H)/(AB),2))|>
  select(yearID,franchID,SLG)|>
  arrange(desc(SLG))
head(SLG_DF)

## # A tibble: 6 × 3
## # Groups:   franchID [5]
##   yearID franchID   SLG
##    <int> <chr>    <dbl>
## 1   2019 HOU       0.5 
## 2   2003 BOS       0.49
## 3   2019 MIN       0.49
## 4   2019 NYY       0.49
## 5   2000 HOU       0.48
## 6   2001 COL       0.48

Lets create a categorical variable for SLG - to divide the teams into Poor, Average and Excellent power hitters.

SLG performance criteria:

(0,0.4] -> Poor Hitters.

(0.4,inf] -> Excellent Hitters.

SLG_DF$Performance <- cut(SLG_DF$SLG,breaks = c(0,0.4,Inf),labels = c("Poor","Excellent"))
head(SLG_DF)

## # A tibble: 6 × 4
## # Groups:   franchID [5]
##   yearID franchID   SLG Performance
##    <int> <chr>    <dbl> <fct>      
## 1   2019 HOU       0.5  Excellent  
## 2   2003 BOS       0.49 Excellent  
## 3   2019 MIN       0.49 Excellent  
## 4   2019 NYY       0.49 Excellent  
## 5   2000 HOU       0.48 Excellent  
## 6   2001 COL       0.48 Excellent

set.seed(234)
#Randomly select an year.
year <- sample(SLG_DF$yearID,1)
#Randomly select 10 teams
franch <- sample(SLG_DF$franchID,10,replace = FALSE)

SLG_DF |>
  filter(yearID==year & franchID %in% franch)|>
  ggplot(aes(x=franchID,y= SLG, fill=Performance))+
  geom_bar(stat = 'identity')+
  labs(x="Slugging Pecentage",y="Teams",title=paste("Teams performance based on SLG in",year,"season"))+
  scale_colour_brewer(palette = "Dark2")+
  facet_wrap(~Performance,scales = "free_x")+
  theme_classic()

Interpretation: I have randomly selected year and teams (10), so that the visualization can be understood clearly. The above visualization speaks about the teams being divided into poor and excellent hitters based on slugging percentage. Cleveland Guardians is the team with highest slugging percentage.

Lets calculate the OPS (OBP+SLG):

OPS - It measures a player’s or team’s power and ability to hit for extra bases and reaching base safely.

Lets calculate the OPS for each team every year from 2000-2022.

OPS_DF <- merge(OBP_DF,SLG_DF,by.x=c("yearID","franchID"),by.y=c("yearID","franchID"))
OPS_DF <- OPS_DF|>
  select(yearID,franchID,OBP,SLG)|>
  mutate(OPS = OBP+SLG)|>
  arrange(desc(OPS))
head(OPS_DF)

##   yearID franchID  OBP  SLG  OPS
## 1   2003      BOS 0.36 0.49 0.85
## 2   2019      HOU 0.35 0.50 0.85
## 3   2000      CLE 0.37 0.47 0.84
## 4   2000      HOU 0.36 0.48 0.84
## 5   2009      NYY 0.36 0.48 0.84
## 6   2007      NYY 0.37 0.46 0.83

OPS_DF$Performance <- cut(OPS_DF$OPS,breaks = c(0,0.741,Inf),labels = c("Poor","Excellent"))
head(OPS_DF)

##   yearID franchID  OBP  SLG  OPS Performance
## 1   2003      BOS 0.36 0.49 0.85   Excellent
## 2   2019      HOU 0.35 0.50 0.85   Excellent
## 3   2000      CLE 0.37 0.47 0.84   Excellent
## 4   2000      HOU 0.36 0.48 0.84   Excellent
## 5   2009      NYY 0.36 0.48 0.84   Excellent
## 6   2007      NYY 0.37 0.46 0.83   Excellent

set.seed(345)
#Randomly select an year.
year <- sample(OPS_DF$yearID,1)

OPS_DF |>
  filter(yearID==year)|>
  ggplot(aes(x=OPS,y=franchID,color=Performance))+
  geom_point()+
  labs(x="OPS - (On Base + Slugging) %",y="Teams",title=paste("Teams performance based on OPS in",year,"season"))+
  scale_colour_brewer(palette = "Dark2")+
  theme_classic()

Interpretation: Similar to above visualizations, I have shown the visualization between OPS performance team wise for 2015 season.

Response Variable: Winning Percentage.

lets add winning percentage, OBP, SLG, OPS columns to our original data set and analyse which explanatory variable (OBP, SLG, OPS) is highly correlated with response variable (Winning Percentage).

2020 season is excluded because due to covid pandemic the games have been stopped.

dummy_df <- lahman_data |>
  filter(yearID!=2020)|>
  mutate(OBP = round((H+BB+HBP)/(AB+BB+HBP+SF),3),
         SLG = round((X2B+2*X3B+3*HR+H)/(AB),3),
         OPS = round((OBP+SLG),3),
         `Win%` = round(100*W/(W+L),2))|>
  select(`Win%` ,OBP,SLG,OPS)
head(dummy_df)

##    Win%   OBP   SLG   OPS
## 1 50.62 0.352 0.472 0.824
## 2 45.68 0.341 0.435 0.776
## 3 52.47 0.341 0.423 0.764
## 4 58.64 0.356 0.470 0.826
## 5 55.56 0.367 0.470 0.837
## 6 48.77 0.343 0.438 0.781

Correlation between OBP, SLG, OPS and Win%

Correlation <- numeric()

for(i in length(dummy_df)-1){
  Cor_val <-round(cor(dummy_df$`Win%`,dummy_df[i+1]),2)
  Correlation <- c(Correlation, Cor_val)
}

Correlation_df <- data.frame(Explanatory_var = c("OBP","SLG","OPS"), 
                            Response_Var = c("Win%","Win%","Win%"),
                            Correlation_r = Correlation)
Correlation_df

##   Explanatory_var Response_Var Correlation_r
## 1             OBP         Win%          0.53
## 2             SLG         Win%          0.53
## 3             OPS         Win%          0.53

Interpretation: After observing that all three explanatory variables (OBP, SLG, OPS) are positively correlated with the team’s winning percentage, I found this result surprising. Upon further consideration, it occurred to me that using the total number of wins as the response variable might be more appropriate. The total number of wins is a more straightforward and interpretable measure of success.

Lets visualize the correlation plot between the explanatory variables and total no.of wins

library(patchwork)
library(cowplot)

## 
## Attaching package: 'cowplot'

## The following object is masked from 'package:patchwork':
## 
##     align_plots

## The following object is masked from 'package:lubridate':
## 
##     stamp

lahman_wins <- lahman_data|>
  filter(yearID!=2020) |>
  select(W)


`Win%vsOBP` <- dummy_df |>
  ggplot(aes(x = OBP, y = lahman_wins$W))+
  geom_point(color='red')+
  geom_smooth(method='lm',formula = y~x,color='black')+
  labs(title = "Wins vs OBP",y="Wins")+
  annotate("text", x=0.3, y=115, label = paste("r:", round(cor(dummy_df$OBP,lahman_wins$W),2)))+
  theme_classic()

`Win%vsSLG` <- dummy_df |>
  ggplot(aes(x = SLG, y = lahman_wins$W))+
  geom_point(color='red')+
  geom_smooth(method='lm',formula = y~x,color='black')+
  labs(title = "Wins vs SLG",y="Wins")+
  annotate("text", x=0.35, y=110, label = paste("r:", round(cor(dummy_df$SLG,lahman_wins$W),2)))+
  theme_classic()

`Win%vsOPS` <- dummy_df |>
  ggplot(aes(x = OPS, y = lahman_wins$W))+
  geom_point(color='red')+
  geom_smooth(method='lm',formula = y~x,color='black')+
  labs(title = "Wins vs OPS",y="Wins")+
  annotate("text", x=0.65, y=110, label = paste("r:", round(cor(dummy_df$OPS,lahman_wins$W),2)))+
  theme_classic()

plot_grid(`Win%vsOBP`,`Win%vsSLG`,`Win%vsOPS`,ncol=2)

Interpretation: This result persists across the various offensive indicators, signifying that teams with higher On-Base Percentage (OBP), Slugging Percentage (SLG), and On-Base Plus Slugging (OPS) tend to achieve a greater number of wins. The correlations are almost equal - 0.5.

Confidence intervals of total no.of wins using bootstrapping SE

steps:

Create a bootstrapped means from total no.of wins data.

set.seed(345)
CF <- 0.95
n <- 10^2

bootstrap <- function (data, func=mean, n=n) {
  func_values <- c(NULL)
  
  for (i in 1:n) {
    
    data_sample <- sample(data, size = length(data), replace = TRUE)
    
    func_values <- c(func_values, func(data_sample))
  }
  
  return(func_values)
}

Wins_means <- bootstrap(lahman_wins$W, mean, n)
head(Wins_means)

## [1] 80.76032 80.25397 80.92540 81.00000 82.17302 81.21746

As we don’t have the population standard deviation used Standard Error - standard deviation of sample means.
Find the 95% confidence interval for true population mean.

mean

True_sample_mean <- mean(lahman_wins$W)
wins_se <- sd(Wins_means) 

t_star <- qt(p = (1 - CF)/2, df= n - 1, lower.tail=FALSE)

lower <- True_sample_mean - t_star*wins_se
upper <- True_sample_mean + t_star*wins_se

Wins_Conf <- c(lower, upper)
Wins_Conf

## [1] 80.00470 81.93181

Interpretation: The 95% confidence interval for true population mean for wins is between [80,81.9]. Lets interpret the above results in visualization form.

Low_val <- Wins_means - t_star*wins_se
Upp_val <- Wins_means + t_star*wins_se

Non_Conf <- (Low_val < True_sample_mean) & (True_sample_mean < Upp_val)


ggplot()+
  geom_segment(aes(x=Low_val, xend = Upp_val,y=1:length(Wins_means),yend=1:length(Wins_means),color=Non_Conf))+
  geom_vline(xintercept = True_sample_mean, color = "orange")+
  scale_color_manual(values = c("darkgray", "red"), 
                     breaks = c(TRUE, FALSE)) +
  annotate("text",label=paste("True Mean Wins:",round(True_sample_mean,2)),x=True_sample_mean,y=n+10)+
  theme_classic()

Interpretation: Out of 100 sampled confidence intervals for total wins, only 4 C.I doesn’t contain our true sample mean. We can also find the C.I using z statistic instead of using SE. Standard Deviation of the total wins from original data divided by square root n and SE of samples mean wins will be almost similar.

Wins_SD <- sd(lahman_wins$W)
paste(round(Wins_SD/sqrt(length(lahman_wins$W)),2),round(wins_se,2))

## [1] "0.47 0.49"

Future questions: In the upcoming weeks, my focus will be on investigating alternative performance metrics to identify those that exhibit higher correlation with the total number of wins compared to the existing variables (OBP, SLG, OPS).

Week 6 Data Dive

Anurag

2024-02-17

Table Of Contents:

- Dividing the teams into categories based on

1. OBP performance.

2. SLG performance.

3. OPS (OBP+SLG) performance.

- Response variable - Winning percentage (Win_Per)

1. OBP vs Win_Per

2. SLG vs Win_Per

3. OPS vs Win_Per

-Response Variable - Wins (W)

- Confidence Interval of W using Bootstrapping