Week-3: Data Dive

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Absolute path to the postseason pitching CSV used throughout this analysis.
Data_set <- "/Users/ba/Documents/IUPUI/Masters/First Sem/Statistics/Dataset/PitchingPost.csv"
# Load the CSV into a base data.frame.
Pitching_Data <- read.csv(file = Data_set)

QUESTION-1: Let’s find out who the best pitcher in World Series (WS) playoff games is by looking at the Runs Allowed statistic.

# Mean runs allowed (R) per pitcher over the World Series rows, rounded to
# two decimals; one output row per playerID.
WS_Pitching_RA <- Pitching_Data |>
  filter(round == "WS") |>
  group_by(playerID) |>
  summarise(Runs_Allowed = round(mean(R), digits = 2))

# Preview the first ten pitchers.
head(WS_Pitching_RA, n = 10)
## # A tibble: 10 × 2
##    playerID  Runs_Allowed
##    <chr>            <dbl>
##  1 abreubr01         0   
##  2 aceveal01         0   
##  3 adamsmi03         1   
##  4 affelje01         0.25
##  5 albural01         1   
##  6 alexasc01         2   
##  7 allenco01         0   
##  8 alvarjo03         4   
##  9 anderbr02         2   
## 10 anderia01         0

Let’s create bins to group the pitchers into three categories (“Best”, “Average”, “Worst”) based on their performance: at most 4 runs is “Best”, more than 4 and up to 7 is “Average”, and more than 7 is “Worst”.

# Bin the mean runs allowed into three ordered categories. The lowest break is
# -0.1 so that a value of exactly 0 falls inside the first (right-closed) bin.
WS_Pitching_RA <- WS_Pitching_RA |>
  mutate(
    Performance = cut(
      Runs_Allowed,
      breaks = c(-0.1, 4, 7, Inf),
      labels = c("Best", "Average", "Worst")
    )
  )
head(WS_Pitching_RA, n = 10)
## # A tibble: 10 × 3
##    playerID  Runs_Allowed Performance
##    <chr>            <dbl> <fct>      
##  1 abreubr01         0    Best       
##  2 aceveal01         0    Best       
##  3 adamsmi03         1    Best       
##  4 affelje01         0.25 Best       
##  5 albural01         1    Best       
##  6 alexasc01         2    Best       
##  7 allenco01         0    Best       
##  8 alvarjo03         4    Best       
##  9 anderbr02         2    Best       
## 10 anderia01         0    Best

Let’s see how likely each of these performance categories is to occur.

# Number of pitchers in each performance category.
WS_Pithing_RA_Prob <- WS_Pitching_RA |>
  count(Performance)

# Share of pitchers per category, rounded to two decimals.
WS_Probability_RA <- mutate(
  WS_Pithing_RA_Prob,
  Probability = round(n / sum(n), digits = 2)
)
WS_Probability_RA
## # A tibble: 3 × 3
##   Performance     n Probability
##   <fct>       <int>       <dbl>
## 1 Best          330        0.87
## 2 Average        41        0.11
## 3 Worst          10        0.03

Since this is the World Series, the “Best” category is by far the most likely one: about 87% of the pitchers fall into it.

Let’s visualize the above stats.

# Bar chart of the category shares; each bar is labelled with its probability
# just above the bar top.
ggplot(
  WS_Probability_RA,
  aes(x = Performance, y = Probability, fill = Performance)
) +
  geom_col() +
  geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
  theme_classic()

Now let’s add these probabilities to the WS_Pitching_RA table.

# Attach each category's count and probability to every pitcher row.
WS_Pitching_RA_tag <- WS_Pitching_RA |>
  left_join(WS_Probability_RA, by = "Performance")

# Display only the columns of interest.
select(WS_Pitching_RA_tag, playerID, Runs_Allowed, Performance, Probability)
## # A tibble: 381 × 4
##    playerID  Runs_Allowed Performance Probability
##    <chr>            <dbl> <fct>             <dbl>
##  1 abreubr01         0    Best               0.87
##  2 aceveal01         0    Best               0.87
##  3 adamsmi03         1    Best               0.87
##  4 affelje01         0.25 Best               0.87
##  5 albural01         1    Best               0.87
##  6 alexasc01         2    Best               0.87
##  7 allenco01         0    Best               0.87
##  8 alvarjo03         4    Best               0.87
##  9 anderbr02         2    Best               0.87
## 10 anderia01         0    Best               0.87
## # ℹ 371 more rows

Let’s visualize this with a box plot:

# Box plot of mean runs allowed within each performance category.
ggplot(
  WS_Pitching_RA_tag,
  aes(x = Performance, y = Runs_Allowed, color = Performance)
) +
  geom_boxplot() +
  theme_classic()

Judging whether a pitcher is good or bad from the Runs Allowed statistic alone is not very reliable, because runs allowed can also result from the performance of the fielders in a game. So let’s move on to another metric to assess performance.

QUESTION-2: Let’s find out who the best pitcher in World Series (WS) playoff games is by looking at Earned Runs (runs for which the pitcher is held accountable, excluding runs scored due to errors, passed balls, or other defensive mistakes).

# Mean earned runs (ER) per pitcher over the World Series rows, rounded to
# two decimals; one output row per playerID.
WS_Pitching_ER <- Pitching_Data |>
  filter(round == "WS") |>
  group_by(playerID) |>
  summarise(Earned_Runs = round(mean(ER), digits = 2))

# Preview the first ten pitchers.
head(WS_Pitching_ER, n = 10)
## # A tibble: 10 × 2
##    playerID  Earned_Runs
##    <chr>           <dbl>
##  1 abreubr01        0   
##  2 aceveal01        0   
##  3 adamsmi03        1   
##  4 affelje01        0.25
##  5 albural01        1   
##  6 alexasc01        2   
##  7 allenco01        0   
##  8 alvarjo03        4   
##  9 anderbr02        2   
## 10 anderia01        0

Let’s create bins to group the pitchers into three categories (“Best”, “Average”, “Worst”) based on their performance: at most 4 earned runs is “Best”, more than 4 and up to 7 is “Average”, and more than 7 is “Worst”.

# Bin the mean earned runs into three ordered categories. The lowest break is
# -0.1 so that a value of exactly 0 falls inside the first (right-closed) bin.
WS_Pitching_ER <- WS_Pitching_ER |>
  mutate(
    Performance = cut(
      Earned_Runs,
      breaks = c(-0.1, 4, 7, Inf),
      labels = c("Best", "Average", "Worst")
    )
  )
head(WS_Pitching_ER, n = 10)
## # A tibble: 10 × 3
##    playerID  Earned_Runs Performance
##    <chr>           <dbl> <fct>      
##  1 abreubr01        0    Best       
##  2 aceveal01        0    Best       
##  3 adamsmi03        1    Best       
##  4 affelje01        0.25 Best       
##  5 albural01        1    Best       
##  6 alexasc01        2    Best       
##  7 allenco01        0    Best       
##  8 alvarjo03        4    Best       
##  9 anderbr02        2    Best       
## 10 anderia01        0    Best

Let’s see how likely each of these categories is to occur.

# Number of pitchers in each earned-runs performance category.
WS_Pithing_ER_Prob <- WS_Pitching_ER |>
  count(Performance)

# Share of pitchers per category, rounded to two decimals.
WS_Probability_ER <- mutate(
  WS_Pithing_ER_Prob,
  Probability = round(n / sum(n), digits = 2)
)
WS_Probability_ER
## # A tibble: 3 × 3
##   Performance     n Probability
##   <fct>       <int>       <dbl>
## 1 Best          336        0.88
## 2 Average        37        0.1 
## 3 Worst           8        0.02
# Bar chart of the earned-runs category shares; each bar is labelled with its
# probability just above the bar top.
ggplot(
  WS_Probability_ER,
  aes(x = Performance, y = Probability, fill = Performance)
) +
  geom_col() +
  geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
  theme_classic()

# Attach each category's count and probability to every pitcher row.
WS_Pitching_ER_tag <- WS_Pitching_ER |>
  left_join(WS_Probability_ER, by = "Performance")

# Display only the columns of interest.
select(WS_Pitching_ER_tag, playerID, Earned_Runs, Performance, Probability)
## # A tibble: 381 × 4
##    playerID  Earned_Runs Performance Probability
##    <chr>           <dbl> <fct>             <dbl>
##  1 abreubr01        0    Best               0.88
##  2 aceveal01        0    Best               0.88
##  3 adamsmi03        1    Best               0.88
##  4 affelje01        0.25 Best               0.88
##  5 albural01        1    Best               0.88
##  6 alexasc01        2    Best               0.88
##  7 allenco01        0    Best               0.88
##  8 alvarjo03        4    Best               0.88
##  9 anderbr02        2    Best               0.88
## 10 anderia01        0    Best               0.88
## # ℹ 371 more rows

A traditional World Series has up to 7 games, and earned runs are totalled over all the games in a series, so with this metric we cannot assess a pitcher’s performance in a single 9-inning game.

QUESTION-3: Let’s find out who the best pitcher in World Series (WS) playoff games is by looking at Earned Run Average (ERA). It is calculated by taking the number of earned runs a pitcher allows, dividing it by the total number of innings pitched, and multiplying the result by 9 (since a standard baseball game consists of 9 innings): $\mathrm{ERA} = 9 \times \mathrm{ER} / \mathrm{IP}$. This gives us a clear indication of how each pitcher performs in a single game.

# Earned Run Average per pitcher over all of their World Series rows.
#
# ERA is defined as 9 * earned runs / innings pitched. IPouts holds outs
# recorded (three outs per inning), so innings pitched = IPouts / 3.
# The earlier version divided by IPouts directly, which understated every
# ERA by a factor of 3.
WS_Pitching_ERA <-
  Pitching_Data |>
  filter(round == "WS") |>
  group_by(playerID) |>
  summarise(
    Earned_Runs = sum(ER),   # total earned runs
    IPOuts = sum(IPouts)     # total outs recorded
  ) |>
  mutate(New_ERA = round((9 * Earned_Runs) / (IPOuts / 3), 2)) |>
  # NOTE(review): IPOuts * 3 is NOT innings pitched (that would be IPOuts / 3).
  # The value is left unchanged here because the `Inning_Played > 63` filter
  # applied later is calibrated to it (it is equivalent to IPOuts > 21, i.e.
  # more than 7 innings). Correct this column and that threshold together.
  mutate(Inning_Played = IPOuts * 3)
WS_Pitching_ERA
## # A tibble: 381 × 5
##    playerID  Earned_Runs IPOuts New_ERA Inning_Played
##    <chr>           <int>  <int>   <dbl>         <dbl>
##  1 abreubr01           0     15    0               45
##  2 aceveal01           0      6    0               18
##  3 adamsmi03           1      6    1.5             18
##  4 affelje01           1     35    0.26           105
##  5 albural01           1      6    1.5             18
##  6 alexasc01           2      4    4.5             12
##  7 allenco01           0     18    0               54
##  8 alvarjo03           4     10    3.6             30
##  9 anderbr02           2     16    1.12            48
## 10 anderia01           0     15    0               45
## # ℹ 371 more rows

Let’s only consider pitchers who have played more than 63 innings, because a World Series of 7 games has 7 × 9 = 63 innings that a successful pitcher can play.

# Keep only pitchers whose Inning_Played value exceeds 63.
WS_Pitching_ERAA <- filter(WS_Pitching_ERA, Inning_Played > 63)

# Preview the first ten remaining pitchers.
head(WS_Pitching_ERAA, n = 10)
## # A tibble: 10 × 5
##    playerID  Earned_Runs IPOuts New_ERA Inning_Played
##    <chr>           <int>  <int>   <dbl>         <dbl>
##  1 affelje01           1     35    0.26           105
##  2 arrieja01           3     34    0.79           102
##  3 baezpe01            4     24    1.5             72
##  4 batismi01           0     24    0               72
##  5 bauertr01           5     25    1.8             75
##  6 beckejo02           3     70    0.39           210
##  7 blantjo01           6     36    1.5            108
##  8 buehlwa01           1     39    0.23           117
##  9 buehrma01           4     22    1.64            66
## 10 bumgama01           1    108    0.08           324

According to MLB, a pitcher with an ERA below 2.5 is considered good. So let’s create bins to group the pitchers into two categories (“Best”, “Average”) based on their performance.

# Split pitchers at an ERA of 2.5: values up to and including 2.5 are "Best",
# anything above is "Average". The lowest break is -0.1 so that an ERA of
# exactly 0 falls inside the first (right-closed) bin.
WS_Pitching_ERAA <- WS_Pitching_ERAA |>
  mutate(
    Performance = cut(
      New_ERA,
      breaks = c(-0.1, 2.5, Inf),
      labels = c("Best", "Average")
    )
  )
head(WS_Pitching_ERAA, n = 10)
## # A tibble: 10 × 6
##    playerID  Earned_Runs IPOuts New_ERA Inning_Played Performance
##    <chr>           <int>  <int>   <dbl>         <dbl> <fct>      
##  1 affelje01           1     35    0.26           105 Best       
##  2 arrieja01           3     34    0.79           102 Best       
##  3 baezpe01            4     24    1.5             72 Best       
##  4 batismi01           0     24    0               72 Best       
##  5 bauertr01           5     25    1.8             75 Best       
##  6 beckejo02           3     70    0.39           210 Best       
##  7 blantjo01           6     36    1.5            108 Best       
##  8 buehlwa01           1     39    0.23           117 Best       
##  9 buehrma01           4     22    1.64            66 Best       
## 10 bumgama01           1    108    0.08           324 Best
# Number of pitchers in each ERA performance category.
WS_Pitching_ER_Prob <- WS_Pitching_ERAA |>
  count(Performance)

# Share of pitchers per category, rounded to two decimals.
# NOTE: this reassigns WS_Probability_ER, replacing the earned-runs table
# built under Question 2.
WS_Probability_ER <- mutate(
  WS_Pitching_ER_Prob,
  Probability = round(n / sum(n), digits = 2)
)
WS_Probability_ER
## # A tibble: 2 × 3
##   Performance     n Probability
##   <fct>       <int>       <dbl>
## 1 Best           95        0.94
## 2 Average         6        0.06
# Bar chart of the ERA category shares; each bar is labelled with its
# probability just above the bar top.
ggplot(
  WS_Probability_ER,
  aes(x = Performance, y = Probability, fill = Performance)
) +
  geom_col() +
  geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
  theme_classic()

From the data above, we can identify the pitchers who have performed really well.