Week-4 : (Data-Dive)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(patchwork)
Data_set <- "/Users/ba/Documents/IUPUI/Masters/First Sem/Statistics/Dataset/PitchingPost.csv"
Pitching_Data <- read.csv(Data_set)
library(dplyr)
set.seed(123)
sample_size <- nrow(Pitching_Data) * 0.5
df_1 <- Pitching_Data %>% sample_n(size = sample_size, replace = TRUE)
df_2 <- Pitching_Data %>% sample_n(size = sample_size, replace = TRUE)
df_3 <- Pitching_Data %>% sample_n(size = sample_size, replace = TRUE)
df_4 <- Pitching_Data %>% sample_n(size = sample_size, replace = TRUE)
df_5 <- Pitching_Data %>% sample_n(size = sample_size, replace = TRUE)
Runs-Allowed
# Sample-1
WS_Pitching_RA1<-
df_1 |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Runs_Allowed = round(mean(R),2))
WS_Pitching_RA1$Performance = cut(WS_Pitching_RA1$Runs_Allowed, breaks = c(-0.1,4,7,Inf), labels = c('Best','Average','Worst'))
WS_Pithing_RA_Prob1<- count(WS_Pitching_RA1, Performance)
WS_Probability_RA1 <- WS_Pithing_RA_Prob1 |>
mutate(Probability = round(n/sum(n),2))
# Sample-2
WS_Pitching_RA2<-
df_2 |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Runs_Allowed = round(mean(R),2))
WS_Pitching_RA2$Performance = cut(WS_Pitching_RA2$Runs_Allowed, breaks = c(-0.1,4,7,Inf), labels = c('Best','Average','Worst'))
WS_Pithing_RA_Prob2<- count(WS_Pitching_RA2, Performance)
WS_Probability_RA2 <- WS_Pithing_RA_Prob2 |>
mutate(Probability = round(n/sum(n),2))
# Sample-3
WS_Pitching_RA3<-
df_3 |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Runs_Allowed = round(mean(R),2))
WS_Pitching_RA3$Performance = cut(WS_Pitching_RA3$Runs_Allowed, breaks = c(-0.1,4,7,Inf), labels = c('Best','Average','Worst'))
WS_Pithing_RA_Prob3<- count(WS_Pitching_RA3, Performance)
WS_Probability_RA3 <- WS_Pithing_RA_Prob3 |>
mutate(Probability = round(n/sum(n),2))
# Sample-4
WS_Pitching_RA4<-
df_4 |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Runs_Allowed = round(mean(R),2))
WS_Pitching_RA4$Performance = cut(WS_Pitching_RA4$Runs_Allowed, breaks = c(-0.1,4,7,Inf), labels = c('Best','Average','Worst'))
WS_Pithing_RA_Prob4<- count(WS_Pitching_RA4, Performance)
WS_Probability_RA4 <- WS_Pithing_RA_Prob4 |>
mutate(Probability = round(n/sum(n),2))
# Sample-5
WS_Pitching_RA5<-
df_5 |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Runs_Allowed = round(mean(R),2))
WS_Pitching_RA5$Performance = cut(WS_Pitching_RA5$Runs_Allowed, breaks = c(-0.1,4,7,Inf), labels = c('Best','Average','Worst'))
WS_Pithing_RA_Prob5<- count(WS_Pitching_RA5, Performance)
WS_Probability_RA5 <- WS_Pithing_RA_Prob5 |>
mutate(Probability = round(n/sum(n),2))
#Now lets visualize
library(ggplot2)
library(patchwork)
plot1 <- WS_Probability_RA1 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot2 <- WS_Probability_RA2 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot3 <- WS_Probability_RA3 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot4 <- WS_Probability_RA4 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot5 <- WS_Probability_RA5 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot1
plot2
plot3
plot4
plot5
Here we can see that all the performance categories (Best, Average and Worst) all have close values with a difference in the range of “0.03” unit probability
Earned - Runs
#Sample -1
WS_Pitching_ER1<-
df_1 |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Earned_Runs = round(mean(ER),2))
WS_Pitching_ER1$Performance = cut(WS_Pitching_ER1$Earned_Runs, breaks = c(-0.1,4,7,Inf), labels = c('Best','Average','Worst'))
WS_Pithing_ER_Prob1<- count(WS_Pitching_ER1, Performance)
WS_Probability_ER1 <- WS_Pithing_ER_Prob1 |>
mutate(Probability = round(n/sum(n),2))
#Sample -2
WS_Pitching_ER2<-
df_2 |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Earned_Runs = round(mean(ER),2))
WS_Pitching_ER2$Performance = cut(WS_Pitching_ER2$Earned_Runs, breaks = c(-0.1,4,7,Inf), labels = c('Best','Average','Worst'))
WS_Pithing_ER_Prob2<- count(WS_Pitching_ER2, Performance)
WS_Probability_ER2 <- WS_Pithing_ER_Prob2 |>
mutate(Probability = round(n/sum(n),2))
#Sample -3
WS_Pitching_ER3<-
df_3 |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Earned_Runs = round(mean(ER),2))
WS_Pitching_ER3$Performance = cut(WS_Pitching_ER3$Earned_Runs, breaks = c(-0.1,4,7,Inf), labels = c('Best','Average','Worst'))
WS_Pithing_ER_Prob3<- count(WS_Pitching_ER3, Performance)
WS_Probability_ER3 <- WS_Pithing_ER_Prob3 |>
mutate(Probability = round(n/sum(n),2))
#Sample -4
WS_Pitching_ER4<-
df_4 |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Earned_Runs = round(mean(ER),2))
WS_Pitching_ER4$Performance = cut(WS_Pitching_ER4$Earned_Runs, breaks = c(-0.1,4,7,Inf), labels = c('Best','Average','Worst'))
WS_Pithing_ER_Prob4<- count(WS_Pitching_ER4, Performance)
WS_Probability_ER4 <- WS_Pithing_ER_Prob4 |>
mutate(Probability = round(n/sum(n),2))
#Sample -5
WS_Pitching_ER5<-
df_5 |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Earned_Runs = round(mean(ER),2))
WS_Pitching_ER5$Performance = cut(WS_Pitching_ER5$Earned_Runs, breaks = c(-0.1,4,7,Inf), labels = c('Best','Average','Worst'))
WS_Pithing_ER_Prob5<- count(WS_Pitching_ER5, Performance)
WS_Probability_ER5 <- WS_Pithing_ER_Prob5 |>
mutate(Probability = round(n/sum(n),2))
#Now lets visualize
library(ggplot2)
library(patchwork)
plot6 <- WS_Probability_ER1 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot7 <- WS_Probability_ER2 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot8 <- WS_Probability_ER3 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot9 <- WS_Probability_ER4 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot10 <- WS_Probability_ER5 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot6
plot7
plot8
plot9
plot10
Even for the Earned Runs we can see that all the performance categories (Best, Average and Worst) all have close values with a difference in the range of “0.03” unit probability and sample-2 seems to have the lowest probability in “Best” performance category and sample-3 seems to have the highest probability in the “Best” performance category when compared to all the 5 sub-samples.
Earned Run Average
#Sample -1
WS_Pitching_ERA1<-
df_1 |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Earned_Runs = sum(ER),
IPOuts = sum(IPouts))|>
mutate(New_ERA1 = round((9*Earned_Runs)/(IPOuts),2))|>
mutate(Inning_Played = IPOuts*3)
WS_Pitching_ERAA1 <- WS_Pitching_ERA1 |>
filter(Inning_Played > 63)
WS_Pitching_ERAA1$Performance = cut(WS_Pitching_ERAA1$New_ERA1, breaks = c(-0.1,2.5,Inf), labels = c('Best','Average'))
WS_Pitching_ER_Prob1<- count(WS_Pitching_ERAA1, Performance)
WS_Probability_ER1 <- WS_Pitching_ER_Prob1 |>
mutate(Probability = round(n/sum(n),2))
WS_Probability_ER1 |>
ggplot(aes(x = Performance,y=Probability, fill = Performance)) +
geom_bar(stat = 'identity')+
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5)+
theme_classic()
#Sample -2
WS_Pitching_ERA2<-
df_2 |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Earned_Runs = sum(ER),
IPOuts = sum(IPouts))|>
mutate(New_ERA2 = round((9*Earned_Runs)/(IPOuts),2))|>
mutate(Inning_Played = IPOuts*3)
WS_Pitching_ERAA2 <- WS_Pitching_ERA2 |>
filter(Inning_Played > 63)
WS_Pitching_ERAA2$Performance = cut(WS_Pitching_ERAA2$New_ERA2, breaks = c(-0.1,2.5,Inf), labels = c('Best','Average'))
WS_Pitching_ER_Prob2<- count(WS_Pitching_ERAA2, Performance)
WS_Probability_ER2 <- WS_Pitching_ER_Prob2 |>
mutate(Probability = round(n/sum(n),2))
WS_Probability_ER2 |>
ggplot(aes(x = Performance,y=Probability, fill = Performance)) +
geom_bar(stat = 'identity')+
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5)+
theme_classic()
#Sample -3
WS_Pitching_ERA3<-
Pitching_Data |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Earned_Runs = sum(ER),
IPOuts = sum(IPouts))|>
mutate(New_ERA3 = round((9*Earned_Runs)/(IPOuts),2))|>
mutate(Inning_Played = IPOuts*3)
WS_Pitching_ERAA3 <- WS_Pitching_ERA3 |>
filter(Inning_Played > 63)
WS_Pitching_ERAA3$Performance = cut(WS_Pitching_ERAA3$New_ERA3, breaks = c(-0.1,2.5,Inf), labels = c('Best','Average'))
WS_Pitching_ER_Prob3<- count(WS_Pitching_ERAA3, Performance)
WS_Probability_ER3 <- WS_Pitching_ER_Prob3 |>
mutate(Probability = round(n/sum(n),2))
WS_Probability_ER3 |>
ggplot(aes(x = Performance,y=Probability, fill = Performance)) +
geom_bar(stat = 'identity')+
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5)+
theme_classic()
#Sample -4
WS_Pitching_ERA4<-
Pitching_Data |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Earned_Runs = sum(ER),
IPOuts = sum(IPouts))|>
mutate(New_ERA4 = round((9*Earned_Runs)/(IPOuts),2))|>
mutate(Inning_Played = IPOuts*3)
WS_Pitching_ERAA4 <- WS_Pitching_ERA4 |>
filter(Inning_Played > 63)
WS_Pitching_ERAA4$Performance = cut(WS_Pitching_ERAA4$New_ERA4, breaks = c(-0.1,2.5,Inf), labels = c('Best','Average'))
WS_Pitching_ER_Prob4<- count(WS_Pitching_ERAA4, Performance)
WS_Probability_ER4 <- WS_Pitching_ER_Prob4 |>
mutate(Probability = round(n/sum(n),2))
WS_Probability_ER4 |>
ggplot(aes(x = Performance,y=Probability, fill = Performance)) +
geom_bar(stat = 'identity')+
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5)+
theme_classic()
#Sample -5
WS_Pitching_ERA5<-
Pitching_Data |>
filter(round=='WS') |>
group_by(playerID) |>
summarise(Earned_Runs = sum(ER),
IPOuts = sum(IPouts))|>
mutate(New_ERA5 = round((9*Earned_Runs)/(IPOuts),2))|>
mutate(Inning_Played = IPOuts*3)
WS_Pitching_ERAA5 <- WS_Pitching_ERA5 |>
filter(Inning_Played > 63)
WS_Pitching_ERAA5$Performance = cut(WS_Pitching_ERAA5$New_ERA5, breaks = c(-0.1,2.5,Inf), labels = c('Best','Average'))
WS_Pitching_ER_Prob5<- count(WS_Pitching_ERAA5, Performance)
WS_Probability_ER5 <- WS_Pitching_ER_Prob5 |>
mutate(Probability = round(n/sum(n),2))
WS_Probability_ER5 |>
ggplot(aes(x = Performance,y=Probability, fill = Performance)) +
geom_bar(stat = 'identity')+
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5)+
theme_classic()
#Now lets visualize
library(ggplot2)
library(patchwork)
plot11 <- WS_Probability_ER1 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot12 <- WS_Probability_ER2 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot13 <- WS_Probability_ER3 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot14 <- WS_Probability_ER4 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot15 <- WS_Probability_ER5 %>%
ggplot(aes(x = Performance, y = Probability, fill = Performance)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = Probability), size = 4, vjust = -0.5, hjust = 0.5) +
theme_minimal()
plot11
plot12
plot13
plot14
plot15
In this comparison of the Earned Run Average among the 5 sub-samples we can see that sample-3, sample-4 and sample-5 give a consistently same output where sample-1 and sample-2 have slightly different values.
Now lets implement Monte Carlo simulation on the variable “Runs Allowed” to see if it is any close to the actual mean of the variable “Runs Allowed”
samples_Pitching_Data <- rnorm(1000, mean(Pitching_Data$R), sd(Pitching_Data$R))
simulations_Pitching_data <- replicate(1000, mean(sample(samples_Pitching_Data, 100, replace = TRUE)))
hist(simulations_Pitching_data, main = "Histogram of Simulations for Dataset 1")
print(mean(Pitching_Data$R))
## [1] 1.790667
We can see that the mean of “Run Allowed” is close to the mean we acquired through the Monte Carlo’s Simulation.
df_1 <- df_1[is.finite(df_1$ERA), ]
df_1 |>
ggplot(aes(x="sample-1",y=ERA))+
stat_boxplot()+
labs(x="Sample 1",y="ERA Distribution",title="Finding the anomalies in ERA from Sample 1")+
theme_classic()
df_2 <- df_2[is.finite(df_2$ERA), ]
df_2 |>
ggplot(aes(x="sample-2",y=ERA))+
stat_boxplot()+
labs(x="Sample 1",y="ERA Distribution",title="Finding the anomalies in ERA from Sample 2")+
theme_classic()
df_3 <- df_3[is.finite(df_3$ERA), ]
df_3 |>
ggplot(aes(x="sample-3",y=ERA))+
stat_boxplot()+
labs(x="Sample 1",y="ERA Distribution",title="Finding the anomalies in ERA from Sample 3")+
theme_classic()
df_4 <- df_4[is.finite(df_4$ERA), ]
df_4 |>
ggplot(aes(x="sample-4",y=ERA))+
stat_boxplot()+
labs(x="Sample 4",y="ERA Distribution",title="Finding the anomalies in ERA from Sample 4")+
theme_classic()
df_5 <- df_5[is.finite(df_5$ERA), ]
df_5 |>
ggplot(aes(x="sample-5",y=ERA))+
stat_boxplot()+
labs(x="Sample 5",y="ERA Distribution",title="Finding the anomalies in ERA from Sample 5")+
theme_classic()
When we compare ERA among all the 5 samples, we can see that sampling was done really good because most of the outliers were very similar among all the samples, outliers samples 2,3 are very similar but slightly different from samples 1,4,5.