library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(rmarkdown)
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
library(patchwork)
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:patchwork':
##
## align_plots
##
## The following object is masked from 'package:lubridate':
##
## stamp
lahman_data = read.csv("/Users/anuragreddy/Desktop/Statistics with R/Lahmans Databse .csv")
proportion=0.5
#lets find out the no.of rows in the lahman database.
set.seed(123)
Number_Rows <- nrow(lahman_data)
Lahman_Sample1 <- lahman_data[sample(nrow(lahman_data), size = proportion*Number_Rows, replace = TRUE), ]
Lahman_Sample2 <- lahman_data[sample(nrow(lahman_data), size = proportion*Number_Rows, replace = TRUE), ]
Lahman_Sample3 <- lahman_data[sample(nrow(lahman_data), size = proportion*Number_Rows, replace = TRUE), ]
Lahman_Sample4 <- lahman_data[sample(nrow(lahman_data), size = proportion*Number_Rows, replace = TRUE), ]
Lahman_Sample5 <- lahman_data[sample(nrow(lahman_data), size = proportion*Number_Rows, replace = TRUE), ]
Runs Per Game- Offensive Statistic
ERA (Earned Runs Average) - Pitching Statistic
CS (Caught Stealing) - Defensive Statistic
Note: We will calculate above statistic by grouping team wise. So that we can analyse the team’s performance in above statistics and find out the difference between the samples.
Runs per game is the column which we have to integrate into the samples.
Calculate Runs per Game, ERA and CS and integrate that into the samples.
Sample1 <-
Lahman_Sample1 |>
group_by(franchID) |>
summarise(Runs_per_Game = round(sum(R)/sum(G),2),
Team_ERA = round((9*sum(ER))/(sum(IPouts)/3),2),
Team_CS = round(mean(CS),2))
Sample2 <-
Lahman_Sample2 |>
group_by(franchID) |>
summarise(Runs_per_Game = round(sum(R)/sum(G),2),
Team_ERA = round((9*sum(ER))/(sum(IPouts)/3),2),
Team_CS = round(mean(CS),2))
Sample3 <-
Lahman_Sample3 |>
group_by(franchID) |>
summarise(Runs_per_Game = round(sum(R)/sum(G),2),
Team_ERA = round((9*sum(ER))/(sum(IPouts)/3),2),
Team_CS = round(mean(CS),2))
Sample4 <-
Lahman_Sample4 |>
group_by(franchID) |>
summarise(Runs_per_Game = round(sum(R)/sum(G),2),
Team_ERA = round((9*sum(ER))/(sum(IPouts)/3),2),
Team_CS = round(mean(CS),2))
Sample5 <-
Lahman_Sample5 |>
group_by(franchID) |>
summarise(Runs_per_Game = round(sum(R)/sum(G),2),
Team_ERA = round((9*sum(ER))/(sum(IPouts)/3),2),
Team_CS = round(mean(CS),2))
Now we have 5 samples (sample1, sample2, sample3, sample4, sample5) with the columns - Runs per game, Earned Runs Average, Caught Stealing.
Runs_Per_Game_Descriptive_Statistics <- data.frame(Runs_Per_Game = c('Min','Mean','Median','Max','Normality (Shapiro)','Standard Deviation'),
Sample_1 = c(round(min(Sample1$Runs_per_Game),2),round(mean(Sample1$Runs_per_Game),2),round(median(Sample1$Runs_per_Game),2),round(max(Sample1$Runs_per_Game),2),round(shapiro.test(Sample1$Runs_per_Game)$p.value,2),round(sd(Sample1$Runs_per_Game),2)),
Sample_2 = c(round(min(Sample2$Runs_per_Game),2),round(mean(Sample2$Runs_per_Game),2),round(median(Sample2$Runs_per_Game),2),round(max(Sample2$Runs_per_Game),2),round(shapiro.test(Sample2$Runs_per_Game)$p.value,2),round(sd(Sample2$Runs_per_Game),2)),
Sample_3 = c(round(min(Sample3$Runs_per_Game),2),round(mean(Sample3$Runs_per_Game),2),round(median(Sample3$Runs_per_Game),2),round(max(Sample3$Runs_per_Game),2),round(shapiro.test(Sample3$Runs_per_Game)$p.value,2),round(sd(Sample3$Runs_per_Game),2)),
Sample_4 = c(round(min(Sample4$Runs_per_Game),2),round(mean(Sample4$Runs_per_Game),2),round(median(Sample4$Runs_per_Game),2),round(max(Sample4$Runs_per_Game),2),round(shapiro.test(Sample4$Runs_per_Game)$p.value,2),round(sd(Sample4$Runs_per_Game),2)),
Sample_5 = c(round(min(Sample5$Runs_per_Game),2),round(mean(Sample5$Runs_per_Game),2),round(median(Sample5$Runs_per_Game),2),round(max(Sample5$Runs_per_Game),2),round(shapiro.test(Sample5$Runs_per_Game)$p.value,2),round(sd(Sample5$Runs_per_Game),2)))
Runs_Per_Game_Descriptive_Statistics
## Runs_Per_Game Sample_1 Sample_2 Sample_3 Sample_4 Sample_5
## 1 Min 4.08 4.00 4.09 4.15 3.89
## 2 Mean 4.56 4.52 4.58 4.59 4.58
## 3 Median 4.53 4.44 4.51 4.53 4.57
## 4 Max 5.15 5.34 5.31 5.30 5.21
## 5 Normality (Shapiro) 0.49 0.15 0.18 0.04 0.97
## 6 Standard Deviation 0.29 0.28 0.28 0.31 0.30
ERA_Descriptive_Statistics <- data.frame(Earned_Runs_Average = c('Min','Mean','Median','Max','Normality (Shapiro)','Standard Deviation'),
Sample_1 = c(round(min(Sample1$Team_ERA),2),round(mean(Sample1$Team_ERA),2),round(median(Sample1$Team_ERA),2),round(max(Sample1$Team_ERA),2),round(shapiro.test(Sample1$Team_ERA)$p.value,2),round(sd(Sample1$Team_ERA),2)),
Sample_2 = c(round(min(Sample2$Team_ERA),2),round(mean(Sample2$Team_ERA),2),round(median(Sample2$Team_ERA),2),round(max(Sample2$Team_ERA),2),round(shapiro.test(Sample2$Team_ERA)$p.value,2),round(sd(Sample2$Team_ERA),2)),
Sample_3 = c(round(min(Sample3$Team_ERA),2),round(mean(Sample3$Team_ERA),2),round(median(Sample3$Team_ERA),2),round(max(Sample3$Team_ERA),2),round(shapiro.test(Sample3$Team_ERA)$p.value,2),round(sd(Sample3$Team_ERA),2)),
Sample_4 = c(round(min(Sample4$Team_ERA),2),round(mean(Sample4$Team_ERA),2),round(median(Sample4$Team_ERA),2),round(max(Sample4$Team_ERA),2),round(shapiro.test(Sample4$Team_ERA)$p.value,2),round(sd(Sample4$Team_ERA),2)),
Sample_5 = c(round(min(Sample5$Team_ERA),2),round(mean(Sample5$Team_ERA),2),round(median(Sample5$Team_ERA),2),round(max(Sample5$Team_ERA),2),round(shapiro.test(Sample5$Team_ERA)$p.value,2),round(sd(Sample5$Team_ERA),2)))
ERA_Descriptive_Statistics
## Earned_Runs_Average Sample_1 Sample_2 Sample_3 Sample_4 Sample_5
## 1 Min 3.50 3.58 3.38 3.67 3.54
## 2 Mean 4.25 4.21 4.23 4.27 4.25
## 3 Median 4.27 4.20 4.20 4.24 4.22
## 4 Max 4.79 5.00 4.95 4.98 5.03
## 5 Normality (Shapiro) 0.82 0.07 0.90 0.39 0.24
## 6 Standard Deviation 0.29 0.33 0.37 0.32 0.31
CS_Descriptive_Statistics <- data.frame(Caught_Stealing = c('Min','Mean','Median','Max','Normality (Shapiro)','Standard Deviation'),
Sample_1 = c(round(min(Sample1$Team_CS),2),round(mean(Sample1$Team_CS),2),round(median(Sample1$Team_CS),2),round(max(Sample1$Team_CS),2),round(shapiro.test(Sample1$Team_CS)$p.value,2),round(sd(Sample1$Team_CS),2)),
Sample_2 = c(round(min(Sample2$Team_CS),2),round(mean(Sample2$Team_CS),2),round(median(Sample2$Team_CS),2),round(max(Sample2$Team_CS),2),round(shapiro.test(Sample2$Team_CS)$p.value,2),round(sd(Sample2$Team_CS),2)),
Sample_3 = c(round(min(Sample3$Team_CS),2),round(mean(Sample3$Team_CS),2),round(median(Sample3$Team_CS),2),round(max(Sample3$Team_CS),2),round(shapiro.test(Sample3$Team_CS)$p.value,2),round(sd(Sample3$Team_CS),2)),
Sample_4 = c(round(min(Sample4$Team_CS),2),round(mean(Sample4$Team_CS),2),round(median(Sample4$Team_CS),2),round(max(Sample4$Team_CS),2),round(shapiro.test(Sample4$Team_CS)$p.value,2),round(sd(Sample4$Team_CS),2)),
Sample_5 = c(round(min(Sample5$Team_CS),2),round(mean(Sample5$Team_CS),2),round(median(Sample5$Team_CS),2),round(max(Sample5$Team_CS),2),round(shapiro.test(Sample5$Team_CS)$p.value,2),round(sd(Sample5$Team_CS),2)))
CS_Descriptive_Statistics
## Caught_Stealing Sample_1 Sample_2 Sample_3 Sample_4 Sample_5
## 1 Min 22.56 24.42 26.07 24.86 24.14
## 2 Mean 35.40 35.16 35.31 34.46 34.02
## 3 Median 34.89 35.12 35.08 34.05 34.00
## 4 Max 50.43 56.00 44.91 45.70 49.40
## 5 Normality (Shapiro) 0.46 0.13 0.42 0.58 0.51
## 6 Standard Deviation 6.25 6.61 5.20 5.35 5.79
Statistic | Runs Per Game | Earned Runs Average | Caught Stealing |
---|---|---|---|
Min | Almost same in 5 samples. It lies between (4,4.1). Sample 4 has the highest minimum value among the samples. | Ranging between 3.5-3.7. Sample 3 has highest min value and sample 1 has the lowest. | All the samples show variation in the caught stealing minimum value. It ranges from (22,26). Sample 3 has the highest min value. |
Mean | Mean is quite similar in all the samples ranges between (4.5,4.6) Runs Per Game in 22 years. | Mean is quite similar in all the samples. approx ~4.2 Earned Runs Average in last 22 years. | Mean of caught stealing is ranging between (34,35) caught steals in last 22 years. |
Median | The middle most value of all the samples lies between (4.5,4.6). First 4 samples have their mean and median almost same. | The middle most value is quite similar in all the 5 samples. approx 4.2 same as mean. | First 3 samples have approx 35 ha their median and remaining two are at 34. |
Max | When it comes to maximum value - there is little variation of at least 0.2 between samples | The max ERA in all the samples is approximately equals to ~5. In sample 1 it is 4.7. The least max value in all samples. | Max of CS has a high variation compared to other 2 stats. Sample 2 hsa 56 as the max remaining are close to 50. |
Normality | We have used shapiro test for testing the normality of the data. It is found that apart from 4th sample all other samples are normally distributed (sig>0.05). | We have used shapiro test for testing the normality of the data. It is found all ERA samples are normally distributed (sig>0.05) | We have used shapiro test for testing the normality of the data. It is found all CS samples are normally distributed (sig>0.05). |
SD | SD is almost same in all the 5 sample approx ~0.3 | SD is almost same in all the 5 samples, approx ~0.3. except in sample2 it is 0.4. | There is a variation in the SD of caught stealing samples. Samples 1 and 2 are highly dispersed compared to other samples. |
RPG_1 <-
ggplot(Sample1,aes(x=Runs_per_Game))+
geom_histogram(color='black',aes(y=..density..),binwidth=0.1)+
geom_density(color='blue',linewidth=2)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample1$Runs_per_Game),sd=sd(Sample1$Runs_per_Game)))+
ggtitle("Sample 1")+
theme_classic()
RPG_2 <-
ggplot(Sample2,aes(x=Runs_per_Game))+
geom_histogram(color='black',aes(y=..density..),binwidth=0.1)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample2$Runs_per_Game),sd=sd(Sample2$Runs_per_Game)))+
geom_density(color='blue',linewidth=2)+
ggtitle("Sample 2")+
theme_classic()
RPG_3 <-
ggplot(Sample3,aes(x=Runs_per_Game))+
geom_histogram(color='black',aes(y=..density..),binwidth=0.1)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample3$Runs_per_Game),sd=sd(Sample3$Runs_per_Game)))+
geom_density(color='blue',linewidth=2)+
ggtitle("Sample 3")+
theme_classic()
RPG_4 <-
ggplot(Sample4,aes(x=Runs_per_Game))+
geom_histogram(color='black',aes(y=..density..),binwidth=0.1)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample4$Runs_per_Game),sd=sd(Sample4$Runs_per_Game)))+
geom_density(color='blue',linewidth=2)+
ggtitle("Sample 4")+
theme_classic()
RPG_5 <-
ggplot(Sample5,aes(x=Runs_per_Game))+
geom_histogram(color='black',aes(y=..density..),binwidth=0.1)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample5$Runs_per_Game),sd=sd(Sample5$Runs_per_Game)))+
geom_density(color='blue',linewidth=2)+
ggtitle("Sample 5")+
theme_classic()
plot_grid(RPG_1,RPG_2,RPG_3,RPG_4,RPG_5,ncol = 2)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
NOTE: It's important to note that visual inspection alone is not a definitive method for assessing normality
Shapiro Test Null Hypothesis: It states that data is normally distributed.
(Similarly lets visualize the distribution for other 2 statistics (ERA and CS)
ERA_1 <-
ggplot(Sample1,aes(x=Team_ERA))+
geom_histogram(color='black',aes(y=..density..),binwidth=0.1)+
geom_density(color='blue',linewidth=2)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample1$Team_ERA),sd=sd(Sample1$Team_ERA)))+
ggtitle("Sample 1")+
theme_classic()
ERA_2 <-
ggplot(Sample2,aes(x=Team_ERA))+
geom_histogram(color='black',aes(y=..density..),binwidth=0.1)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample2$Team_ERA),sd=sd(Sample2$Team_ERA)))+
geom_density(color='blue',linewidth=2)+
ggtitle("Sample 2")+
theme_classic()
ERA_3 <-
ggplot(Sample3,aes(x=Team_ERA))+
geom_histogram(color='black',aes(y=..density..),binwidth=0.1)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample3$Team_ERA),sd=sd(Sample3$Team_ERA)))+
geom_density(color='blue',linewidth=2)+
ggtitle("Sample 3")+
theme_classic()
ERA_4 <-
ggplot(Sample4,aes(x=Team_ERA))+
geom_histogram(color='black',aes(y=..density..),binwidth=0.1)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample4$Team_ERA),sd=sd(Sample4$Team_ERA)))+
geom_density(color='blue',linewidth=2)+
ggtitle("Sample 4")+
theme_classic()
ERA_5 <-
ggplot(Sample5,aes(x=Team_ERA))+
geom_histogram(color='black',aes(y=..density..),binwidth=0.1)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample5$Team_ERA),sd=sd(Sample5$Team_ERA)))+
geom_density(color='blue',linewidth=2)+
ggtitle("Sample 5")+
theme_classic()
plot_grid(ERA_1,ERA_2,ERA_3,ERA_4,ERA_5,ncol = 2)
CS_1 <-
ggplot(Sample1,aes(x=Team_CS))+
geom_histogram(color='black',aes(y=..density..),binwidth=1.5)+
geom_density(color='blue',linewidth=2)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample1$Team_CS),sd=sd(Sample1$Team_CS)))+
ggtitle("Sample 1")+
theme_classic()
CS_2 <-
ggplot(Sample2,aes(x=Team_CS))+
geom_histogram(color='black',aes(y=..density..),binwidth=1.5)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample2$Team_CS),sd=sd(Sample2$Team_CS)))+
geom_density(color='blue',linewidth=2)+
ggtitle("Sample 2")+
theme_classic()
CS_3 <-
ggplot(Sample3,aes(x=Team_CS))+
geom_histogram(color='black',aes(y=..density..),binwidth=1.5)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample3$Team_CS),sd=sd(Sample3$Team_CS)))+
geom_density(color='blue',linewidth=2)+
ggtitle("Sample 3")+
theme_classic()
CS_4 <-
ggplot(Sample4,aes(x=Team_CS))+
geom_histogram(color='black',aes(y=..density..),binwidth=1.5)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample4$Team_CS),sd=sd(Sample4$Team_CS)))+
geom_density(color='blue',linewidth=2)+
ggtitle("Sample 4")+
theme_classic()
CS_5 <-
ggplot(Sample5,aes(x=Team_CS))+
geom_histogram(color='black',aes(y=..density..),binwidth=1.5)+
geom_function(fun=dnorm,color='red', linewidth=2, args = list(mean=mean(Sample5$Team_CS),sd=sd(Sample5$Team_CS)))+
geom_density(color='blue',linewidth=2)+
ggtitle("Sample 5")+
theme_classic()
plot_grid(CS_1,CS_2,CS_3,CS_4,CS_5,ncol = 2)
NOTE: To visualize the distribution of sample with the distribution of normal data - Q-Q plots (Quantile - Quantile) are best graphs. Lets try to implement the Q-Q plots for the any one of the statistic samples.
Lets use caught stealing statistic and visualize the Q-Q plots for the all 5 samples.
CS_QQ1 <-
ggplot(Sample1, aes(sample=Team_CS))+
geom_qq()+
geom_qq_line()+
labs(x='Theoretical Qunatiles', y='Team_CS Quantiles')+
ggtitle("Q-Q Plot for Sample 1")+
theme_classic()
CS_QQ2 <-
ggplot(Sample2, aes(sample=Team_CS))+
geom_qq()+
geom_qq_line()+
labs(x='Theoretical Qunatiles', y='Team_CS Quantiles')+
ggtitle("Q-Q Plot for Sample 2")+
theme_classic()
CS_QQ3 <-
ggplot(Sample3, aes(sample=Team_CS))+
geom_qq()+
geom_qq_line()+
labs(x='Theoretical Qunatiles', y='Team_CS Quantiles')+
ggtitle("Q-Q Plot for Sample 3")+
theme_classic()
CS_QQ4 <-
ggplot(Sample4, aes(sample=Team_CS))+
geom_qq()+
geom_qq_line()+
labs(x='Theoretical Qunatiles', y='Team_CS Quantiles')+
ggtitle("Q-Q Plot for Sample 4")+
theme_classic()
CS_QQ5 <-
ggplot(Sample5, aes(sample=Team_CS))+
geom_qq()+
geom_qq_line()+
labs(x='Theoretical Qunatiles', y='Team_CS Quantiles')+
ggtitle("Q-Q Plot for Sample 5")+
theme_classic()
plot_grid(CS_QQ1,CS_QQ2,CS_QQ3,CS_QQ4,CS_QQ5,ncol = 3)
RPG_BP1 <-
ggplot(Sample1,aes(x='Sample1',y=Runs_per_Game))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
RPG_BP2 <-
ggplot(Sample2,aes(x='Sample2',y=Runs_per_Game))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
RPG_BP3 <-
ggplot(Sample3,aes(x='Sample3',y=Runs_per_Game))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
RPG_BP4 <-
ggplot(Sample4,aes(x='Sample4',y=Runs_per_Game))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
RPG_BP5 <-
ggplot(Sample5,aes(x='Sample5',y=Runs_per_Game))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
plot_grid(RPG_BP1,RPG_BP2,RPG_BP3,RPG_BP4,RPG_BP5)
For example: An outlier, 5.2 Runs per game scored in sample 2,3,4 is actually not an outlier in the samples 1,2.
ERA_BP1 <-
ggplot(Sample1,aes(x='Sample1',y=Team_ERA))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
ERA_BP2 <-
ggplot(Sample2,aes(x='Sample2',y=Team_ERA))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
ERA_BP3 <-
ggplot(Sample3,aes(x='Sample3',y=Team_ERA))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
ERA_BP4 <-
ggplot(Sample4,aes(x='Sample4',y=Team_ERA))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
ERA_BP5 <-
ggplot(Sample5,aes(x='Sample5',y=Team_ERA))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
plot_grid(ERA_BP1,ERA_BP2,ERA_BP3,ERA_BP4,ERA_BP5)
For example: An outlier, 5.2 ERA in sample 2,4,5 is actually not an outlier in the samples 1,3.
CS_BP1 <-
ggplot(Sample1,aes(x='Sample1',y=Team_CS))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
CS_BP2 <-
ggplot(Sample2,aes(x='Sample2',y=Team_CS))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
CS_BP3 <-
ggplot(Sample3,aes(x='Sample3',y=Team_CS))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
CS_BP4 <-
ggplot(Sample4,aes(x='Sample4',y=Team_CS))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
CS_BP5 <-
ggplot(Sample5,aes(x='Sample5',y=Team_CS))+
geom_boxplot(fill='lightblue',width=0.5,color='black')+
theme_classic()
plot_grid(CS_BP1,CS_BP2,CS_BP3,CS_BP4,CS_BP5)
For example: An outlier, 52 CS in sample 1,2 is actually not an outlier in the samples 3,4,5.
Lets try to simulate no.of home runs made by Los Angeles Angels in MLB.
set.seed(345)
Home_Runs <- lahman_data|>
filter(franchID == 'ANA') |>
select(franchID, HR)
#Define no.of simulations
Home_Runs <- na.omit(Home_Runs)
num_simulations <- 1000
HR_Simulated_Mean <- numeric(1000)
for(i in 1:num_simulations){
Simulated_HR <- Home_Runs$HR[sample(length(Home_Runs$HR),replace = TRUE)]
HR_Simulated_Mean[i] <- mean(Simulated_HR)
}
#These are the simulated mean of home runs of Los Angeles Angels.
head(HR_Simulated_Mean)
## [1] 174.8636 169.7273 161.1818 177.8182 157.6818 146.7273
Now lets visualize the above simulated mean Home Runs in histogram.
ANA_HR_Simulated <- data.frame(Home_Runs = HR_Simulated_Mean)
ggplot(ANA_HR_Simulated,aes(x=Home_Runs)) +
geom_histogram(fill='lightblue',color='black',aes(y = after_stat(density)))+
geom_density(color='red')+
labs(title = "Monte Carlo Simulation of Home Run Totals",
x = "Home Run Totals", y = "Frequency")+
theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
HR_Lessthan_150 <- mean(HR_Simulated_Mean < 150)
HR_Lessthan_150
## [1] 0.01
ggplot(ANA_HR_Simulated,aes(x=Home_Runs)) +
geom_histogram(fill='lightblue',color='black',aes(y = ..density..))+
geom_density(color='red')+
geom_vline(xintercept = 150,linetype='solid',color='black')+
annotate('rect',xmin=min(HR_Simulated_Mean),xmax=150,ymin=0,ymax=HR_Lessthan_150,color='black',fill='white')+
annotate('text',label = as.character(HR_Lessthan_150),x=min(HR_Simulated_Mean)+1,y=HR_Lessthan_150+0.005,color='black')+
labs(title = "Monte Carlo Simulation of Home Run Totals",
x = "Home Run Totals", y = "Frequency")+
theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
To answer this we will be using quantile() function. It is an in-built function in R.
Confidence_Interval <- quantile(HR_Simulated_Mean, c(0.025,0.975))
Confidence_Interval
## 2.5% 97.5%
## 151.9989 177.6386