library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(effsize)
library(pwrss)
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
# Absolute path to the postseason pitching CSV.
Data_set <- "/Users/ba/Documents/IUPUI/Masters/First Sem/Statistics/Dataset/PitchingPost.csv"
Pitching_Data <- read.csv(file = Data_set)

# Mean earned runs (ER) per league, rounded to two decimals and sorted from
# the lowest to the highest league average.
ER_Lea <- arrange(
  summarise(
    group_by(Pitching_Data, lgID),
    Avg_ER = round(mean(ER), digits = 2)
  ),
  Avg_ER
)
ER_Lea
## # A tibble: 2 × 2
## lgID Avg_ER
## <chr> <dbl>
## 1 NL 1.59
## 2 AL 1.74
Null Hypothesis: There is no difference in the mean earned runs allowed by pitchers in the American League and the National League.
# Observed AL - NL difference in the (rounded) league mean earned runs.
# Rows are looked up by league label rather than by position: ER_Lea is sorted
# by Avg_ER, so positional indexing ([2] - [1]) would always yield max - min
# and silently change meaning if the league ordering ever flipped.
Observed_mean_diff <- ER_Lea$Avg_ER[ER_Lea$lgID == "AL"] -
  ER_Lea$Avg_ER[ER_Lea$lgID == "NL"]

# Horizontal box plots of earned runs, one box per league, colored by league.
Pitching_Data |>
  ggplot() +
  geom_boxplot(
    mapping = aes(
      x = ER,
      y = factor(lgID,
                 levels = c("AL", "NL"),
                 labels = c("American League", "National League")),
      color = lgID
    )
  ) +
  labs(title = "League factor on Runs scored per game",
       x = "Runs",
       y = "League Variation") +
  theme_classic()
Interpretation: From the box plot above, AL pitchers appear to allow more earned runs than NL pitchers. We are interested in determining whether this provides enough evidence against the null hypothesis stated above.
Independent Sample t-test:
# Manual pooled-variance (Student) two-sample t-test of AL vs NL earned runs.

# Earned-run vectors for each league.
AL_df <- filter(Pitching_Data, lgID == "AL") |> pluck("ER")
NL_df <- filter(Pitching_Data, lgID == "NL") |> pluck("ER")

n1 <- length(AL_df)
n2 <- length(NL_df)
df <- n1 + n2 - 2  # degrees of freedom of the pooled test
sd1 <- sd(AL_df)
sd2 <- sd(NL_df)

# Pooled variance: each sample variance weighted by its degrees of freedom.
poolvar <- ((n1 - 1) * sd1^2 + (n2 - 1) * sd2^2) / df

# The statistic uses the exact difference of the sample means; a difference
# that has been rounded to two decimals would distort the t statistic.
# The hypothesized difference under H0 is 0.
t_stat <- ((mean(AL_df) - mean(NL_df)) - 0) / sqrt((poolvar / n1) + (poolvar / n2))

# Two-sided p-value. lower.tail = FALSE reads the upper tail directly, which
# avoids the loss of precision of 1 - pt() when the p-value is very small.
p_val <- 2 * pt(abs(t_stat), df, lower.tail = FALSE)
print(p_val)
## [1] 0.02000549
# Cross-check with R's built-in two-sample t-test on the same two vectors
# (called with its default settings).
t.test(x = AL_df, y = NL_df)
##
## Welch Two Sample t-test
##
## data: AL_df and NL_df
## t = 2.331, df = 3729.7, p-value = 0.01981
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.02388717 0.27675940
## sample estimates:
## mean of x mean of y
## 1.738403 1.588080
Interpretation: I have used two methods to find the p-value: 1. the traditional calculation by hand using the t-distribution (with a pooled variance), and 2. R's built-in independent-samples t-test (t.test(), which applies Welch's correction by default). Both p-values are less than 0.05, so we reject the null hypothesis. We have enough evidence to assert that AL pitchers allow more earned runs on average than NL pitchers.
# Bootstrap the sampling distribution of a statistic.
#
# Draws `n_iter` resamples of `x` (same length as `x`, with replacement) and
# evaluates `func` on each one.
#
# x:      vector of observations to resample.
# func:   statistic applied to every resample (default: mean).
# n_iter: number of bootstrap resamples (default: 10^4).
#
# Returns the values of `func`, one per resample, concatenated in order
# (NULL when n_iter is 0).
bootstrap <- function (x, func=mean, n_iter=10^4) {
  n_obs <- length(x)
  # seq_len() yields an empty sequence for n_iter = 0, whereas 1:n_iter would
  # iterate over c(1, 0). lapply() also avoids growing the result with c().
  func_values <- lapply(seq_len(n_iter), function(i) {
    # Index-based resampling: sample(x, ...) would treat a single number x as
    # the range 1:x instead of a one-element sample.
    x_sample <- x[sample.int(n_obs, size = n_obs, replace = TRUE)]
    func(x_sample)
  })
  unlist(func_values)
}
# Bootstrap distributions of the mean ER for each league (100 resamples each).
# The statistic is passed as `func`, the real parameter name; `fun` only
# worked through partial argument matching.
AL_Sample <- bootstrap(AL_df, func = mean, n_iter = 10^2)
NL_Sample <- bootstrap(NL_df, func = mean, n_iter = 10^2)

# Bootstrap distribution of the AL - NL difference in means.
bootstapped_mean_diffs <- AL_Sample - NL_Sample
Observed_mean_diff <- round(mean(AL_df) - mean(NL_df), 2)

# Normal curve centered at 0 (the null) with the bootstrap standard error,
# with the observed difference marked by a dashed vertical line.
ggplot() +
  geom_function(xlim = c(-1, 1),
                fun = function(x) dnorm(x, mean = 0,
                                        sd = sd(bootstapped_mean_diffs))) +
  geom_vline(xintercept = Observed_mean_diff, color = "blue", linetype = "dashed") +
  annotate("text", label = paste("Observed mean diff:", Observed_mean_diff),
           color = "blue", x = Observed_mean_diff + 0.5, y = 2.7) +
  labs(title = "Bootstrapped Sampling Distribution of ER diff",
       x = "Difference in ER Calculated",
       y = "Probability Density",
       color = "") +
  # A step of 100 over [-1, 1] produced a single break at -1; use a step that
  # actually places tick marks across the plotted range.
  scale_x_continuous(breaks = seq(-1, 1, by = 0.25)) +
  theme_classic()

# Center the bootstrap differences at 0 to approximate the null distribution,
# then take the share of them that is more extreme than the observed value.
bootstapped_dmean_diffs <- bootstapped_mean_diffs - mean(bootstapped_mean_diffs)
paste("p-value ",
      sum(abs(Observed_mean_diff) < abs(bootstapped_dmean_diffs)) /
        length(bootstapped_dmean_diffs))
## [1] "p-value 0.02"
Interpretation: As the plot shows, the area under the null distribution that is at least as extreme as the observed difference is close to zero. Hence, we have enough evidence to reject the null hypothesis stated above.
alpha = 0.05. I have chosen an alpha value of 0.05, indicating my willingness to accept a 5% risk of rejecting the null hypothesis when it is actually true (a Type I error); the appropriate level depends on the situation.
# Effect size (Cohen's d) of the AL vs NL difference in earned runs.
cohen.d(d = AL_df, f = NL_df)
##
## Cohen's d
##
## d estimate: 0.07617594 (negligible)
## 95 percent confidence interval:
## lower upper
## 0.01211587 0.14023600
library(pwrss)
# Power analysis for a two-sided independent-samples t-test using the observed
# league means and standard deviations, targeting 85% power at alpha = 0.05
# with kappa = 1.
ind_test <- pwrss.t.2means(
  mu1 = mean(AL_df),
  mu2 = mean(NL_df),
  sd1 = sd(AL_df),
  sd2 = sd(NL_df),
  kappa = 1,
  power = .85,
  alpha = 0.05,
  alternative = "not equal"
)
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 0.85
## n1 = 3098
## n2 = 3098
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 6194
## Non-centrality parameter = 2.997
## Type I error rate = 0.05
## Type II error rate = 0.15
# Plot the power-analysis result stored in ind_test.
plot(x = ind_test)
## Warning in qt(1 - prob.extreme, df = df, ncp = ncp, lower.tail = TRUE): full
## precision may not have been achieved in 'pnt{final}'
# Label every row "Best" when its ER exceeds the overall mean of HR, and
# "Worst" otherwise.
# NOTE(review): ER is compared against mean(HR), a different column, and
# higher ER is labelled "Best" -- confirm which column and labels were intended.
New_Pitching_Data <- Pitching_Data
New_Pitching_Data$Performance <- ifelse(
  New_Pitching_Data$ER > mean(New_Pitching_Data$HR),
  "Best",
  "Worst"
)

# Cross-tabulate the performance label (rows) against league (columns).
contingency_table <- table(
  New_Pitching_Data$Performance,
  New_Pitching_Data$lgID
)
contingency_table
##
## AL NL
## Best 1168 1129
## Worst 686 767
Null Hypothesis: There is no association between the league a team plays in and its home-run performance. HR_AA: Home Runs Above Average, HR_BA: Home Runs Below Average.
# Fisher's exact test of association between performance label and league.
fisher.test(
  x = New_Pitching_Data$Performance,
  y = New_Pitching_Data$lgID
)
##
## Fisher's Exact Test for Count Data
##
## data: New_Pitching_Data$Performance and New_Pitching_Data$lgID
## p-value = 0.03191
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.011880 1.322268
## sample estimates:
## odds ratio
## 1.156651
Interpretation: Fisher’s exact test gives a p-value less than 0.05, indicating that there is enough evidence to reject the null hypothesis. We can interpret this as an association between the league played in and the pitchers’ performance in that league.