library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(effsize)
library(pwrss)
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
library(dplyr)
library(rmarkdown)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
# Read the Lahman data and add RPG, runs per game (R / G), rounded to two
# decimals.
lahman_data <- read.csv(
  "/Users/anuragreddy/Desktop/Statistics with R/Lahmans Databse .csv"
) |>
  mutate(RPG = round(R / G, 2))

# Mean RPG per league (rounded to two decimals), sorted from lowest to highest.
RPG_Lea <- arrange(
  summarise(
    group_by(lahman_data, lgID),
    Avg_RPG = round(mean(RPG), 2)
  ),
  Avg_RPG
)
RPG_Lea
## # A tibble: 2 × 2
## lgID Avg_RPG
## <chr> <dbl>
## 1 NL 4.47
## 2 AL 4.67
# Observed difference in mean runs per game, American League minus National
# League, taken from the rounded per-league averages in RPG_Lea. Rows are
# selected by league code rather than by position, so the sign and value do
# not depend on how RPG_Lea happens to be sorted.
Observed_mean_diff <- RPG_Lea$Avg_RPG[RPG_Lea$lgID == "AL"] -
  RPG_Lea$Avg_RPG[RPG_Lea$lgID == "NL"]
# Horizontal boxplots of runs per game, one box per league, coloured by the
# league code.
ggplot(
  data = lahman_data,
  mapping = aes(
    x = RPG,
    y = factor(
      lgID,
      levels = c("AL", "NL"),
      labels = c("American League", "National League")
    ),
    color = lgID
  )
) +
  geom_boxplot() +
  labs(
    title = "League factor on Runs scored per game",
    x = "Runs",
    y = "League Variation"
  ) +
  theme_classic()
Before performing the independent-samples t-test, we need to check whether our data satisfy its assumptions, because the independent-samples t-test is a parametric test.
# Runs-per-game values for each league, extracted as plain vectors.
AL_df <- lahman_data |>
  filter(lgID == "AL") |>
  pull(RPG)
NL_df <- lahman_data |>
  filter(lgID == "NL") |>
  pull(RPG)
# Shapiro-Wilk normality check on the American League values.
AL_df |> shapiro.test()
##
## Shapiro-Wilk normality test
##
## data: AL_df
## W = 0.99413, p-value = 0.2607
# Shapiro-Wilk normality check on the National League values.
NL_df |> shapiro.test()
##
## Shapiro-Wilk normality test
##
## data: NL_df
## W = 0.99263, p-value = 0.08908
# Sample standard deviation of runs per game in each league.
AL_df |> sd()
## [1] 0.5272305
NL_df |> sd()
## [1] 0.4808208
# Pooled-variance two-sample t-test of AL vs NL runs per game, computed by hand.
n1 <- length(AL_df)
n2 <- length(NL_df)
df <- n1 + n2 - 2
sd1 <- sd(AL_df)
sd2 <- sd(NL_df)
# Pooled variance: the two sample variances weighted by their degrees of
# freedom.
poolvar <- ((n1 - 1) * sd1^2 + (n2 - 1) * sd2^2) / df
# The numerator is the unrounded difference in sample means. Using a
# difference built from means already rounded to two decimals overstates the
# statistic. Any p-value recorded from the rounded difference is therefore
# slightly smaller than the one printed here.
t_stat <- ((mean(AL_df) - mean(NL_df)) - 0) / sqrt(poolvar * (1 / n1 + 1 / n2))
# Two-sided p-value; lower.tail = FALSE evaluates the upper tail directly
# instead of subtracting a value very close to 1 from 1.
p_val <- 2 * pt(abs(t_stat), df, lower.tail = FALSE)
print(p_val)
## [1] 4.516326e-07
# Two-sample t-test without assuming equal variances (Welch), two-sided.
t.test(AL_df, NL_df, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: AL_df and NL_df
## t = 4.985, df = 639.42, p-value = 7.988e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1189852 0.2736506
## sample estimates:
## mean of x mean of y
## 4.669117 4.472799
#' Bootstrap the sampling distribution of a statistic
#'
#' Repeatedly resamples `x` with replacement (each resample has the same
#' length as `x`) and applies `func` to every resample.
#'
#' @param x Vector of observations to resample.
#' @param func Function applied to each resample; defaults to `mean`.
#' @param n_iter Number of bootstrap resamples; defaults to 10^4.
#' @return The values of `func` over all resamples, combined into one vector
#'   (`NULL` when `n_iter` is 0).
bootstrap <- function(x, func = mean, n_iter = 10^4) {
  n_obs <- length(x)
  # Resample by index with sample.int() rather than sample(x, ...): sample()
  # treats a single number >= 1 as the range 1:x, which would draw values that
  # are not in the data when `x` has length one.
  # seq_len() yields zero iterations for n_iter = 0, where 1:n_iter would loop
  # over c(1, 0). Collecting into a list avoids re-copying a growing vector on
  # every iteration.
  func_values <- lapply(seq_len(n_iter), function(i) {
    func(x[sample.int(n_obs, size = n_obs, replace = TRUE)])
  })
  unlist(func_values)
}
# Bootstrap the mean runs per game in each league (100 resamples each). The
# statistic is passed with its full argument name, `func`, instead of relying
# on partial matching of `fun`.
AL_Sample <- bootstrap(AL_df, func = mean, n_iter = 10^2)
NL_Sample <- bootstrap(NL_df, func = mean, n_iter = 10^2)
# We now have the bootstrapped means for AL and NL. Take their element-wise
# difference, and compute the observed mean difference of the original data
# (rounded to two decimals) so it can be marked on the plot.
bootstapped_mean_diffs <- AL_Sample - NL_Sample
Observed_mean_diff <- round(mean(AL_df) - mean(NL_df), 2)
# Normal density centred at 0 with the standard deviation of the bootstrapped
# mean differences, with the observed difference marked by a dashed red line.
ggplot() +
  geom_function(
    xlim = c(-1, 1),
    fun = function(x) dnorm(x, mean = 0, sd = sd(bootstapped_mean_diffs))
  ) +
  geom_vline(
    xintercept = Observed_mean_diff,
    color = "red",
    linetype = "dashed"
  ) +
  annotate(
    "text",
    label = paste("Observed mean diff:", Observed_mean_diff),
    color = "red",
    x = Observed_mean_diff + 0.5,
    y = 2.7
  ) +
  labs(
    title = "Bootstrapped Sampling Distribution of RPG differences",
    x = "Difference in RPG Calculated",
    y = "Probability Density",
    color = ""
  ) +
  # seq()'s third positional argument is `by`, so seq(-1, 1, 100) produced a
  # single break at -1. Step by 0.25 to label the whole axis.
  scale_x_continuous(breaks = seq(-1, 1, by = 0.25)) +
  theme_minimal()
# Treat the null hypothesis (mean difference = 0) as true: centre the
# bootstrapped differences on zero.
bootstapped_dmean_diffs <- bootstapped_mean_diffs - mean(bootstapped_mean_diffs)
# Two-sided p-value: share of centred bootstrap differences whose magnitude
# exceeds the magnitude of the observed difference.
paste(
  "p-value ",
  sum(abs(bootstapped_dmean_diffs) > abs(Observed_mean_diff)) /
    length(bootstapped_dmean_diffs)
)
## [1] "p-value 0"
# Cohen's d effect size for the AL vs NL difference in runs per game.
AL_df |> cohen.d(NL_df)
##
## Cohen's d
##
## d estimate: 0.3897959 (small)
## 95 percent confidence interval:
## lower upper
## 0.2353693 0.5442225
library(pwrss)
# Sample-size calculation for a two-sided independent-samples t-test at 85%
# power and alpha = 0.05, using the observed league means and standard
# deviations.
ind_test <- pwrss.t.2means(
  mu1 = mean(AL_df),
  mu2 = mean(NL_df),
  sd1 = sd(AL_df),
  sd2 = sd(NL_df),
  kappa = 1,
  power = .85,
  alpha = 0.05,
  alternative = "not equal"
)
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 0.85
## n1 = 120
## n2 = 120
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 238
## Non-centrality parameter = 3.014
## Type I error rate = 0.05
## Type II error rate = 0.15
# Plot the pwrss result object.
ind_test |> plot()
## Warning in qt(1 - prob.extreme, df = df, ncp = ncp, lower.tail = TRUE): full
## precision may not have been achieved in 'pnt{final}'
library(pwr)
# Sample size per group needed to detect the observed effect with a two-sided
# two-sample t-test at 85% power and alpha = 0.05.
# The effect size is taken from cohen.d() on the data instead of a literal
# copied from earlier output, so it stays in sync with the data.
effect_size <- unname(cohen.d(AL_df, NL_df)$estimate)
alpha <- 0.05
power <- 0.85
independent_test <- pwr.t.test(
  d = effect_size,
  sig.level = alpha,
  power = power,
  alternative = "two.sided"
)
independent_test
##
## Two-sample t test power calculation
##
## n = 119.1509
## d = 0.3897959
## sig.level = 0.05
## power = 0.85
## alternative = two.sided
##
## NOTE: n is number in *each* group
# Label each row by whether its home-run count is above the overall mean
# (HR_AA) or not (HR_BA).
New_lahman_data <- mutate(
  lahman_data,
  Performance = ifelse(HR > mean(HR), "HR_AA", "HR_BA")
)
#View(New_lahman_data)
# Cross-tabulate the performance label against league.
contingency_table <- table(
  New_lahman_data$Performance,
  New_lahman_data$lgID
)
contingency_table
##
## AL NL
## HR_AA 172 150
## HR_BA 145 193
Null Hypothesis: There is no association between a team's league and its home-run hitting performance. HR_AA: home runs above average; HR_BA: home runs at or below average.
# Pearson chi-squared test of independence between performance label and
# league, with the continuity correction applied.
chisq.test(
  x = New_lahman_data$Performance,
  y = New_lahman_data$lgID,
  correct = TRUE
)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: New_lahman_data$Performance and New_lahman_data$lgID
## X-squared = 6.8915, df = 1, p-value = 0.008661
# Fisher's exact test (two-sided) of the same performance-by-league
# association.
fisher.test(
  x = New_lahman_data$Performance,
  y = New_lahman_data$lgID,
  alternative = "two.sided"
)
##
## Fisher's Exact Test for Count Data
##
## data: New_lahman_data$Performance and New_lahman_data$lgID
## p-value = 0.008026
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.109467 2.099918
## sample estimates:
## odds ratio
## 1.525227