## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggthemes)
library(ggrepel)
library(effsize)
library(pwrss)

## 
## Attaching package: 'pwrss'
## 
## The following object is masked from 'package:stats':
## 
##     power.t.test

library(dplyr)
library(rmarkdown)
library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

# Read the Lahman CSV and add RPG = runs (R) per game (G), rounded to 2 dp.
# NOTE(review): the path is absolute and machine-specific.
lahman_data <- read.csv(
  "/Users/anuragreddy/Desktop/Statistics with R/Lahmans Databse .csv"
) |>
  mutate(RPG = round(R / G, digits = 2))

1. Null Hypothesis: There is no difference in the mean runs scored per game between American League and National League teams.

# Average runs per game for each league (rounded to 2 dp), lowest average first.
RPG_Lea <- lahman_data |>
  group_by(lgID) |>
  summarise(Avg_RPG = mean(RPG) |> round(digits = 2)) |>
  arrange(Avg_RPG)

RPG_Lea

## # A tibble: 2 × 2
##   lgID  Avg_RPG
##   <chr>   <dbl>
## 1 NL       4.47
## 2 AL       4.67

# Observed AL - NL gap in the (2-dp rounded) average runs per game.
# Rows are selected by league code rather than by position, so the sign of the
# difference does not silently depend on how RPG_Lea happens to be sorted.
Observed_mean_diff <- RPG_Lea$Avg_RPG[RPG_Lea$lgID == "AL"] -
  RPG_Lea$Avg_RPG[RPG_Lea$lgID == "NL"]


# Horizontal box plots of runs per game, one box per league, coloured by lgID.
ggplot(
  data = lahman_data,
  mapping = aes(
    x = RPG,
    y = factor(
      lgID,
      levels = c("AL", "NL"),
      labels = c("American League", "National League")
    ),
    color = lgID
  )
) +
  geom_boxplot() +
  labs(
    title = "League factor on Runs scored per game",
    x = "Runs",
    y = "League Variation"
  ) +
  theme_classic()

Interpretation: The box plot above suggests that AL teams score more runs per game than NL teams. We are interested in determining whether this provides enough evidence against the null hypothesis stated above.

Before performing the independent-samples t-test, we need to check whether our data satisfy its assumptions, because the independent t-test is a parametric test.

Normality test - whether the data in each of our two groups are normally distributed.
Levene’s test - equal variances between the two groups (assessed below by comparing the two groups' standard deviations).

# Runs-per-game values for each league, extracted as plain vectors
# (despite the `_df` suffix these are not data frames).
AL_df <- lahman_data |>
  filter(lgID == "AL") |>
  pluck("RPG")
NL_df <- lahman_data |>
  filter(lgID == "NL") |>
  pluck("RPG")

# Shapiro-Wilk normality test on the AL runs-per-game values.
shapiro.test(AL_df)

## 
##  Shapiro-Wilk normality test
## 
## data:  AL_df
## W = 0.99413, p-value = 0.2607

# Shapiro-Wilk normality test on the NL runs-per-game values.
shapiro.test(NL_df)

## 
##  Shapiro-Wilk normality test
## 
## data:  NL_df
## W = 0.99263, p-value = 0.08908

# Sample standard deviation of AL runs per game (informal equal-variance check).
sd(AL_df)

## [1] 0.5272305

# Sample standard deviation of NL runs per game (informal equal-variance check).
sd(NL_df)

## [1] 0.4808208

Interpretation: The Shapiro–Wilk test was used to assess the normality of the two groups, AL and NL. Both p-values are above 0.05, so we cannot reject normality for either group. The standard deviations are almost the same for both groups.

1. Two independent-samples t-test:

# Pooled-variance (Student) two-sample t-test computed by hand.
n1 <- length(AL_df)
n2 <- length(NL_df)
df <- n1 + n2 - 2 # degrees of freedom of the pooled test
sd1 <- sd(AL_df)
sd2 <- sd(NL_df)

# Pooled variance: the two sample variances weighted by their degrees of freedom.
poolvar <- ((n1 - 1) * sd1^2 + (n2 - 1) * sd2^2) / df

# The statistic uses the unrounded group means. Observed_mean_diff is built from
# means rounded to 2 dp (0.20 instead of ~0.196), which inflates t.
# NOTE(review): the p-value echoed below this chunk was produced with the
# rounded difference and will shift slightly on the next knit.
t_stat <- ((mean(AL_df) - mean(NL_df)) - 0) / sqrt(poolvar / n1 + poolvar / n2)

# Two-sided p-value. Asking pt() for the upper tail directly avoids the loss of
# precision in 1 - pt() when the p-value is very small.
p_val <- 2 * pt(abs(t_stat), df, lower.tail = FALSE)
print(p_val)

## [1] 4.516326e-07

# Built-in two-sample t-test. As the output header shows, the default call runs
# Welch's test, not the pooled-variance test computed by hand above.
t.test(AL_df,NL_df)

## 
##  Welch Two Sample t-test
## 
## data:  AL_df and NL_df
## t = 4.985, df = 639.42, p-value = 7.988e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1189852 0.2736506
## sample estimates:
## mean of x mean of y 
##  4.669117  4.472799

Interpretation: I have used two methods to find the p-value: 1. the traditional calculation by hand using the t-distribution (with a pooled variance), and 2. R's built-in t.test() function (which runs Welch's t-test by default). With both methods the p-value is less than 0.05, so we reject the null hypothesis. We have enough evidence to assert that AL teams score more runs per game than NL teams. In reality, this is true, as there is a league factor associated with MLB.

2. Fisher's style - significance testing.

#' Bootstrap the sampling distribution of a statistic
#'
#' Repeatedly resamples `x` with replacement (each resample has the same length
#' as `x`) and evaluates `func` on every resample.
#'
#' @param x A vector of observations to resample from.
#' @param func A function applied to each resample; it must return a single
#'   numeric value. Defaults to `mean`.
#' @param n_iter Number of bootstrap resamples to draw. Defaults to `10^4`.
#' @return A numeric vector of length `n_iter` holding the bootstrapped values
#'   of `func` (length 0 when `n_iter` is 0).
bootstrap <- function(x, func = mean, n_iter = 10^4) {
  n_obs <- length(x)

  # vapply() allocates the result once instead of growing it with c() on every
  # iteration, and seq_len() yields no iterations at all when n_iter is 0
  # (1:0 would have run the loop twice).
  vapply(
    seq_len(n_iter),
    function(i) {
      # Index with sample.int() rather than calling sample(x): for a length-1
      # numeric x, sample(x) would draw from 1:x instead of from x itself.
      func(x[sample.int(n_obs, replace = TRUE)])
    },
    numeric(1)
  )
}

# Bootstrapped means of runs per game for each league.
# `func` is spelled out in full: `fun =` only reached the `func` parameter
# through R's partial argument matching.
# NOTE(review): 100 resamples limits the bootstrap p-value to steps of 0.01;
# consider bootstrap()'s default of 10^4.
AL_Sample <- bootstrap(AL_df, func = mean, n_iter = 10^2)
NL_Sample <- bootstrap(NL_df, func = mean, n_iter = 10^2)

# With the bootstrapped means of AL and NL in hand, take their element-wise
# difference, and recompute the observed AL - NL mean difference (rounded to
# 2 dp) so that it can be marked on the plot.
Observed_mean_diff <- round(mean(AL_df) - mean(NL_df), digits = 2)
bootstapped_mean_diffs <- AL_Sample - NL_Sample

# Normal curve centred on 0 (the null hypothesis) whose spread is the standard
# deviation of the bootstrapped differences, with the observed difference
# marked by a dashed red line.
ggplot() +
  geom_function(
    xlim = c(-1, 1),
    fun = function(x) dnorm(x, mean = 0, sd = sd(bootstapped_mean_diffs))
  ) +
  geom_vline(
    xintercept = Observed_mean_diff,
    color = "red",
    linetype = "dashed"
  ) +
  annotate(
    "text",
    label = paste("Observed mean diff:", Observed_mean_diff),
    color = "red",
    x = Observed_mean_diff + 0.5,
    y = 2.7
  ) +
  labs(
    title = "Bootstrapped Sampling Distribution of RPG differences",
    x = "Difference in RPG Calculated",
    y = "Probability Density",
    color = ""
  ) +
  # seq(-1, 1, 100) passed 100 as `by` and so produced the single break -1;
  # an explicit step gives evenly spaced breaks across the plotted range.
  scale_x_continuous(breaks = seq(-1, 1, by = 0.25)) +
  theme_minimal()

# Treat the null hypothesis as true by centring the bootstrapped differences on
# 0, then report the share of them that lie further from 0 than the observed
# difference.
bootstapped_dmean_diffs <- bootstapped_mean_diffs - mean(bootstapped_mean_diffs)
paste(
  "p-value ",
  sum(abs(bootstapped_dmean_diffs) > abs(Observed_mean_diff)) /
    length(bootstapped_dmean_diffs)
)

## [1] "p-value  0"

Interpretation: As you can see, none of the bootstrapped differences, once centred on zero, is as extreme as the observed difference, so the estimated p-value is zero. Hence, we have enough evidence to reject the null hypothesis stated above.

\(\alpha\) = 0.05. I have chosen an alpha value of 0.05, indicating my willingness to accept a 5% risk of making an incorrect decision (rejecting a null hypothesis that is actually true), depending on the situation.

3. Effect Size calculation.

# Cohen's d effect size for the AL vs NL difference in runs per game.
cohen.d(AL_df,NL_df)

## 
## Cohen's d
## 
## d estimate: 0.3897959 (small)
## 95 percent confidence interval:
##     lower     upper 
## 0.2353693 0.5442225

Interpretation: Cohen's d is about 0.39, so we have a relatively “small” sized effect.

4. Sample Size calculation by pwrss and pwr

library(pwrss)
# Sample size needed to detect the observed AL/NL mean difference with 85%
# power at alpha = 0.05 (two-sided), using the observed group SDs.
ind_test <- pwrss.t.2means(
  mu1 = mean(AL_df),
  mu2 = mean(NL_df),
  sd1 = sd(AL_df),
  sd2 = sd(NL_df),
  kappa = 1,
  power = .85,
  alpha = 0.05,
  alternative = "not equal"
)

##  Difference between Two means 
##  (Independent Samples t Test) 
##  H0: mu1 = mu2 
##  HA: mu1 != mu2 
##  ------------------------------ 
##   Statistical power = 0.85 
##   n1 = 120 
##   n2 = 120 
##  ------------------------------ 
##  Alternative = "not equal" 
##  Degrees of freedom = 238 
##  Non-centrality parameter = 3.014 
##  Type I error rate = 0.05 
##  Type II error rate = 0.15

# Draw the default plot for the pwrss result object created above.
plot(ind_test)

## Warning in qt(1 - prob.extreme, df = df, ncp = ncp, lower.tail = TRUE): full
## precision may not have been achieved in 'pnt{final}'

library(pwr)

# Take Cohen's d directly from the data instead of pasting the printed value
# (0.3897959), so the power analysis cannot drift out of sync with the data.
effect_size <- unname(cohen.d(AL_df, NL_df)$estimate)
alpha <- 0.05 # significance level
power <- 0.85 # target statistical power

# Leaving `n` unspecified makes pwr.t.test() solve for the per-group sample size.
independent_test <- pwr.t.test(
  d = effect_size,
  sig.level = alpha,
  power = power,
  alternative = "two.sided"
)

independent_test

## 
##      Two-sample t test power calculation 
## 
##               n = 119.1509
##               d = 0.3897959
##       sig.level = 0.05
##           power = 0.85
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

Interpretation: With both methods, the required sample size comes out to around 120 observations for each group. We have more than 300 observations in each of the AL and NL groups, so our sample is large enough.

5. Chi-Square test and Fisher's Exact Test: Association between League and home-run hitting Performance.

# Label each row as above ("HR_AA") or below ("HR_BA") the overall mean number
# of home runs, then cross-tabulate that label against league.
New_lahman_data <- mutate(
  lahman_data,
  Performance = ifelse(HR > mean(HR), "HR_AA", "HR_BA")
)
# View(New_lahman_data)
contingency_table <- table(
  New_lahman_data[["Performance"]],
  New_lahman_data[["lgID"]]
)
contingency_table

##        
##          AL  NL
##   HR_AA 172 150
##   HR_BA 145 193

Null Hypothesis: There is no association between a team's league and its home-run hitting performance. HR_AA: Home Runs Above Average, HR_BA: Home Runs Below Average.

# Pearson chi-squared test of association between Performance and league.
# As the output header shows, Yates' continuity correction is applied.
chisq.test(New_lahman_data$Performance,New_lahman_data$lgID)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  New_lahman_data$Performance and New_lahman_data$lgID
## X-squared = 6.8915, df = 1, p-value = 0.008661

# Fisher's exact test on the same two variables, as a cross-check.
fisher.test(New_lahman_data$Performance,New_lahman_data$lgID)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  New_lahman_data$Performance and New_lahman_data$lgID
## p-value = 0.008026
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.109467 2.099918
## sample estimates:
## odds ratio 
##   1.525227

Interpretation: Both Fisher’s exact test and the chi-square test give a p-value less than 0.05, indicating there is enough evidence to reject the null hypothesis. We can interpret that there is an association between the league a team plays in and its home-run hitting performance. American League teams are more likely than National League teams to hit an above-average number of home runs.

Week - 7 Data Dive

Anurag

2024-02-23

Contents:

Hypothesis Testing (1&2):

1. Two sample mean test - Comparing mean Runs per Game between American League and National League.

2. Similar test in Fisher’s style - significance testing and visualizing the test.

Neyman-Pearson Framework (3&4):

3. Effect size calculation (Cohen’s d)

4. Calculating the minimum sample size required to perform hypothesis testing.

Hypothesis testing between two categorical variables (5):

5. Fisher’s Exact test & Chi-square - association between league and performance of teams. (League, performance - Categorical Variables).