library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readxl)
library(pwr)
dataset <- read_excel("~/Downloads/UFC_Dataset.xls")

#Filtering Dataset for fights in the Men's Division after 2020
dataset<-dataset |>
  filter(Gender == "MALE")

#Creating Columns for Winners and Losers
dataset <- dataset |>
  mutate(
    red_winner = ifelse(Winner == "Red", TRUE, FALSE),
    blue_winner = ifelse(Winner == "Blue", TRUE, FALSE)
  )
winners <- dataset |>
  filter(red_winner == TRUE | blue_winner == TRUE)

losers <- dataset |> 
  filter(red_winner == FALSE | blue_winner == FALSE)

Hypothesis & Null Hypothesis:

1.) Is there a significant difference in strikes landed between winners and losers of a fight?

-Null: There is no significant difference in the number of strikes landed between winners and losers of a fights.

2.) Is there a significant difference in takedowns landed between winners and losers of a fight?

-Null: There is no significant difference in the number of takedowns landed between winners and losers of a fight.

Testing

Hypothesis #1
#Sample Size
sample_size_str <- pwr.t.test(
  d = 0.5,               #Using medium sized effect size, a medium sized difference is significant enough might change to a smalelr size since every punch counts
  sig.level = 0.05,      #Most common significane level
  power = 0.95,          #I made this .95 because I wanted the most chance of finding a trend 
  type = "two.sample"   #Since we are comparing winners and losers
)
#T-Test for strikes landed between winners and losers
t_test_strikes <- t.test(
  c(winners$RedAvgSigStrLanded, winners$BlueAvgSigStrLanded),
  c(losers$RedAvgSigStrLanded, losers$BlueAvgSigStrLanded),
  alternative = "two.sided",
  var.equal =TRUE
)
print(sample_size_str)
## 
##      Two-sample t test power calculation 
## 
##               n = 104.9279
##               d = 0.5
##       sig.level = 0.05
##           power = 0.95
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
print(t_test_strikes)
## 
##  Two Sample t-test
## 
## data:  c(winners$RedAvgSigStrLanded, winners$BlueAvgSigStrLanded) and c(losers$RedAvgSigStrLanded, losers$BlueAvgSigStrLanded)
## t = 0, df = 18832, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.5554242  0.5554242
## sample estimates:
## mean of x mean of y 
##  22.25316  22.25316
Hypothesis #2
sample_size_tds <- pwr.t.test(
  d = 0.3,               #Using medium sized effect size, a medium sized difference is significant enough might change to a smalelr size since every punch counts
  sig.level = 0.05,      #Most common significane level
  power = 0.95,          #I made this .95 because I wanted the most chance of finding a trend 
  type = "two.sample"  ) #Since we are comparing winners and losers
print(sample_size_tds)
## 
##      Two-sample t test power calculation 
## 
##               n = 289.7353
##               d = 0.3
##       sig.level = 0.05
##           power = 0.95
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
#T-Test for takedowns landed between winner and losers
t_test_tds <- t.test(
  c(winners$RedAvgTDLanded, winners$BlueAvgTDLanded),
  c(losers$RedAvgTDLanded, losers$BlueAvgTDLanded),
  alternative = "two.sided",
  var.equal = TRUE
)

# Print the T-test result
print(t_test_tds)
## 
##  Two Sample t-test
## 
## data:  c(winners$RedAvgTDLanded, winners$BlueAvgTDLanded) and c(losers$RedAvgTDLanded, losers$BlueAvgTDLanded)
## t = 0, df = 19158, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.03799138  0.03799138
## sample estimates:
## mean of x mean of y 
##  1.357859  1.357859

Hypothesis #1:

-It seems that there is no reason to reject the null hypothesis, the tests above suggest that I have enough data to find a correlation given that it exists but based on the p-value, which is much higher than the alpha level, given the t-tests it can be suggested that there is no significant difference between these strikes landed between the losers and winners of fights. This is unexpected, I thought that I would be able to find some sort of difference but the numbers are almost exactly the same. I fear that correlating winning with one statistics will not be enough information to provide useful insight, as I already knew and as the numbers are now proposing, fighting is not that simple.

Hypothesis #2:

-The p-value here suggests that there is no evidence against the null hypothesis, additionally, the confidence interval includes zero which further cements our claim to accept the null hypothesis.

Visualization for Hypothesis #1:

#Creating Dataframe for bar charts
data <- tibble(
  Group = c ("Winners", "Winnners", "Losers", "Losers"),
  Metric = c("Strikes", "Takedowns", "Strikes", "Takedowns"),
  Average = c(22.25316, 1.357859, 22.25316, 1.357859)
)
strikes_data <- subset(data, Metric == "Strikes")

ggplot(data = strikes_data, aes(x = Group, y = Average, fill = Group)) +
  geom_bar(stat = "identity", alpha = 0.7) +
  scale_fill_manual(values = c("blue", "red")) +
  ggtitle("Comparison of Average Strikes Landed between Winners and Losers") +
  xlab("Group") +
  ylab("Average Strikes Landed") +
  theme_minimal() +
  theme(legend.position = "none") +
  geom_text(aes(label = round(Average, 2)), vjust = -0.5)

Visualization for Hypothesis #2

takedowns_data <- subset(data, Metric == "Takedowns")

# Create a bar plot for takedowns
ggplot(data = takedowns_data, aes(x = Group, y = Average, fill = Group)) +
  geom_bar(stat = "identity", alpha = 0.7) +
  scale_fill_manual(values = c("blue", "red")) +
  ggtitle("Comparison of Average Takedowns Landed between Winners and Losers") +
  xlab("Group") +
  ylab("Average Takedowns Landed") +
  theme_minimal() +
  theme(legend.position = "none") +
  geom_text(aes(label = round(Average, 2)), vjust = -0.5)

Help:

I must have gone wrong somewhere with how I separated or combined the data, it doesn’t seem likely that both would be exactly the same. Can you help me identify the problem?