library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readxl)
library(pwr)
dataset <- read_excel("~/Downloads/UFC_Dataset.xls")
#Filtering Dataset for fights in the Men's Division after 2020
dataset<-dataset |>
filter(Gender == "MALE")
#Creating Columns for Winners and Losers
dataset <- dataset |>
mutate(
red_winner = ifelse(Winner == "Red", TRUE, FALSE),
blue_winner = ifelse(Winner == "Blue", TRUE, FALSE)
)
winners <- dataset |>
filter(red_winner == TRUE | blue_winner == TRUE)
losers <- dataset |>
filter(red_winner == FALSE | blue_winner == FALSE)
1.) Is there a significant difference in strikes landed between winners and losers of a fight?
-Null: There is no significant difference in the number of strikes landed between winners and losers of a fights.
2.) Is there a significant difference in takedowns landed between winners and losers of a fight?
-Null: There is no significant difference in the number of takedowns landed between winners and losers of a fight.
#Sample Size
sample_size_str <- pwr.t.test(
d = 0.5, #Using medium sized effect size, a medium sized difference is significant enough might change to a smalelr size since every punch counts
sig.level = 0.05, #Most common significane level
power = 0.95, #I made this .95 because I wanted the most chance of finding a trend
type = "two.sample" #Since we are comparing winners and losers
)
#T-Test for strikes landed between winners and losers
t_test_strikes <- t.test(
c(winners$RedAvgSigStrLanded, winners$BlueAvgSigStrLanded),
c(losers$RedAvgSigStrLanded, losers$BlueAvgSigStrLanded),
alternative = "two.sided",
var.equal =TRUE
)
print(sample_size_str)
##
## Two-sample t test power calculation
##
## n = 104.9279
## d = 0.5
## sig.level = 0.05
## power = 0.95
## alternative = two.sided
##
## NOTE: n is number in *each* group
print(t_test_strikes)
##
## Two Sample t-test
##
## data: c(winners$RedAvgSigStrLanded, winners$BlueAvgSigStrLanded) and c(losers$RedAvgSigStrLanded, losers$BlueAvgSigStrLanded)
## t = 0, df = 18832, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.5554242 0.5554242
## sample estimates:
## mean of x mean of y
## 22.25316 22.25316
sample_size_tds <- pwr.t.test(
d = 0.3, #Using medium sized effect size, a medium sized difference is significant enough might change to a smalelr size since every punch counts
sig.level = 0.05, #Most common significane level
power = 0.95, #I made this .95 because I wanted the most chance of finding a trend
type = "two.sample" ) #Since we are comparing winners and losers
print(sample_size_tds)
##
## Two-sample t test power calculation
##
## n = 289.7353
## d = 0.3
## sig.level = 0.05
## power = 0.95
## alternative = two.sided
##
## NOTE: n is number in *each* group
#T-Test for takedowns landed between winner and losers
t_test_tds <- t.test(
c(winners$RedAvgTDLanded, winners$BlueAvgTDLanded),
c(losers$RedAvgTDLanded, losers$BlueAvgTDLanded),
alternative = "two.sided",
var.equal = TRUE
)
# Print the T-test result
print(t_test_tds)
##
## Two Sample t-test
##
## data: c(winners$RedAvgTDLanded, winners$BlueAvgTDLanded) and c(losers$RedAvgTDLanded, losers$BlueAvgTDLanded)
## t = 0, df = 19158, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.03799138 0.03799138
## sample estimates:
## mean of x mean of y
## 1.357859 1.357859
-It seems that there is no reason to reject the null hypothesis, the tests above suggest that I have enough data to find a correlation given that it exists but based on the p-value, which is much higher than the alpha level, given the t-tests it can be suggested that there is no significant difference between these strikes landed between the losers and winners of fights. This is unexpected, I thought that I would be able to find some sort of difference but the numbers are almost exactly the same. I fear that correlating winning with one statistics will not be enough information to provide useful insight, as I already knew and as the numbers are now proposing, fighting is not that simple.
-The p-value here suggests that there is no evidence against the null hypothesis, additionally, the confidence interval includes zero which further cements our claim to accept the null hypothesis.
#Creating Dataframe for bar charts
data <- tibble(
Group = c ("Winners", "Winnners", "Losers", "Losers"),
Metric = c("Strikes", "Takedowns", "Strikes", "Takedowns"),
Average = c(22.25316, 1.357859, 22.25316, 1.357859)
)
strikes_data <- subset(data, Metric == "Strikes")
ggplot(data = strikes_data, aes(x = Group, y = Average, fill = Group)) +
geom_bar(stat = "identity", alpha = 0.7) +
scale_fill_manual(values = c("blue", "red")) +
ggtitle("Comparison of Average Strikes Landed between Winners and Losers") +
xlab("Group") +
ylab("Average Strikes Landed") +
theme_minimal() +
theme(legend.position = "none") +
geom_text(aes(label = round(Average, 2)), vjust = -0.5)
takedowns_data <- subset(data, Metric == "Takedowns")
# Create a bar plot for takedowns
ggplot(data = takedowns_data, aes(x = Group, y = Average, fill = Group)) +
geom_bar(stat = "identity", alpha = 0.7) +
scale_fill_manual(values = c("blue", "red")) +
ggtitle("Comparison of Average Takedowns Landed between Winners and Losers") +
xlab("Group") +
ylab("Average Takedowns Landed") +
theme_minimal() +
theme(legend.position = "none") +
geom_text(aes(label = round(Average, 2)), vjust = -0.5)
I must have gone wrong somewhere with how I separated or combined the data, it doesn’t seem likely that both would be exactly the same. Can you help me identify the problem?