library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readxl)
library(pwr)
# Load the UFC fight data and prepare it for analysis in a single pipeline:
#   1. keep only rows where Gender is "MALE"
#      (NOTE: no date restriction is applied anywhere in this step),
#   2. create the response variable TotalAvgSigStrLanded as the sum of the
#      RedAvgSigStrLanded and BlueAvgSigStrLanded columns,
#   3. drop rows where WeightClass or the response variable is missing,
#   4. exclude "Catch Weight" bouts.
dataset <- read_excel("~/Downloads/UFC_Dataset.xls") |>
  filter(Gender == "MALE") |>
  mutate(TotalAvgSigStrLanded = RedAvgSigStrLanded + BlueAvgSigStrLanded) |>
  filter(
    !is.na(WeightClass),
    !is.na(TotalAvgSigStrLanded),
    WeightClass != "Catch Weight"
  )

# Preview the first rows of the cleaned data.
head(dataset)
## # A tibble: 6 × 119
##   RedFighter     BlueFighter RedOdds BlueOdds RedExpectedValue BlueExpectedValue
##   <chr>          <chr>         <dbl>    <dbl>            <dbl>             <dbl>
## 1 Jack Hermanss… Joe Pyfer       205     -250            205                40  
## 2 Dan Ige        Andre Fili     -185      154             54.1             154  
## 3 Robert Bryczek Ihor Potie…    -230      190             43.5             190  
## 4 Brad Tavares   Gregory Ro…     190     -230            190                43.5
## 5 Michael Johns… Darrius Fl…    -155      130             64.5             130  
## 6 Rodolfo Vieira Armen Petr…    -105     -115             95.2              87.0
## # ℹ 113 more variables: Date <dttm>, Location <chr>, Country <chr>,
## #   Winner <chr>, TitleBout <lgl>, WeightClass <chr>, Gender <chr>,
## #   NumberOfRounds <dbl>, BlueCurrentLoseStreak <dbl>,
## #   BlueCurrentWinStreak <dbl>, BlueDraws <dbl>, BlueAvgSigStrLanded <dbl>,
## #   BlueAvgSigStrPct <dbl>, BlueAvgSubAtt <dbl>, BlueAvgTDLanded <dbl>,
## #   BlueAvgTDPct <dbl>, BlueLongestWinStreak <dbl>, BlueLosses <dbl>,
## #   BlueTotalRoundsFought <dbl>, BlueTotalTitleBouts <dbl>, …

Hypothesis #1:

Null: There is no significant difference in mean TotalAvgSigStrLanded between the weight classes.

ALT: There is a significant difference in mean TotalAvgSigStrLanded between at least two of the weight classes.

# One-way ANOVA of TotalAvgSigStrLanded across the levels of WeightClass.
anova_result <- aov(
  formula = TotalAvgSigStrLanded ~ WeightClass,
  data = dataset
)

# Print the ANOVA table (Df, sums of squares, F value, p-value).
summary(anova_result)
##               Df  Sum Sq Mean Sq F value   Pr(>F)    
## WeightClass    7   53091    7584   6.368 1.81e-07 ***
## Residuals   4343 5172716    1191                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation: By observing both the F value and the p-value provided, we can conclude that there is significant evidence to reject the null hypothesis. The high F value indicates that the variation between the weight-class means is large relative to the variation within each weight class, and since the p-value is close to zero, differences this large are very unlikely to be due to chance alone; there is a statistically significant difference in the strikes landed in different weight classes. I was talking to a friend this weekend who is also passionate about UFC and expressing how I was having trouble finding meaningful insights into my data; he made me realize that weight class is likely an important factor that I was not taking into account. This shows the uniqueness of each weight class and how each must be adjusted for and held to different standards.

# Boxplot of TotalAvgSigStrLanded for each WeightClass.
dataset |>
  ggplot(mapping = aes(x = WeightClass, y = TotalAvgSigStrLanded)) +
  geom_boxplot() +
  labs(
    title = "Total Significant Strikes Landed by Weight Class",
    x = "Weight Class",
    y = "Total Significant Strikes Landed"
  ) +
  # Rotate the x-axis labels so the weight-class names do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Interpretation: This graph supports the claims made above and depicts that each weight class has its own typical number of strikes landed (the median line in each box) and its own spread. Not taking this into account and only filtering for males might have prevented us from seeing the whole picture.

Linear Regression Model

Purpose: I want to see if weight has a linear relationship with strikes landed. This will test the hypothesis: does the number of strikes landed matter less as you move up in weight class?

# Simple linear regression of TotalAvgSigStrLanded on RedWeightLbs.
linear_reg <- lm(
  formula = TotalAvgSigStrLanded ~ RedWeightLbs,
  data = dataset
)

# Print the coefficient table, residual summary and fit statistics.
summary(linear_reg)
## 
## Call:
## lm(formula = TotalAvgSigStrLanded ~ RedWeightLbs, data = dataset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.379 -34.927   0.279  26.554 135.270 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  54.17787    2.70253  20.047  < 2e-16 ***
## RedWeightLbs -0.06239    0.01559  -4.002 6.39e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.6 on 4349 degrees of freedom
## Multiple R-squared:  0.003669,   Adjusted R-squared:  0.00344 
## F-statistic: 16.01 on 1 and 4349 DF,  p-value: 6.392e-05

Interpretation: Based on the values above we can conclude that weight does in fact have a statistically significant effect on the number of strikes landed. The coefficient for RedWeightLbs suggests a negative relationship between weight and strikes landed, thus suggesting that heavier weight classes generally land fewer strikes. Note, however, that the R-squared is only about 0.004, so weight on its own explains well under 1% of the variation in strikes landed; the effect is statistically significant but small. Not considering this in my past calculations is likely a contributor as to why it was difficult to find significant relationships between a KPI and victory. A suggestion that can be made from this data is that the higher up in weight you go, the more a fighter should focus on quality over quantity: one great strike is more effective than many okay ones at higher weights.