library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(dplyr)
library(ggplot2)
library(readxl)
library(pwr)

# Load the fight records, keep men's bouts in a named weight class (Catch
# Weight bouts are excluded), and encode the outcome for logistic regression.
ufc_data <- read_excel("~/Downloads/UFC_Dataset.xls") |>
  filter(Gender == "MALE") |>
  filter(!is.na(WeightClass) & WeightClass != "Catch Weight") |>
  mutate(
    # 1 when Winner is "Red", 0 for any other value, NA when Winner is missing
    WinnerBinary = as.numeric(Winner == "Red")
  )

# Fit one logistic regression per weight class, visiting the classes in the
# order in which they first appear in the data.
weight_classes <- unique(ufc_data$WeightClass)

for (weight_class in weight_classes) {
  cat("\nAnalyzing weight class:", weight_class, "\n")

  # Rows for this weight class only, reduced to the three modelling columns
  in_class <- ufc_data$WeightClass == weight_class
  ufc_model_data <- ufc_data[in_class, c("ReachDif", "AgeDif", "WinnerBinary")]

  # Model the Red-corner win indicator on the reach and age differences
  model <- glm(
    WinnerBinary ~ ReachDif + AgeDif,
    family = binomial,
    data = ufc_model_data
  )
  print(summary(model))

  # 95% confidence intervals for the coefficients (confint() profiles the
  # likelihood, hence the "Waiting for profiling to be done..." message)
  conf_interval <- confint(model)
  print(conf_interval)
}
## 
## Analyzing weight class: Middleweight 
## 
## Call:
## glm(formula = WinnerBinary ~ ReachDif + AgeDif, family = binomial, 
##     data = ufc_model_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.214551   0.075112   2.856  0.00428 **
## ReachDif    -0.017672   0.009195  -1.922  0.05461 . 
## AgeDif      -0.012544   0.014743  -0.851  0.39489   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1002.03  on 728  degrees of freedom
## Residual deviance:  996.93  on 726  degrees of freedom
## AIC: 1002.9
## 
## Number of Fisher Scoring iterations: 4
## Waiting for profiling to be done...
##                   2.5 %       97.5 %
## (Intercept)  0.06764782 0.3622197703
## ReachDif    -0.03581043 0.0002802113
## AgeDif      -0.04153042 0.0163388490
## 
## Analyzing weight class: Featherweight 
## 
## Call:
## glm(formula = WinnerBinary ~ ReachDif + AgeDif, family = binomial, 
##     data = ufc_model_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.460625   0.079129   5.821 5.84e-09 ***
## ReachDif    -0.017168   0.009834  -1.746 0.080858 .  
## AgeDif      -0.062059   0.017487  -3.549 0.000387 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 928.53  on 693  degrees of freedom
## Residual deviance: 910.29  on 691  degrees of freedom
## AIC: 916.29
## 
## Number of Fisher Scoring iterations: 4
## Waiting for profiling to be done...
##                   2.5 %        97.5 %
## (Intercept)  0.30644642  0.6168034875
## ReachDif    -0.03706758  0.0005142276
## AgeDif      -0.09671205 -0.0280757656
## 
## Analyzing weight class: Lightweight 
## 
## Call:
## glm(formula = WinnerBinary ~ ReachDif + AgeDif, family = binomial, 
##     data = ufc_model_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.356282   0.064151   5.554  2.8e-08 ***
## ReachDif    -0.003570   0.008131  -0.439   0.6606    
## AgeDif      -0.030801   0.013246  -2.325   0.0201 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1371.1  on 1010  degrees of freedom
## Residual deviance: 1365.2  on 1008  degrees of freedom
## AIC: 1371.2
## 
## Number of Fisher Scoring iterations: 4
## Waiting for profiling to be done...
##                   2.5 %       97.5 %
## (Intercept)  0.23099049  0.482545670
## ReachDif    -0.01953952  0.012364723
## AgeDif      -0.05690991 -0.004937326
## 
## Analyzing weight class: Welterweight 
## 
## Call:
## glm(formula = WinnerBinary ~ ReachDif + AgeDif, family = binomial, 
##     data = ufc_model_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.278224   0.065510   4.247 2.17e-05 ***
## ReachDif    -0.014299   0.008034  -1.780   0.0751 .  
## AgeDif      -0.011689   0.012863  -0.909   0.3635    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1319.9  on 964  degrees of freedom
## Residual deviance: 1315.6  on 962  degrees of freedom
## AIC: 1321.6
## 
## Number of Fisher Scoring iterations: 4
## Waiting for profiling to be done...
##                   2.5 %      97.5 %
## (Intercept)  0.15017859 0.407068572
## ReachDif    -0.03013047 0.001396362
## AgeDif      -0.03696039 0.013506773
## 
## Analyzing weight class: Light Heavyweight 
## 
## Call:
## glm(formula = WinnerBinary ~ ReachDif + AgeDif, family = binomial, 
##     data = ufc_model_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.22481    0.09546   2.355   0.0185 *  
## ReachDif    -0.04643    0.01127  -4.118 3.81e-05 ***
## AgeDif      -0.03345    0.01677  -1.995   0.0461 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.02  on 470  degrees of freedom
## Residual deviance: 620.75  on 468  degrees of freedom
## AIC: 626.75
## 
## Number of Fisher Scoring iterations: 4
## Waiting for profiling to be done...
##                   2.5 %        97.5 %
## (Intercept)  0.03806720  0.4125742405
## ReachDif    -0.06899186 -0.0247123720
## AgeDif      -0.06663783 -0.0007744874
## 
## Analyzing weight class: Bantamweight 
## 
## Call:
## glm(formula = WinnerBinary ~ ReachDif + AgeDif, family = binomial, 
##     data = ufc_model_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.417907   0.082620   5.058 4.23e-07 ***
## ReachDif     0.004860   0.009612   0.506    0.613    
## AgeDif      -0.022834   0.015893  -1.437    0.151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 829.84  on 616  degrees of freedom
## Residual deviance: 827.57  on 614  degrees of freedom
## AIC: 833.57
## 
## Number of Fisher Scoring iterations: 4
## Waiting for profiling to be done...
##                   2.5 %      97.5 %
## (Intercept)  0.25682010 0.580872035
## ReachDif    -0.01399037 0.023746863
## AgeDif      -0.05415514 0.008239373
## 
## Analyzing weight class: Flyweight 
## 
## Call:
## glm(formula = WinnerBinary ~ ReachDif + AgeDif, family = binomial, 
##     data = ufc_model_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.437172   0.117135   3.732  0.00019 ***
## ReachDif     0.004964   0.017129   0.290  0.77197    
## AgeDif      -0.023576   0.023457  -1.005  0.31486    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 414.59  on 308  degrees of freedom
## Residual deviance: 413.50  on 306  degrees of freedom
## AIC: 419.5
## 
## Number of Fisher Scoring iterations: 4
## Waiting for profiling to be done...
##                   2.5 %     97.5 %
## (Intercept)  0.20932729 0.66902865
## ReachDif    -0.02867570 0.03867288
## AgeDif      -0.07003136 0.02218537
## 
## Analyzing weight class: Heavyweight 
## 
## Call:
## glm(formula = WinnerBinary ~ ReachDif + AgeDif, family = binomial, 
##     data = ufc_model_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.401025   0.096474   4.157 3.23e-05 ***
## ReachDif    -0.033811   0.009769  -3.461 0.000538 ***
## AgeDif       0.004389   0.016638   0.264 0.791921    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 621.21  on 461  degrees of freedom
## Residual deviance: 607.67  on 459  degrees of freedom
## AIC: 613.67
## 
## Number of Fisher Scoring iterations: 4
## Waiting for profiling to be done...
##                   2.5 %      97.5 %
## (Intercept)  0.21295253  0.59143204
## ReachDif    -0.05327294 -0.01501337
## AgeDif      -0.02823718  0.03712179
  # Diagnostic plots for the logistic regression currently held in `model`.
  # NOTE(review): this code runs after the loop's closing brace, so `model` is
  # the last fit the loop produced; the plots describe that one weight class
  # only. Move this inside the loop (or wrap it in a function called per class)
  # to check every model.

  # Take predictors from the model frame rather than from ufc_model_data:
  # glm() drops rows with missing values, and the model frame holds exactly the
  # rows that were fitted, so predictors and residuals always line up.
  # (Assumes the default na.action, na.omit -- TODO confirm.)
  diag_data <- model.frame(model)
  diag_resid <- resid(model) # deviance residuals, the default for a glm
  diag_fitted <- fitted(model)

  # Residuals vs Fitted Values
  plot(
    diag_fitted, diag_resid,
    main = "Residuals vs Fitted Values",
    xlab = "Fitted Values", ylab = "Residuals"
  )
  abline(h = 0, col = "red")

  # Residuals vs Predictor Variables
  plot(
    diag_data$ReachDif, diag_resid,
    xlab = "Reach Difference", ylab = "Residuals",
    main = "Residuals vs Reach Difference"
  )
  abline(h = 0, col = "red")

  plot(
    diag_data$AgeDif, diag_resid,
    xlab = "Age Difference", ylab = "Residuals",
    main = "Residuals vs Age Difference"
  )
  abline(h = 0, col = "red")

  # Q-Q Plot
  # Deviance residuals from a 0/1 response are not expected to follow a normal
  # distribution, so departures from the line here are weak evidence at best.
  qqnorm(diag_resid, main = "Normal Q-Q Plot")
  qqline(diag_resid, col = "red")

  # Cook's Distance by Observation
  cooksD <- cooks.distance(model)
  plot(
    seq_along(cooksD), cooksD,
    type = "h",
    main = "Cook's Distance by Observation",
    xlab = "Observation", ylab = "Cook's Distance"
  )
  # Rule-of-thumb cutoff 4 / (n - k - 1), with n fitted observations and k
  # predictors; length(coef(model)) already counts the intercept (k + 1).
  abline(h = 4 / (nobs(model) - length(coef(model))), col = "red")

Interpretations

Flyweight

- Neither reach nor age seems to have a statistically significant effect on victory.

Bantamweight

- Neither reach nor age seems to have a statistically significant effect on victory.

Featherweight

- The coefficient for reach is marginally significant (p = .081), suggesting that a reach difference may affect winning in this weight class. However, the confidence interval includes zero, so we should be skeptical of this relationship. Age is significant in this weight class and favors the younger fighter.

Lightweight

- Reach appears to be insignificant in this weight class, as suggested by the p-value and the fact that the confidence interval includes zero. Age, however, is significant in this weight class; this is supported by both the p-value and the confidence interval, suggesting that younger fighters have an advantage.

Welterweight

- Reach is marginally significant (p = .075), but its confidence interval includes zero, so it is not significant at the .05 level. Age is not significant: the p-value is .364 and the confidence interval includes zero.

Middleweight

- Results suggest a weak relationship between having a reach advantage and victory. However, the p-value (.055) is not quite below .05, indicating that the effect is only weakly or marginally significant. The confidence interval also includes zero, so we should be skeptical of this relationship. The p-value and confidence interval indicate that age is not a statistically significant factor in winning a fight in this weight class.

Light Heavyweight

- Reach is highly significant in this division, suggesting that having a larger reach aids victory in this weight class; the confidence interval, which excludes zero, supports this as well. Age is also significant, with a p-value below .05 and a confidence interval that excludes zero, suggesting the younger fighter has an edge in this division.

Heavyweight

- Reach seems to be very significant with regard to victory in this weight class. The p-value is remarkably low, and the confidence interval also suggests that the relationship is strong. Age, however, does not seem to have any effect on winning in this weight class; this is supported by both the p-value and the confidence interval.

Diagnostics

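- The Cook’s distance plot and Q-Q plot suggest that we should be doubtful, and that some results reported as significant may not actually be. Further analysis may be needed to confirm significance. Note that the diagnostic plots are produced after the loop, so they reflect only the last model fitted (Heavyweight).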