library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readxl)
library(pwr)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Build the modelling table in one pipeline: read the Excel export, keep only
# male fights that have a real weight class (rows with a missing class or
# "Catch Weight" are dropped), then derive the per-fight predictors.
ufc_data <- read_excel("~/Downloads/UFC_Dataset.xls") |>
  filter(Gender == "MALE") |>
  filter(!is.na(WeightClass) & WeightClass != "Catch Weight") |>
  mutate(
    # Each "Total*" column is the red-corner value plus the blue-corner value.
    # NOTE(review): a red + blue sum is the same whichever corner wins, so it
    # describes the fight rather than one fighter -- confirm that is intended.
    TotalStrikesLanded = RedAvgSigStrLanded + BlueAvgSigStrLanded,
    # NOTE(review): named "Attempts" but summed from the *TDLanded columns.
    TotalTakedownAttempts = RedAvgTDLanded + BlueAvgTDLanded,
    TotalSubAttempts = RedAvgSubAtt + BlueAvgSubAtt,
    # Outcome coded from the red corner's side: 1 when Winner is "Red",
    # 0 for any other value, NA when Winner is missing.
    WinnerBinary = as.numeric(Winner == "Red")
  )
# Analyze each weight class separately.
#
# For every weight class present in ufc_data this loop:
#   1. prints the mean of each combined predictor, grouped by Winner;
#   2. fits a logistic regression of WinnerBinary on the three predictors;
#   3. prints the model summary.
#
# `model` and `ufc_reg` are overwritten on every pass, so after the loop they
# hold only the last weight class processed. To keep every fit available for
# later inspection, each model is also stored in `models_by_class`, a list
# named by weight class (e.g. models_by_class[["Middleweight"]]).
weight_classes <- unique(ufc_data$WeightClass)
models_by_class <- setNames(
  vector("list", length(weight_classes)),
  weight_classes
)
for (weight_class in weight_classes) {
  cat("\nAnalyzing weight class:", weight_class, "\n")

  # Restrict to the current class and keep only the columns used below.
  ufc_reg <- ufc_data |>
    filter(WeightClass == weight_class) |>
    select(
      TotalStrikesLanded, TotalTakedownAttempts, TotalSubAttempts,
      WinnerBinary, Winner
    )

  # Compare averages of predictors for winners and losers.
  # Missing values are ignored within each mean (na.rm = TRUE).
  winner_stats <- ufc_reg |>
    group_by(Winner) |>
    summarise(
      AvgStrikesLanded = mean(TotalStrikesLanded, na.rm = TRUE),
      AvgTakedownAttempts = mean(TotalTakedownAttempts, na.rm = TRUE),
      AvgSubmissionAttempts = mean(TotalSubAttempts, na.rm = TRUE)
    )
  print(winner_stats)

  # Build the logistic regression model. glm() drops rows with a missing
  # value in any model variable, so it may use fewer rows than nrow(ufc_reg).
  model <- glm(
    WinnerBinary ~ TotalStrikesLanded + TotalTakedownAttempts + TotalSubAttempts,
    data = ufc_reg,
    family = binomial
  )
  models_by_class[[weight_class]] <- model

  # Summary of the model
  print(summary(model))
}
##
## Analyzing weight class: Middleweight
## # A tibble: 2 × 4
## Winner AvgStrikesLanded AvgTakedownAttempts AvgSubmissionAttempts
## <chr> <dbl> <dbl> <dbl>
## 1 Blue 39.1 2.83 0.964
## 2 Red 39.6 2.73 1.09
##
## Call:
## glm(formula = WinnerBinary ~ TotalStrikesLanded + TotalTakedownAttempts +
## TotalSubAttempts, family = binomial, data = ufc_reg)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.138374 0.204029 0.678 0.4976
## TotalStrikesLanded 0.001179 0.002721 0.433 0.6647
## TotalTakedownAttempts -0.058988 0.048868 -1.207 0.2274
## TotalSubAttempts 0.182987 0.093340 1.960 0.0499 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 847.30 on 615 degrees of freedom
## Residual deviance: 843.04 on 612 degrees of freedom
## (113 observations deleted due to missingness)
## AIC: 851.04
##
## Number of Fisher Scoring iterations: 4
##
##
## Analyzing weight class: Featherweight
## # A tibble: 2 × 4
## Winner AvgStrikesLanded AvgTakedownAttempts AvgSubmissionAttempts
## <chr> <dbl> <dbl> <dbl>
## 1 Blue 47.0 2.78 1.19
## 2 Red 43.8 2.83 1.28
##
## Call:
## glm(formula = WinnerBinary ~ TotalStrikesLanded + TotalTakedownAttempts +
## TotalSubAttempts, family = binomial, data = ufc_reg)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.519971 0.202041 2.574 0.0101 *
## TotalStrikesLanded -0.001978 0.002282 -0.867 0.3860
## TotalTakedownAttempts 0.002317 0.044944 0.052 0.9589
## TotalSubAttempts 0.037183 0.079285 0.469 0.6391
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 752.62 on 565 degrees of freedom
## Residual deviance: 751.45 on 562 degrees of freedom
## (128 observations deleted due to missingness)
## AIC: 759.45
##
## Number of Fisher Scoring iterations: 4
##
##
## Analyzing weight class: Lightweight
## # A tibble: 2 × 4
## Winner AvgStrikesLanded AvgTakedownAttempts AvgSubmissionAttempts
## <chr> <dbl> <dbl> <dbl>
## 1 Blue 47.6 2.96 1.08
## 2 Red 46.6 2.93 1.07
##
## Call:
## glm(formula = WinnerBinary ~ TotalStrikesLanded + TotalTakedownAttempts +
## TotalSubAttempts, family = binomial, data = ufc_reg)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4399454 0.1835269 2.397 0.0165 *
## TotalStrikesLanded -0.0009723 0.0020783 -0.468 0.6399
## TotalTakedownAttempts -0.0139654 0.0361269 -0.387 0.6991
## TotalSubAttempts -0.0289939 0.0819483 -0.354 0.7235
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1133.5 on 832 degrees of freedom
## Residual deviance: 1133.0 on 829 degrees of freedom
## (178 observations deleted due to missingness)
## AIC: 1141
##
## Number of Fisher Scoring iterations: 4
##
##
## Analyzing weight class: Welterweight
## # A tibble: 2 × 4
## Winner AvgStrikesLanded AvgTakedownAttempts AvgSubmissionAttempts
## <chr> <dbl> <dbl> <dbl>
## 1 Blue 48.8 2.70 1.01
## 2 Red 45.4 2.73 1.06
##
## Call:
## glm(formula = WinnerBinary ~ TotalStrikesLanded + TotalTakedownAttempts +
## TotalSubAttempts, family = binomial, data = ufc_reg)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.313253 0.174625 1.794 0.0728 .
## TotalStrikesLanded -0.002860 0.002126 -1.345 0.1786
## TotalTakedownAttempts -0.003256 0.040974 -0.079 0.9367
## TotalSubAttempts 0.059434 0.078987 0.752 0.4518
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1086.1 on 790 degrees of freedom
## Residual deviance: 1083.5 on 787 degrees of freedom
## (174 observations deleted due to missingness)
## AIC: 1091.5
##
## Number of Fisher Scoring iterations: 4
##
##
## Analyzing weight class: Light Heavyweight
## # A tibble: 2 × 4
## Winner AvgStrikesLanded AvgTakedownAttempts AvgSubmissionAttempts
## <chr> <dbl> <dbl> <dbl>
## 1 Blue 40.2 2.20 0.731
## 2 Red 43.9 2.54 0.838
##
## Call:
## glm(formula = WinnerBinary ~ TotalStrikesLanded + TotalTakedownAttempts +
## TotalSubAttempts, family = binomial, data = ufc_reg)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.237892 0.229599 -1.036 0.300
## TotalStrikesLanded 0.003887 0.003255 1.194 0.232
## TotalTakedownAttempts 0.098798 0.063804 1.548 0.122
## TotalSubAttempts 0.131822 0.140088 0.941 0.347
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 544.93 on 397 degrees of freedom
## Residual deviance: 538.59 on 394 degrees of freedom
## (73 observations deleted due to missingness)
## AIC: 546.59
##
## Number of Fisher Scoring iterations: 4
##
##
## Analyzing weight class: Bantamweight
## # A tibble: 2 × 4
## Winner AvgStrikesLanded AvgTakedownAttempts AvgSubmissionAttempts
## <chr> <dbl> <dbl> <dbl>
## 1 Blue 44.8 2.85 1.06
## 2 Red 41.9 2.94 1.02
##
## Call:
## glm(formula = WinnerBinary ~ TotalStrikesLanded + TotalTakedownAttempts +
## TotalSubAttempts, family = binomial, data = ufc_reg)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.550870 0.213296 2.583 0.0098 **
## TotalStrikesLanded -0.001994 0.002369 -0.841 0.4001
## TotalTakedownAttempts 0.013040 0.041689 0.313 0.7544
## TotalSubAttempts -0.089819 0.107388 -0.836 0.4029
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 674.03 on 500 degrees of freedom
## Residual deviance: 672.61 on 497 degrees of freedom
## (116 observations deleted due to missingness)
## AIC: 680.61
##
## Number of Fisher Scoring iterations: 4
##
##
## Analyzing weight class: Flyweight
## # A tibble: 2 × 4
## Winner AvgStrikesLanded AvgTakedownAttempts AvgSubmissionAttempts
## <chr> <dbl> <dbl> <dbl>
## 1 Blue 43.5 3.26 1.34
## 2 Red 42.8 3.31 1.38
##
## Call:
## glm(formula = WinnerBinary ~ TotalStrikesLanded + TotalTakedownAttempts +
## TotalSubAttempts, family = binomial, data = ufc_reg)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.3648132 0.2907575 1.255 0.210
## TotalStrikesLanded -0.0006556 0.0033819 -0.194 0.846
## TotalTakedownAttempts 0.0284400 0.0648408 0.439 0.661
## TotalSubAttempts 0.0107107 0.1060749 0.101 0.920
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 349.27 on 260 degrees of freedom
## Residual deviance: 349.01 on 257 degrees of freedom
## (48 observations deleted due to missingness)
## AIC: 357.01
##
## Number of Fisher Scoring iterations: 4
##
##
## Analyzing weight class: Heavyweight
## # A tibble: 2 × 4
## Winner AvgStrikesLanded AvgTakedownAttempts AvgSubmissionAttempts
## <chr> <dbl> <dbl> <dbl>
## 1 Blue 37.3 1.75 0.655
## 2 Red 35.3 1.97 0.732
##
## Call:
## glm(formula = WinnerBinary ~ TotalStrikesLanded + TotalTakedownAttempts +
## TotalSubAttempts, family = binomial, data = ufc_reg)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.330141 0.211126 1.564 0.118
## TotalStrikesLanded -0.001634 0.003508 -0.466 0.641
## TotalTakedownAttempts 0.037331 0.059791 0.624 0.532
## TotalSubAttempts 0.064358 0.128278 0.502 0.616
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 519.80 on 384 degrees of freedom
## Residual deviance: 518.41 on 381 degrees of freedom
## (77 observations deleted due to missingness)
## AIC: 526.41
##
## Number of Fisher Scoring iterations: 4
Interpretation: From the analysis above I can suggest which
statistics influence victory the most in different weight classes. Many
of the weight classes showed no significant association
between the KPIs we chose and victory. Total strikes landed and
takedowns landed were not found to be significant in any of the
weight classes. Submission attempts were found to be significant only in
the middleweight division. This suggests that I should look towards
different factors like age, reach, and experience to possibly find a
trend, or maybe it is a combination of these factors that would have a
significant influence on the result. Next, I will do a similar
analysis, just with height, reach, and age differentials.
Questions: Am I doing the winner filtering correctly? I am getting a
bit confused doing it.
# Diagnostic plots for the fit currently bound to `model`.
# NOTE(review): if `model` is reassigned once per weight class before this
# point, these plots describe only the last class fitted, not all of them.
# Residuals are the deviance residuals (the default type for a glm fit).

#1. Residuals vs Fitted Values
plot(
  fitted(model), resid(model, type = "deviance"),
  xlab = "Fitted Values", ylab = "Residuals",
  main = "Residuals vs Fitted Values"
)
abline(h = 0, col = "red")

#2. Residual Histogram
hist(
  resid(model, type = "deviance"),
  main = "Residual Histogram", xlab = "Residuals", breaks = 20
)

#3. Q-Q Plot
qqnorm(resid(model, type = "deviance"))
qqline(resid(model, type = "deviance"), col = "red")

#4. Cook's Distance by Observation
cooksD <- cooks.distance(model)
plot(
  seq_along(cooksD), cooksD,
  type = "h", main = "Cook's Distance by Observation",
  xlab = "Observation", ylab = "Cook's Distance"
)
# Rule-of-thumb cut-off 4 / (n - k - 1), with k the number of predictors.
# n is the number of rows glm() actually used (nobs), not nrow() of the input
# data, because rows with missing values are dropped before fitting.
# length(coef(model)) is k + 1 (the predictors plus the intercept).
abline(h = 4 / (nobs(model) - length(coef(model))), col = "red")

#5.Scale-Location Plot
plot(
  fitted(model), sqrt(abs(resid(model, type = "deviance"))),
  xlab = "Fitted Values", ylab = "Square Root of |Residuals|",
  main = "Scale-Location Plot"
)
abline(h = 0, col = "red")

Interpretation: In the residuals vs. fitted values plot, the
residuals should be evenly dispersed around zero; as we can see, this is
not the case, suggesting a non-linear relationship. The residual
histogram checks for a normal distribution; as depicted in our graph
above, it is almost the opposite of the traditional bell
curve. The Q-Q plot also checks for a normal distribution: residuals
should fall roughly along the reference line, and in our graph this is not
the case. The Cook’s distance plot identifies influential data points; we can
observe a substantial number of points above the reference line,
likely indicative of the unpredictable and wild nature of fighting. The
scale-location plot is used to check whether the residuals have a constant
spread across the fitted values. The lines in our graph diverge,
suggesting that the residuals may have an issue. It seems like none of
the plots we made were even close to looking like what they were
supposed to look like. (Note that these checks are designed for linear
regression; with a binary outcome the residuals of a logistic model
naturally split into two bands, so some of this departure is expected.)
This suggests that winning in MMA cannot be
simply attributed to one statistic alone; it doesn’t seem like
exceptional performance in one metric can give you consistent victory.
Next we will look to see if differences in build and age factor into
victory.