library(tidyverse)
setwd("~/Nassor/MC/Data101")
boxing <- read_csv("boxing_pay_data.csv")
## Rows: 4670 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Boxer, Date, Venue
## dbl (24): Purse, lnRPurse, weight, Age, Wins, Losses, KO, W-Title, PPV, ESPN...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(boxing)
## [1] 4670 27
head(boxing)
## # A tibble: 6 × 27
## Boxer Date Venue Purse lnRPurse weight Age Wins Losses KO `W-Title`
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Gamboa, … 2009… Buff… 37998 10.7 126 27.1 12 0 10 0
## 2 Gonzalez… 2009… Buff… 20000 10.0 127 30.7 27 2 18 0
## 3 Lara, Er… 2009… Buff… 3000 8.14 154 25.8 2 0 1 0
## 4 Bogere, … 2009… Buff… 1500 7.45 135 20.2 4 0 2 0
## 5 Frazier,… 2009… Buff… 1000 7.04 139 30.9 1 0 1 0
## 6 Gonzalez… 2009… Buff… 1500 7.45 146 31.5 2 1 1 0
## # ℹ 16 more variables: PPV <dbl>, ESPN <dbl>, HBO <dbl>, FOX <dbl>,
## # TopRank <dbl>, GoldenBoy <dbl>, RDS <dbl>, Y2009 <dbl>, Y2010 <dbl>,
## # Y2011 <dbl>, Y2012 <dbl>, Y2013 <dbl>, Y2014 <dbl>, Y2015 <dbl>,
## # Y2016 <dbl>, Y2017 <dbl>
boxing_venue <- boxing |>
group_by(Venue) |>
count() |>
arrange(desc(n))|>
head(10)
boxing_venue
## # A tibble: 10 × 2
## # Groups: Venue [10]
## Venue n
## <chr> <int>
## 1 MGM Grand 313
## 2 Doubletree Hotel 272
## 3 Fantasy Springs 268
## 4 Hard Rock 261
## 5 Mandalay Bay 185
## 6 StubHub 176
## 7 MGM Arena 161
## 8 CosmoLV 160
## 9 Texas Station 122
## 10 Belasco Theater 112
options(scipen = 999)
boxer_count <- boxing |>
group_by(Boxer) |>
summarize(count = n(), mean_purse = mean(Purse), max_purse = max(Purse), mean_win = mean(Wins), mean_KO = mean(KO)) |>
arrange(desc(count)) |>
filter(count >= 10)
boxer_count
## # A tibble: 57 × 6
## Boxer count mean_purse max_purse mean_win mean_KO
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Magdaleno, Jessie 21 17576. 155000 11.5 8.52
## 2 Chavez, Joaquin R. 18 3000 6000 4.39 1.78
## 3 Vargas, Jessie 18 227028. 2800000 16.6 7.39
## 4 Magdaleno, Diego 17 11788. 25000 16.2 5.24
## 5 Diaz, Joseph Jnr. 16 32250 200000 11.1 6.75
## 6 Bogere, Sharif 15 11647. 75000 16.1 10.3
## 7 De La Hoya, Diego 15 13967. 80000 7.4 4.8
## 8 Gutierrez, Jesus A. 15 3343. 10000 7.2 2.27
## 9 Breazeale, Dominic 14 10464. 50000 7 6.64
## 10 Gavril, Ronald 14 21143. 125000 10.9 8.29
## # ℹ 47 more rows
# count head(50) or count >= 10
Alvarez, Saul C. Benavidez, David Broner, Adrien J. Crawford, Terence Davis, Gervonta Duran, Roberto Garcia, Ryan Mayweather, Floyd Pacquiao, Emmanuel
Lomachenko, Vasyl Golovkin, Gennady Haney, Devin Khan, Amir Lara, Erislandy Maidana, Marcos Marquez, Juan Manuel Rigondeaux, Guillermo
Porter, Shawn Donaire, Nonito Gamboa, Yuriorkis
unique(boxing$`W-Title`)
## [1] 0 1 2 3
class(boxing$`W-Title`)
## [1] "numeric"
table(boxing$`W-Title`)
##
## 0 1 2 3
## 4389 232 40 9
summary(boxing$KO)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 4.000 6.881 10.000 51.000
class(boxing$KO)
## [1] "numeric"
summary(boxing$Wins)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 3.00 8.00 11.39 17.00 65.00
class(boxing$Wins)
## [1] "numeric"
max(boxing$Wins)
## [1] 65
#Max number of wins is 65
summary(boxing$lnRPurse)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.587 7.366 8.195 8.567 9.210 18.455
unique(boxing$PPV)
## [1] 0 1
class(boxing$PPV)
## [1] "numeric"
cor(boxing$PPV, boxing$lnRPurse, use = "complete.obs")
## [1] 0.5261753
cor(boxing$Wins, boxing$lnRPurse, use = "complete.obs")
## [1] 0.7635325
#cor(boxing$Losses, boxing$lnRPurse, use = "complete.obs")
#cor(boxing$weight, boxing$lnRPurse, use = "complete.obs")
(filtering for top 10 venues)
boxing1 <- boxing |>
#filter(lnRPurse <= 10) |>
filter(Venue %in% c("MGM Grand", "Doubletree Hotel", "Fantasy Springs", "Hard Rock", "Mandalay Bay", "StubHub", "MGM Arena", "CosmoLV", "Texas Station", "Belasco Theater"))
There are zeros so we have to use square root as opposed to the log transformation.
summary(boxing1$Wins)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 5.00 12.00 14.15 21.00 65.00
ggplot(boxing1, aes(x=Wins))+
geom_density()
The distribution of wins is strongly skewed right.
ggplot(boxing1, aes(x=sqrt(Wins)))+
geom_density()
Able to use the square root function to make the distribution of wins more bell shaped.
ggplot(boxing1, aes(x=lnRPurse)) +
geom_density()
Unable to transform this distribution because the numeric values for purse were already log transformed
ggplot(boxing1, aes(x = Wins, y = lnRPurse)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm", color = "red", se = FALSE) +
labs(title = "Boxer Wins vs Purse",
x = "Wins", y = "Fight Purse") +
theme_minimal()
Linear, but a lot more variability on the upper end.
options(scipen = 0)
model <- lm(lnRPurse ~ Wins, data = boxing1)
summary(model)
##
## Call:
## lm(formula = lnRPurse ~ Wins, data = boxing1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9618 -0.5350 -0.1228 0.4463 5.8516
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.419804 0.042128 176.1 <2e-16 ***
## Wins 0.120267 0.002313 52.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.195 on 2028 degrees of freedom
## Multiple R-squared: 0.5715, Adjusted R-squared: 0.5712
## F-statistic: 2704 on 1 and 2028 DF, p-value: < 2.2e-16
“For every Win, the predicted lnRPurse goes up by 0.81 points.” “If a boxer has 0 Wins, the predicted lnRPurse is 6.38”
winput <- data.frame(Wins = c(5, 15, 25, 35, 45, 55, 65))
purse_estimate <- exp(predict(model, winput))
cbind(winput, purse_estimate)
## Wins purse_estimate
## 1 5 3044.636
## 2 15 10135.528
## 3 25 33740.963
## 4 35 112322.963
## 5 45 373920.804
## 6 55 1244774.570
## 7 65 4143828.624
Before trusting a regression model, check two assumptions:
plot(model, which = 1)
#An IDEAL residual plot looks like random dots scattered around the horizontal line at 0. A funnel shape or a clear pattern = bad sign
# Q-Q plot: checks normality of residuals
plot(model, which = 2)
#An IDEAL Q-Q plot has points falling close to the diagonal line. Big deviations at the tails = non-normal residuals.
Residuals vs Fitted shows a mostly random distribution. Maybe a bit of a funnel shape. QQ Residuals shows non-normal residuals at the tails and falls along the diagonal in the center
rmse <- sqrt(mean(residuals(model)^2))
rmse
## [1] 1.194695
On average, my predicted fight purse is off by about 1.195
rmse points.”
Multiple Regression
multi_model1 <- lm(lnRPurse ~ Wins + KO + Venue + PPV, data = boxing1)
summary(multi_model1)
##
## Call:
## lm(formula = lnRPurse ~ Wins + KO + Venue + PPV, data = boxing1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3575 -0.4606 -0.0662 0.3916 5.3464
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.511124 0.098788 76.032 < 2e-16 ***
## Wins 0.075174 0.005664 13.272 < 2e-16 ***
## KO 0.023850 0.007553 3.158 0.001614 **
## VenueCosmoLV 0.432796 0.127400 3.397 0.000694 ***
## VenueDoubletree Hotel -0.441198 0.115152 -3.831 0.000131 ***
## VenueFantasy Springs -0.055508 0.115413 -0.481 0.630604
## VenueHard Rock 0.113849 0.116315 0.979 0.327799
## VenueMandalay Bay 0.299973 0.124036 2.418 0.015676 *
## VenueMGM Arena 0.250859 0.132793 1.889 0.059023 .
## VenueMGM Grand 0.662178 0.116842 5.667 1.66e-08 ***
## VenueStubHub 0.545236 0.125930 4.330 1.57e-05 ***
## VenueTexas Station -0.106896 0.134190 -0.797 0.425778
## PPV 1.840537 0.097644 18.850 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.025 on 2017 degrees of freedom
## Multiple R-squared: 0.6865, Adjusted R-squared: 0.6846
## F-statistic: 368.1 on 12 and 2017 DF, p-value: < 2.2e-16
multi_model2 <- lm(lnRPurse ~ Wins + KO + PPV, data = boxing1)
summary(multi_model2)
##
## Call:
## lm(formula = lnRPurse ~ Wins + KO + PPV, data = boxing1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9008 -0.5346 -0.0799 0.4309 5.7224
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.561499 0.038790 194.936 < 2e-16 ***
## Wins 0.080564 0.005896 13.663 < 2e-16 ***
## KO 0.027188 0.007883 3.449 0.000574 ***
## PPV 2.023428 0.092812 21.801 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.074 on 2026 degrees of freedom
## Multiple R-squared: 0.6547, Adjusted R-squared: 0.6542
## F-statistic: 1280 on 3 and 2026 DF, p-value: < 2.2e-16