load the library

library(tidyverse)

load the data

setwd("~/Nassor/MC/Data101")
boxing <- read_csv("boxing_pay_data.csv")
## Rows: 4670 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Boxer, Date, Venue
## dbl (24): Purse, lnRPurse, weight, Age, Wins, Losses, KO, W-Title, PPV, ESPN...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(boxing)
## [1] 4670   27
head(boxing)
## # A tibble: 6 × 27
##   Boxer     Date  Venue Purse lnRPurse weight   Age  Wins Losses    KO `W-Title`
##   <chr>     <chr> <chr> <dbl>    <dbl>  <dbl> <dbl> <dbl>  <dbl> <dbl>     <dbl>
## 1 Gamboa, … 2009… Buff… 37998    10.7     126  27.1    12      0    10         0
## 2 Gonzalez… 2009… Buff… 20000    10.0     127  30.7    27      2    18         0
## 3 Lara, Er… 2009… Buff…  3000     8.14    154  25.8     2      0     1         0
## 4 Bogere, … 2009… Buff…  1500     7.45    135  20.2     4      0     2         0
## 5 Frazier,… 2009… Buff…  1000     7.04    139  30.9     1      0     1         0
## 6 Gonzalez… 2009… Buff…  1500     7.45    146  31.5     2      1     1         0
## # ℹ 16 more variables: PPV <dbl>, ESPN <dbl>, HBO <dbl>, FOX <dbl>,
## #   TopRank <dbl>, GoldenBoy <dbl>, RDS <dbl>, Y2009 <dbl>, Y2010 <dbl>,
## #   Y2011 <dbl>, Y2012 <dbl>, Y2013 <dbl>, Y2014 <dbl>, Y2015 <dbl>,
## #   Y2016 <dbl>, Y2017 <dbl>
boxing_venue <- boxing |>
  group_by(Venue) |>
  count() |>
  arrange(desc(n))|>
  head(10)
boxing_venue
## # A tibble: 10 × 2
## # Groups:   Venue [10]
##    Venue                n
##    <chr>            <int>
##  1 MGM Grand          313
##  2 Doubletree Hotel   272
##  3 Fantasy Springs    268
##  4 Hard Rock          261
##  5 Mandalay Bay       185
##  6 StubHub            176
##  7 MGM Arena          161
##  8 CosmoLV            160
##  9 Texas Station      122
## 10 Belasco Theater    112
options(scipen = 999)
boxer_count <- boxing |>
  group_by(Boxer) |>
  summarize(count = n(), mean_purse = mean(Purse), max_purse = max(Purse), mean_win = mean(Wins), mean_KO = mean(KO)) |>
  arrange(desc(count)) |>
  filter(count >= 10)
boxer_count
## # A tibble: 57 × 6
##    Boxer               count mean_purse max_purse mean_win mean_KO
##    <chr>               <int>      <dbl>     <dbl>    <dbl>   <dbl>
##  1 Magdaleno, Jessie      21     17576.    155000    11.5     8.52
##  2 Chavez, Joaquin R.     18      3000       6000     4.39    1.78
##  3 Vargas, Jessie         18    227028.   2800000    16.6     7.39
##  4 Magdaleno, Diego       17     11788.     25000    16.2     5.24
##  5 Diaz, Joseph Jnr.      16     32250     200000    11.1     6.75
##  6 Bogere, Sharif         15     11647.     75000    16.1    10.3 
##  7 De La Hoya, Diego      15     13967.     80000     7.4     4.8 
##  8 Gutierrez, Jesus A.    15      3343.     10000     7.2     2.27
##  9 Breazeale, Dominic     14     10464.     50000     7       6.64
## 10 Gavril, Ronald         14     21143.    125000    10.9     8.29
## # ℹ 47 more rows
 # count head(50) or count >= 10

Alvarez, Saul C. Benavidez, David Broner, Adrien J. Crawford, Terence Davis, Gervonta Duran, Roberto Garcia, Ryan Mayweather, Floyd Pacquiao, Emmanuel

Lomachenko, Vasyl Golovkin, Gennady Haney, Devin Khan, Amir Lara, Erislandy Maidana, Marcos Marquez, Juan Manuel Rigondeaux, Guillermo

Porter, Shawn Donaire, Nonito Gamboa, Yuriorkis

unique(boxing$`W-Title`)
## [1] 0 1 2 3
class(boxing$`W-Title`)
## [1] "numeric"
table(boxing$`W-Title`)
## 
##    0    1    2    3 
## 4389  232   40    9
summary(boxing$KO)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   4.000   6.881  10.000  51.000
class(boxing$KO)
## [1] "numeric"
summary(boxing$Wins)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    3.00    8.00   11.39   17.00   65.00
class(boxing$Wins)
## [1] "numeric"
max(boxing$Wins)
## [1] 65
#Max number of wins is 65

summary(boxing$lnRPurse)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.587   7.366   8.195   8.567   9.210  18.455
unique(boxing$PPV)
## [1] 0 1
class(boxing$PPV)
## [1] "numeric"
cor(boxing$PPV, boxing$lnRPurse, use = "complete.obs")
## [1] 0.5261753
cor(boxing$Wins, boxing$lnRPurse, use = "complete.obs")
## [1] 0.7635325
#cor(boxing$Losses, boxing$lnRPurse, use = "complete.obs")
#cor(boxing$weight, boxing$lnRPurse, use = "complete.obs")

Remove extreme lnRPurse outliers that are breaking the model

(filtering for top 10 venues)

boxing1 <- boxing |>
  #filter(lnRPurse <= 10) |>
  filter(Venue %in% c("MGM Grand",  "Doubletree Hotel", "Fantasy Springs", "Hard Rock", "Mandalay Bay", "StubHub", "MGM Arena", "CosmoLV", "Texas Station", "Belasco Theater"))

Check the distribution for number of wins

There are zeros so we have to use square root as opposed to the log transformation.

summary(boxing1$Wins)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    5.00   12.00   14.15   21.00   65.00
ggplot(boxing1, aes(x=Wins))+
  geom_density()

The distribution of wins is strongly skewed right.

Perform square root transformation for Wins

ggplot(boxing1, aes(x=sqrt(Wins)))+
  geom_density()

Able to use the square root function to make the distribution of wins more bell shaped.

Check distribution for lnRPurse

ggplot(boxing1, aes(x=lnRPurse)) +
  geom_density()

Unable to transform this distribution because the numeric values for purse were already log transformed

ggplot(boxing1, aes(x = Wins, y = lnRPurse)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(title = "Boxer Wins vs Purse",
       x = "Wins", y = "Fight Purse") +
  theme_minimal()

Linear, but a lot more variability on the upper end.

options(scipen = 0)
model <- lm(lnRPurse ~ Wins, data = boxing1)
summary(model)
## 
## Call:
## lm(formula = lnRPurse ~ Wins, data = boxing1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9618 -0.5350 -0.1228  0.4463  5.8516 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.419804   0.042128   176.1   <2e-16 ***
## Wins        0.120267   0.002313    52.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.195 on 2028 degrees of freedom
## Multiple R-squared:  0.5715, Adjusted R-squared:  0.5712 
## F-statistic:  2704 on 1 and 2028 DF,  p-value: < 2.2e-16
  1. The Estimate column under “Coefficients” — first row is the intercept, second is the slope. y = b0 + b1x y = 6.38 + 0.81x
  2. The R-squared value — how much of the variation in y is explained by x. 0 = none, 1 = all. Adjusted R-squared: 0.5226 52% of the variation are explained by this model.
  3. The p-value for the slope — is the relationship statistically significant? p value is close to zero and < alpha of 0.05. Wins are a meaningful predictor of purse (fighter pay)

“For every Win, the predicted lnRPurse goes up by 0.81 points.” “If a boxer has 0 Wins, the predicted lnRPurse is 6.38”

winput <- data.frame(Wins = c(5, 15, 25, 35, 45, 55, 65))
purse_estimate <- exp(predict(model, winput))
cbind(winput, purse_estimate)
##   Wins purse_estimate
## 1    5       3044.636
## 2   15      10135.528
## 3   25      33740.963
## 4   35     112322.963
## 5   45     373920.804
## 6   55    1244774.570
## 7   65    4143828.624

Before trusting a regression model, check two assumptions:

  1. Homoscedasticity — residuals (errors) should have constant spread, not a funnel
  2. Normality of residuals — residuals should follow a roughly normal distribution
plot(model, which = 1)

  #An IDEAL residual plot looks like random dots scattered around the horizontal line at 0. A funnel shape or a clear pattern = bad sign


# Q-Q plot: checks normality of residuals
plot(model, which = 2)

  #An IDEAL Q-Q plot has points falling close to the diagonal line. Big deviations at the tails = non-normal residuals.

Residuals vs Fitted shows a mostly random distribution. Maybe a bit of a funnel shape. QQ Residuals shows non-normal residuals at the tails and falls along the diagonal in the center

rmse <- sqrt(mean(residuals(model)^2))
rmse
## [1] 1.194695

On average, my predicted fight purse is off by about 1.195 rmse points.”

Multiple Regression

multi_model1 <- lm(lnRPurse ~ Wins + KO + Venue + PPV, data = boxing1)
summary(multi_model1)
## 
## Call:
## lm(formula = lnRPurse ~ Wins + KO + Venue + PPV, data = boxing1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3575 -0.4606 -0.0662  0.3916  5.3464 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            7.511124   0.098788  76.032  < 2e-16 ***
## Wins                   0.075174   0.005664  13.272  < 2e-16 ***
## KO                     0.023850   0.007553   3.158 0.001614 ** 
## VenueCosmoLV           0.432796   0.127400   3.397 0.000694 ***
## VenueDoubletree Hotel -0.441198   0.115152  -3.831 0.000131 ***
## VenueFantasy Springs  -0.055508   0.115413  -0.481 0.630604    
## VenueHard Rock         0.113849   0.116315   0.979 0.327799    
## VenueMandalay Bay      0.299973   0.124036   2.418 0.015676 *  
## VenueMGM Arena         0.250859   0.132793   1.889 0.059023 .  
## VenueMGM Grand         0.662178   0.116842   5.667 1.66e-08 ***
## VenueStubHub           0.545236   0.125930   4.330 1.57e-05 ***
## VenueTexas Station    -0.106896   0.134190  -0.797 0.425778    
## PPV                    1.840537   0.097644  18.850  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.025 on 2017 degrees of freedom
## Multiple R-squared:  0.6865, Adjusted R-squared:  0.6846 
## F-statistic: 368.1 on 12 and 2017 DF,  p-value: < 2.2e-16
multi_model2 <- lm(lnRPurse ~ Wins + KO + PPV, data = boxing1)
summary(multi_model2)
## 
## Call:
## lm(formula = lnRPurse ~ Wins + KO + PPV, data = boxing1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9008 -0.5346 -0.0799  0.4309  5.7224 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.561499   0.038790 194.936  < 2e-16 ***
## Wins        0.080564   0.005896  13.663  < 2e-16 ***
## KO          0.027188   0.007883   3.449 0.000574 ***
## PPV         2.023428   0.092812  21.801  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.074 on 2026 degrees of freedom
## Multiple R-squared:  0.6547, Adjusted R-squared:  0.6542 
## F-statistic:  1280 on 3 and 2026 DF,  p-value: < 2.2e-16

Bonus 1. Did the R-squared go up compared to the simple model in Part C?

Bonus 2. Which predictor has a bigger effect (look at the magnitudes of the slopes)?

Bonus 3. Predict the IMDB rating for a movie with Rotten_Tomatoes_Rating = 80 AND Running_Time_min = 120.