library(conflicted)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
conflict_prefer("lag", "dplyr")
## [conflicted] Will prefer dplyr::lag over any other package.
# load ncaa file I cleaned
ncaa <- read.csv("./ncaa_clean.csv", header = TRUE)
Remaking last week’s regression model, but adding more variables
# only includes FBS schools in 2019
schools_1 <- ncaa |>
  filter(year == 2019) |>
  filter(classification_code == 1) |>
  group_by(institution_name) |>
  summarise(revenue = sum(rev_men, na.rm = TRUE) +
              sum(rev_women, na.rm = TRUE),
            expense = sum(exp_men, na.rm = TRUE) +
              sum(exp_women, na.rm = TRUE),
            division = min(classification_name),
            students = sum(max(ef_male_count, na.rm = TRUE) +
                             max(ef_female_count, na.rm = TRUE)),
            athletes = sum(sum_partic_men, na.rm = TRUE) +
              sum(sum_partic_women, na.rm = TRUE),
            state = max(state_cd))
# last week's model
model_1 <- lm(expense ~ revenue, schools_1) |> summary()
model_1
##
## Call:
## lm(formula = expense ~ revenue, data = schools_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41955383 -5491295 -1733269 4157223 29504034
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.224e+07 1.569e+06 7.799 2.12e-12 ***
## revenue 6.622e-01 2.280e-02 29.042 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9581000 on 125 degrees of freedom
## Multiple R-squared: 0.8709, Adjusted R-squared: 0.8699
## F-statistic: 843.5 on 1 and 125 DF, p-value: < 2.2e-16
cor(schools_1[c('revenue', 'students', 'athletes')])
## revenue students athletes
## revenue 1.0000000 0.4640764 0.6654169
## students 0.4640764 1.0000000 0.4992977
## athletes 0.6654169 0.4992977 1.0000000
None of the candidate predictors appear too strongly correlated with one another, so I don’t think adding the extra variables will artificially pad the results. I couldn’t include state in this check since it’s categorical, but as we’ll see shortly that doesn’t end up mattering. In hindsight, a categorical variable with that many levels probably wasn’t a great choice anyway.
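For a more formal check, variance inflation factors quantify how much the candidate predictors overlap. A minimal sketch, assuming the car package is available:
car::vif(lm(expense ~ revenue + students + athletes, data = schools_1))
# VIF near 1 means a predictor adds little redundant information;
# values much above 5 would start to suggest problematic collinearity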
# model plus students
model_2 <- lm(expense ~ revenue + students, schools_1) |> summary()
model_2
##
## Call:
## lm(formula = expense ~ revenue + students, data = schools_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42012805 -5478859 -1691262 4136337 29339879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.206e+07 1.986e+06 6.071 1.44e-08 ***
## revenue 6.604e-01 2.584e-02 25.556 < 2e-16 ***
## students 1.435e+01 9.667e+01 0.148 0.882
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9618000 on 124 degrees of freedom
## Multiple R-squared: 0.871, Adjusted R-squared: 0.8689
## F-statistic: 418.4 on 2 and 124 DF, p-value: < 2.2e-16
# model plus athletes
model_3 <- lm(expense ~ revenue + athletes, schools_1) |> summary()
model_3
##
## Call:
## lm(formula = expense ~ revenue + athletes, data = schools_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37436952 -5201605 -1473590 3939281 22849124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.099e+06 3.386e+06 0.620 0.53640
## revenue 5.969e-01 2.937e-02 20.320 < 2e-16 ***
## athletes 2.325e+04 6.952e+03 3.344 0.00109 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9213000 on 124 degrees of freedom
## Multiple R-squared: 0.8816, Adjusted R-squared: 0.8797
## F-statistic: 461.7 on 2 and 124 DF, p-value: < 2.2e-16
# model plus state?
model_4 <- lm(expense ~ revenue + state, schools_1) |> summary()
model_4
##
## Call:
## lm(formula = expense ~ revenue + state, data = schools_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39022792 -3707805 -564277 3986087 19464623
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.051e+06 4.696e+06 1.715 0.0900 .
## revenue 6.695e-01 2.588e-02 25.867 <2e-16 ***
## stateAR 5.286e+06 8.248e+06 0.641 0.5233
## stateAZ 1.231e+07 8.246e+06 1.493 0.1391
## stateCA 1.381e+07 5.780e+06 2.390 0.0191 *
## stateCO -4.036e+05 8.248e+06 -0.049 0.9611
## stateCT 2.294e+07 1.081e+07 2.123 0.0367 *
## stateFL 6.674e+06 5.771e+06 1.157 0.2507
## stateGA -4.732e+05 6.615e+06 -0.072 0.9431
## stateHI 8.329e+06 1.085e+07 0.768 0.4448
## stateIA 6.601e+06 8.261e+06 0.799 0.4265
## stateID 1.571e+06 1.083e+07 0.145 0.8850
## stateIL 2.166e+06 7.198e+06 0.301 0.7642
## stateIN 4.135e+06 6.617e+06 0.625 0.5337
## stateKS 5.095e+06 8.247e+06 0.618 0.5384
## stateKY 7.761e+06 7.197e+06 1.078 0.2840
## stateLA -7.670e+05 6.262e+06 -0.122 0.9028
## stateMA 7.421e+06 8.256e+06 0.899 0.3713
## stateMD 5.170e+06 1.080e+07 0.479 0.6332
## stateMI 8.210e+05 6.233e+06 0.132 0.8955
## stateMN 1.247e+07 1.081e+07 1.153 0.2520
## stateMO 6.761e+06 1.080e+07 0.626 0.5329
## stateMS 7.819e+06 7.203e+06 1.085 0.2808
## stateNC 2.080e+06 5.773e+06 0.360 0.7195
## stateNE -1.330e+07 1.089e+07 -1.222 0.2251
## stateNJ 1.499e+07 1.080e+07 1.389 0.1686
## stateNM -4.809e+05 8.309e+06 -0.058 0.9540
## stateNV -9.015e+04 8.289e+06 -0.011 0.9913
## stateNY 4.227e+06 8.250e+06 0.512 0.6097
## stateOH 1.457e+06 5.658e+06 0.258 0.7974
## stateOK -7.916e+05 7.204e+06 -0.110 0.9128
## stateOR 3.727e+06 8.247e+06 0.452 0.6525
## statePA 5.291e+06 7.220e+06 0.733 0.4657
## stateSC 1.029e+07 7.198e+06 1.430 0.1563
## stateTN 2.884e+06 6.611e+06 0.436 0.6638
## stateTX 2.667e+04 5.246e+06 0.005 0.9960
## stateUT 6.141e+06 7.207e+06 0.852 0.3966
## stateVA 9.038e+06 6.614e+06 1.367 0.1754
## stateWA -3.303e+06 8.273e+06 -0.399 0.6907
## stateWI -8.648e+06 1.091e+07 -0.792 0.4303
## stateWV -1.510e+04 8.261e+06 -0.002 0.9985
## stateWY -1.751e+06 1.083e+07 -0.162 0.8720
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9855000 on 85 degrees of freedom
## Multiple R-squared: 0.9071, Adjusted R-squared: 0.8623
## F-statistic: 20.25 on 41 and 85 DF, p-value: < 2.2e-16
# model plus athletes with an interaction
model_5 <- lm(expense ~ revenue*athletes, schools_1) |> summary()
model_5
##
## Call:
## lm(formula = expense ~ revenue * athletes, data = schools_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34800961 -3710564 -673700 3076256 21530626
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.587e+07 5.855e+06 -4.419 2.15e-05 ***
## revenue 9.889e-01 7.495e-02 13.194 < 2e-16 ***
## athletes 7.285e+04 1.085e+04 6.716 6.15e-10 ***
## revenue:athletes -6.332e-04 1.133e-04 -5.588 1.41e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8261000 on 123 degrees of freedom
## Multiple R-squared: 0.9056, Adjusted R-squared: 0.9033
## F-statistic: 393.2 on 3 and 123 DF, p-value: < 2.2e-16
Let’s see the differences
# collect R-squared and adjusted R-squared from each model summary
rsq <- c(model_1$r.squared, model_2$r.squared, model_3$r.squared,
         model_4$r.squared, model_5$r.squared)
rsqadj <- c(model_1$adj.r.squared,
            model_2$adj.r.squared,
            model_3$adj.r.squared,
            model_4$adj.r.squared,
            model_5$adj.r.squared)
list_name <- c('revenue', '+students', '+athletes', '+state', '*athletes')
model_num <- c(1, 2, 3, 4, 5)
#plot(x = model_num, y = rsqadj, type = "l")
#lines(model_num, y = rsq, type = "l")
ggplot() +
  geom_line(mapping = aes(x = model_num, y = rsq, color = 'r_sq')) +
  geom_line(mapping = aes(x = model_num, y = rsqadj, color = 'r_sq_adj')) +
  #scale_x_discrete(labels = list_name)
  labs(title = "R squared vs R squared adjusted for different lr models",
       x = "models: 1: revenue, 2: +students, 3: +athletes, 4: +state, 5: *athletes",
       y = "value") +
  theme_minimal()
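The exact values are easier to read off a small table; a quick sketch reusing the vectors defined above:
# tabulate the fits side by side
tibble(model = list_name, r_squared = rsq, adj_r_squared = rsqadj)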
As we can see, students and state don’t improve the model. The inclusion of athletes does, especially when athletes also enter through an interaction term. The graph makes the jump look dramatic, but the change amounts to only a few percentage points of adjusted R-squared (roughly 0.87 to 0.90).
To begin, it makes logical sense that revenue and the number of athletes would both help explain expenses. Many athletic programs, given their non-profit nature, have an incentive, if not a legal requirement, to keep expenses in line with revenues. Additionally, each extra athlete on a roster adds variable expenses such as scholarships, equipment, and meals, while the marginal revenue attributable to an individual athlete can be minimal given the team nature of many sports.
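One way to check that the interaction term is doing real work, rather than soaking up noise, is a partial F-test between the nested models. A minimal sketch (the fit_* names are just for illustration):
# partial F-test: does adding revenue:athletes significantly reduce residual error?
fit_additive    <- lm(expense ~ revenue + athletes, data = schools_1)
fit_interaction <- lm(expense ~ revenue * athletes, data = schools_1)
anova(fit_additive, fit_interaction)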
For further evaluation, we can use diagnostic plots.
model <- lm(expense ~ revenue * athletes, schools_1)
library(lindia)
gg_resfitted(model) +
geom_smooth(se=FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
For the most part, our assumptions hold. It isn’t ideal that the residual variance is much lower on the left side of the plot, and there are a few outliers toward the right, but the bulk of the plot fits very well. The smoothed curve is also quite flat throughout, indicating no substantial over- or under-estimation across the range of fitted values.
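To back up the eyeball check on the variance, a Breusch-Pagan test is one option; a quick sketch, assuming the lmtest package is installed:
# a small p-value would indicate non-constant residual variance
lmtest::bptest(model)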
plots <- gg_resX(model, plot.all = FALSE)
# for each variable of interest …
plots$revenue +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
plots$athletes +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
These plots give additional evidence that the residuals are centered around zero across the range of each predictor. The smoothed lines are relatively flat, with the only minor issues being a few outliers and a bit more clustering toward the left of each plot. Much like the residuals-vs-fitted plot above, this looks pretty solid overall.
gg_reshist(model)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# overlay a normal curve (scaled to the histogram counts) on the residuals
x <- model$residuals
x_val <- seq(min(x), max(x), length = 40)
fun <- dnorm(x_val, mean = mean(x), sd = sd(x))
hist(x, breaks = 50)
lines(x_val, fun * 180000000)  # scale the density up to roughly match the count axis
We can see that this is far from a perfect bell curve, but it has a roughly normal shape. Given the modest sample size (about 127 schools), that is about as close to normal as we should expect. There appear to be a few outliers on both sides, especially on the left, but again, with this little data that isn’t too surprising.
gg_qqplot(model)
Between theoretical quantiles of roughly -1 and 1, our residuals track the theoretical distribution remarkably well. Beyond that range we start to see outliers. Fortunately, the departures look fairly symmetric, with a similar number of points above the expected line as below it, so this shouldn’t cause severe problems for our regression model.
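The visual impression can be supplemented with a Shapiro-Wilk test on the residuals, keeping in mind that with roughly 127 observations even modest departures from normality can register as significant:
# formal normality check on the residuals (base R, no extra packages)
shapiro.test(residuals(model))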
library(lindia)
library(ggthemes)
library(ggrepel)
gg_cooksd(model, threshold = 'matlab')
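The row indices used in the next chunk were read off this plot; one way to recover the most influential schools programmatically, assuming no rows were dropped when fitting, is a sketch with base R’s cooks.distance():
# pull the four largest Cook's distances and the schools they belong to
cd <- cooks.distance(model)
top <- order(cd, decreasing = TRUE)[1:4]
schools_1$institution_name[top]
cd[top]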
ggplot(data = slice(schools_1,
                    c(41, 62, 79, 73))) +
  geom_point(data = schools_1,
             mapping = aes(x = revenue, y = expense)) +
  geom_point(mapping = aes(x = revenue, y = expense),
             color = 'darkred') +
  geom_text_repel(mapping = aes(x = revenue,
                                y = expense,
                                label = institution_name),
                  color = 'darkred') +
  labs(title = "Investigating High Influence Points",
       subtitle = "Label = Institution name")
These are some of the schools that, per Cook’s distance, stand out as highly influential rows. Many of them have appeared in other data dives, such as Texas, Ohio, and Georgia, three of the largest schools in the US with some of the most athletes and biggest fan bases of any university. Schools closer to the threshold, like UCLA, are still notable; UCLA in particular likely stands out for posting a large net loss after an underwhelming athletic performance in 2019. But as the plot above shows, their influence is considerably smaller than that of schools like Ohio or Texas.
When building future models, it might be a good idea to “smooth” each institution’s values by averaging them across all available years, as sketched below. For schools like Texas, Ohio, and potentially Georgia (common outliers in past data dives), I would also want to add columns such as placement at national championships or major tournaments, data I don’t have in the current data set, to the regression to help explain the extra variation.
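A rough sketch of that averaging idea, reusing the column names from the cleaning step above (the exact filtering would need checking, since a school’s classification can change between years):
# average each FBS school's revenue, expense, and athlete counts across years,
# then refit the interaction model on the smoothed values
schools_avg <- ncaa |>
  filter(classification_code == 1) |>
  group_by(institution_name, year) |>
  summarise(revenue = sum(rev_men, na.rm = TRUE) + sum(rev_women, na.rm = TRUE),
            expense = sum(exp_men, na.rm = TRUE) + sum(exp_women, na.rm = TRUE),
            athletes = sum(sum_partic_men, na.rm = TRUE) +
              sum(sum_partic_women, na.rm = TRUE),
            .groups = "drop") |>
  group_by(institution_name) |>
  summarise(across(c(revenue, expense, athletes), mean))

lm(expense ~ revenue * athletes, data = schools_avg) |> summary()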