library(tidyverse)
library(ggthemes)
library(ggrepel)
library(patchwork)
library(broom)
library(lindia)
library(car)

data <- read_delim("./sports.csv", delim = ',')
# filtering NCAA Division I
ncaa_I <- data |>
  filter(classification_name %in% c('NCAA Division I without football', 
  'NCAA Division I-FBS','NCAA Division I-FCS')) 
# these are rows of data with no information on athletes and revenue. Basically NA values on both men and women athletes. I'm removing those.
# Also I filled NA with zeros where only one column was NA. This allows us to count the Total Athletes. Also Football has NA for women, so having zero there makes sense. 
filtered_ncaa_I <- ncaa_I |>
  filter(!(is.na(partic_men) & is.na(partic_women))) |>
  mutate(
    partic_men = ifelse(is.na(partic_men), 0, partic_men),
    partic_women = ifelse(is.na(partic_women), 0, partic_women)) |>
  filter(sector_name == 'Public, 4-year or above')
# Adding total athletes column
filtered_ncaa_I <- filtered_ncaa_I |>
  mutate(Total_Athletes = partic_men + partic_women)
# Athletes in each university 
summarized_data <- filtered_ncaa_I |>
  group_by(institution_name) |>
  summarise(Athletes = sum(Total_Athletes),
            Revenue = sum(total_rev_menwomen))


# Plotting the code above
ggplot(summarized_data, aes(x = institution_name, y = Revenue, fill = Athletes)) +
  geom_bar(stat = "identity") +  # Create a bar plot
  scale_fill_gradient(low = "lightblue", high = "darkblue") +  # Color scale for Athletes
  theme_minimal() +  # Minimal theme for clarity
  theme(
  plot.title = element_text(size = 16, hjust = 0.5, face = 'bold'),
  axis.text.x = element_text(size = 10.5,hjust = 1)
  ) +
  scale_x_discrete(labels = c(
    "Ball State University" = "Ball State",
    "Indiana University-Purdue University-Fort Wayne" = "IUPUF",
    "Indiana University-Purdue University-Indianapolis" = "IUPUI",
    "Indiana State University"="ISU",
    "Purdue University-Main Campus" = "Purdue",
    "Indiana University-Bloomington" = "IU Bloom",
    "Purdue University Fort Wayne"= "Purdue Fort Wayne"
  )) +
  labs(
    title = "Revenue by Institution w/ Varying Athlete Count",
    x = "Public Institutions",
    y = "Revenue (in USD)",
    fill = "Athletes"  # Legend title for the fill color
  )

From the graph above, it can be seen that higher total athletes in a universities is related to higher revenue. But we need more analysis to confirm what’s going on.

#box plot
ggplot(filtered_ncaa_I, mapping = aes(x=institution_name,y=total_rev_menwomen)) +
  geom_boxplot()+
  scale_x_discrete(labels = c(
    "Ball State University" = "Ball State",
    "Indiana University-Purdue University-Fort Wayne" = "IUPUF",
    "Indiana University-Purdue University-Indianapolis" = "IUPUI",
    "Indiana State University"="ISU",
    "Purdue University-Main Campus" = "Purdue",
    "Indiana University-Bloomington" = "IU Bloom",
    "Purdue University Fort Wayne"= "Purdue Fort Wayne"
  )) +
  scale_y_log10() +
  theme_minimal() +
  labs(
    title = "Distributions of Universities in terms of Revenue Generated",
    x = "Institutions",
    y = "Revenue (log scaled)") + 
  theme(
  plot.title = element_text(hjust = 0.5, face = 'bold'),
  axis.text.x = element_text(,hjust = 1)
  ) 

The box plot is showing a different picture. The median revenue is lower for IU Bloomington even though it has highest total number of athletes. So it seems that only certain sports are inflating the total revenue.

#scatterplot
ggplot(summarized_data, mapping = aes(x = Athletes, y = Revenue)) +
  geom_point(color = 'chocolate') +
  theme_minimal() +
  geom_smooth(method = "lm", color = "red", se = FALSE, size = 0.3) +
  labs(
    title = 'Positive linear relationship between athletes & revenue',
    y= "Revenue (in USD)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  )
## `geom_smooth()` using formula = 'y ~ x'

The scatterplot above is just based on the barplot we had above. We already looked at the boxplot and found that only certain sports might be heavily contributing to revenue. So we can’t use this scatterplot as it could be misleading and might not show the true picture of what’s happening.

# another scatterplot
ggplot(filtered_ncaa_I, mapping = aes(x = Total_Athletes, y = total_rev_menwomen)) +
  geom_point(color = 'chocolate') +
  theme_minimal() +
  # geom_smooth(method = "lm", color = "red", se = FALSE, size = 0.3) +
  labs(
    title = 'Athletes across different sports and revenue',
    y= "Revenue (in USD)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  ) 

  # scale_y_log10() +
  # scale_x_log10()

Each data point in scatterplot above is connected to a certain sport. From this plot, we can see around 110 athletes and 30 athletes, the revenue shoots up. So it is safe to say that these sports are heavy contributors to revenue compared to others. After looking at the dataset, it was found that “30 athlete sport” is basketball and other one is football. So let’s look at them separately to see if number of athletes have any effect on revenue.

filtered_ncaa_I |>
  filter(sports == 'Football') |>
  ggplot(mapping = aes(x = Total_Athletes, y = total_rev_menwomen)) +
  geom_point(color = 'chocolate') +
  theme_minimal() +
  # geom_smooth(method = "lm", color = "red", se = FALSE, size = 0.3) +
  labs(
    title = 'Football Athletes and Revenue',
    y= "Revenue (in USD)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  ) 

Athletes less than 105 seem to have significantly lower revenue. But we also need to consider FBS and FCS schools in Division I. The lower athlete group is most likely FCS, thus lower revenue. Let’s filter again to just include FBS.

filtered_ncaa_I |>
  filter(classification_name == 'NCAA Division I-FBS') |>
  filter(sports == 'Football') |>
  ggplot(mapping = aes(x = Total_Athletes, y = total_rev_menwomen)) +
  geom_point(color = 'chocolate', size = 3) +
  theme_minimal() +
  # geom_smooth(method = "lm", color = "red", se = FALSE, size = 0.3) +
  labs(
    title = 'FBS Div I Football Athletes and Revenue',
    y= "Revenue (in USD)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  ) 

In this plot, if we look closely around the range of around 110 - 125 athletes. We can see there are kind of two groups, lower end of revenue and higher end. I interpret that it’s higher quality of athletes and not necessarily the number of athletes that lead to higher revenue. We can do the same thing on basketball.

filtered_ncaa_I |>
  filter(sports == 'Basketball') |>
  ggplot(mapping = aes(x = Total_Athletes, y = total_rev_menwomen)) +
  geom_point(color = 'chocolate', size = 3 ) +
  theme_minimal() +
  # geom_smooth(method = "lm", color = "red", se = FALSE, size = 0.3) +
  labs(
    title = 'Basketball Athletes and Revenue',
    y= "Revenue (in USD)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  ) 

Again, we can see similar thing happening: same number of athletes but different revenue generation. Quality of athletes seems to be that separating factor. There’s no other practical explanation that might apply here better.

x <- filtered_ncaa_I |>
  select(Total_Athletes) |>
  pull()


y <- filtered_ncaa_I |>
  select(total_rev_menwomen) |>
  pull()

The correlation test below finds a correlation between total athletes and revenue to be significant. However, it could be due to FCS and FBS schools, FCS schools generally has lower revenue and lower athletes. That’s why higher athlete count might be showing correlation with higher revenue.

test_result <- cor.test(x,y,method = "pearson")

test_result
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 3.9492, df = 306, p-value = 9.739e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1111945 0.3240007
## sample estimates:
##       cor 
## 0.2202163
ggplot(filtered_ncaa_I, mapping = aes(x = Total_Athletes, y = total_rev_menwomen)) +
  geom_point(color = 'chocolate') +
  theme_minimal() +
  geom_smooth(method = "lm", color = 'blue', se = FALSE, size = 0.3) +
  labs(
    title = 'Athletes across different sports and revenue',
    y= "Revenue (in USD) log scaled",
    x = "Total Athletes (log scaled)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  ) +
  scale_y_log10() +
  scale_x_log10()
## `geom_smooth()` using formula = 'y ~ x'

The linear regression analysis in this case might be misleading if not careful. Both x and y axes are log scaled. Because of the scaling, we’re losing the excessive jumps in revenue of basketball and football. Also it’s important to remember each data point is connected to a specific sport. So more athletes here could just mean having more diversity in sports programs or having more sports. It’s not just more athletes, more revenue. So we already figured higher quality of athletes in basketball and football is better for revenue, besides that it may be beneficial to have more sports programs.

#attempting linear regression
lm_model <- lm(total_rev_menwomen ~ log(Total_Athletes),
            filtered_ncaa_I)
summary(lm_model)
## 
## Call:
## lm(formula = total_rev_menwomen ~ log(Total_Athletes), data = filtered_ncaa_I)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -8951621 -2520465 -1231961   -47483 49610517 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -8893234    2234582   -3.98 8.62e-05 ***
## log(Total_Athletes)  3292717     612016    5.38 1.48e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8004000 on 306 degrees of freedom
## Multiple R-squared:  0.08642,    Adjusted R-squared:  0.08343 
## F-statistic: 28.95 on 1 and 306 DF,  p-value: 1.482e-07
#power check before we apply log on response variable
pT <- powerTransform(lm_model, family = "bcnPower") 
pT$lambda
## [1] -0.04990567

We got -0.049. It’s less than 0.1, so we apply log to the response as well.

lm_model <- lm(log(total_rev_menwomen) ~ log(Total_Athletes),
            filtered_ncaa_I)
summary(lm_model)
## 
## Call:
## lm(formula = log(total_rev_menwomen) ~ log(Total_Athletes), data = filtered_ncaa_I)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2952 -0.8209  0.1565  0.5768  3.8781 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          10.8636     0.4184  25.967  < 2e-16 ***
## log(Total_Athletes)   0.6939     0.1146   6.056 4.08e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.499 on 306 degrees of freedom
## Multiple R-squared:  0.107,  Adjusted R-squared:  0.1041 
## F-statistic: 36.68 on 1 and 306 DF,  p-value: 4.081e-09

Because this is a log-log linear regression, interpretation is bit different. For every 1% increase in total athletes, we can expect about 0.6939% increase in revenue. Again, it can’t be taken at face value. It’s not just total athletes but athletes across different sports. To get more revenue, you most likely need to expand sports programs.

#visualizing log transformations to variables
filtered_ncaa_I |>
  ggplot(mapping = aes(x = log(Total_Athletes), 
                       y = log(total_rev_menwomen))) +
  geom_point() +
  geom_smooth(method = 'lm', color = 'gray', linetype = 'dashed',
              se = FALSE) +
  labs(title = "test") +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'