Loading libraries and the dataset

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
data <- read_delim("./sports.csv", delim = ',')
## Rows: 2936 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): institution_name, city_txt, state_cd, classification_name, classif...
## dbl (21): year, unitid, zip_text, classification_code, ef_male_count, ef_fem...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Saving a copy

data_raw <- data

First Pair

data <- data |>
  mutate(profit = total_rev_menwomen - total_exp_menwomen)

p <- data |>
  select(total_rev_menwomen, profit)

first_pair <- as_tibble(p)
first_pair
## # A tibble: 2,936 × 2
##    total_rev_menwomen profit
##                 <dbl>  <dbl>
##  1             148226      0
##  2             232988      0
##  3              56770      0
##  4             226856      0
##  5             126341      0
##  6              80311      0
##  7             160295   6048
##  8             246495   1535
##  9             188718  31729
## 10             286347   3651
## # ℹ 2,926 more rows

Creating a categorical column from numeric

data <- data |>
  mutate(students = cut(
    ef_total_count, breaks = c(0,1500,3000,4000,5000,Inf), labels = c("0-1500","1500-3000","3000-4000","4000-5000","5000 and up"),right = FALSE)) 

Second Pair

# options(scipen = 999)
data <- data |>
  mutate(profit_margin = (p$profit/total_rev_menwomen)*100,
         profit_margin = round(profit_margin, digits = 0)
         ) 

q <- data |>
  select(students, profit_margin)
 
pair <- q |>
  group_by(students) |>
  summarise(profit_margin = mean(profit_margin, na.rm = TRUE))

second_pair <- as_tibble(pair)
second_pair # we created second pair by first converting numeric var to categorical and then attached it to profit margin.
## # A tibble: 5 × 2
##   students    profit_margin
##   <fct>               <dbl>
## 1 0-1500               2.45
## 2 1500-3000            2.41
## 3 3000-4000            2.41
## 4 4000-5000            1.87
## 5 5000 and up       -512.

Visualizations

First Pair

library(ggthemes)
ggplot(first_pair, mapping = aes(x = total_rev_menwomen, y = profit)) +
geom_point() +
theme_clean() +
labs(
  title = "Relationship between sports revenue and profit for institutions in Indiana",
  x = "Revenue",
  y = "Profit") +
theme(plot.title = element_text(size = 12)) 
## Warning: Removed 958 rows containing missing values or values outside the scale range
## (`geom_point()`).

We can see a clear trend in the plot. As revenue goes up, profit also goes up. Another thing to note is values below zero on y-axis. These are institutions in loss, and we can see that there are no institutions in a loss if their revenue is higher than 30 million approximately. One more thing I noticed is the gap between 60 and 90 million. There are some institutions with higher revenue than that gap and most are below that gap. However, I don’t consider any outliers here.

Second Pair

ggplot(second_pair, mapping = aes(x = students, y = profit_margin)) +
geom_col() +
theme_clean() +
labs(
  title = "How number of students affect profit margin in sports for institutions in Indiana",
  x = "Number of Students",
  y = "Mean Profit Margin"
) +
theme(plot.title = element_text(size = 12)) +
ylim(0, max(second_pair$profit_margin, na.rm = TRUE))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_col()`).

In this plot, we only included the profits, not the losses because they were interfering with the visualization. Anyways, the mean profit margin stays positive up until 5000 students. In the ‘5000 and up’ group, it drops to whopping -500 (not shown in figure). So we can see a general downtrend even in the positive margins as the number of students increase.

Correlation Coefficients

First Pair

round(cor(first_pair$total_rev_menwomen,first_pair$profit, use = "complete.obs"),2)
## [1] 0.94

For the first pair, we’re getting 0.94 correlation coefficient. This makes sense because our visualization also showed good positive linear relationship. So, it’s clear that higher revenue leads to higher profit.

Second Pair

# we need to use Spearman Rho for this pair because we got ordered categorical variable here.
s_rho <- cor(x = as.numeric(second_pair$students),
             y = second_pair$profit_margin,
             method = "spearman")
s_rho
## [1] -0.9

We got a negative correlation coefficient which is in line with our visualization for second pair. We observed in the plot that as the number of students increased, the profit margin went down especially after 5000 students.

Confidence Intervals

bootstrap <- function (x, func=mean, n_iter=10^4) {
  # empty vector to be filled with values from each iteration
  x <- na.omit(x)
  func_values <- c(NULL)
  
  # we simulate sampling `n_iter` times
  for (i in 1:n_iter) {
    # pull the sample
    x_sample <- sample(x, size = length(x), replace = TRUE)
    
    # add on this iteration's value to the collection
    func_values <- c(func_values, func(x_sample))
  }
  
  return(func_values)
}
profit_means = bootstrap(first_pair$profit)
head(profit_means,25)
##  [1]  74647.958  42872.468  68593.536 -27155.551 120333.950  32588.560
##  [7]  14340.375  64185.890 123672.291 237817.777   5567.163  84406.722
## [13]  46081.646 132237.906 153037.042  21718.732  29238.990 -60206.375
## [19]  23868.378  83349.928 157833.750  20971.262  34271.805  34770.382
## [25] 168174.321

First response variable

boot_std_error <- sd(profit_means)
boot_std_error
## [1] 78244.59
avg_profit <- mean(first_pair$profit, na.rm=TRUE)
P <- 0.95
z_score <- qnorm(p=(1 - P)/2, lower.tail=FALSE)
z_times_error <- z_score*boot_std_error
CI <- c(avg_profit - z_times_error, avg_profit + z_times_error)
CI
## [1] -55361.4 251351.7

95 out of 100 times, the confidence interval above will contain the true mean of the population.

Second response variable

library(boot)
boot_ci <- function (v, func = median, conf = 0.95, n_iter = 1000) {
  # the `boot` library needs the function in this format
  boot_func <- \(x, i) func(x[i], na.rm=TRUE)
  
  b <- boot(v, boot_func, R = n_iter)
  
  boot.ci(b, conf = conf, type = "perc")
}
boot_ci(second_pair$profit_margin, mean, 0.95)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = b, conf = conf, type = "perc")
## 
## Intervals : 
## Level     Percentile     
## 95%   (-306.3,    2.4 )  
## Calculations and Intervals on Original Scale

The confidence interval above -306.2 to 2.4 will contain the true mean of the population 95 out of 100 times.