START

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
# Load the raw sports dataset (comma-delimited) from the working directory.
data <- read_delim(file = "./sports.csv", delim = ",")
## Rows: 2936 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): institution_name, city_txt, state_cd, classification_name, classif...
## dbl (21): year, unitid, zip_text, classification_code, ef_male_count, ef_fem...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Clean the data before sampling: keep only a handful of columns (a mix of
# categorical and numeric ones) from the original dataset, then drop every row
# that has an NA in any of the kept columns.
cleaned_data <- data |>
  select(
    institution_name,
    city_txt,
    classification_name,
    total_rev_menwomen,
    total_exp_menwomen
  ) |>
  na.omit()

cleaned_data
## # A tibble: 1,978 × 5
##    institution_name    city_txt  classification_name          total_rev_menwomen
##    <chr>               <chr>     <chr>                                     <dbl>
##  1 Ancilla College     Donaldson NJCAA Division II                        148226
##  2 Ancilla College     Donaldson NJCAA Division II                        232988
##  3 Ancilla College     Donaldson NJCAA Division II                         56770
##  4 Ancilla College     Donaldson NJCAA Division II                        226856
##  5 Ancilla College     Donaldson NJCAA Division II                        126341
##  6 Ancilla College     Donaldson NJCAA Division II                         80311
##  7 Anderson University Anderson  NCAA Division III with foot…             160295
##  8 Anderson University Anderson  NCAA Division III with foot…             246495
##  9 Anderson University Anderson  NCAA Division III with foot…             188718
## 10 Anderson University Anderson  NCAA Division III with foot…             286347
## # ℹ 1,968 more rows
## # ℹ 1 more variable: total_exp_menwomen <dbl>

First Sample

# Fix the RNG seed (a value specific to this sample) so that this draw, the
# table below and the plot built from it are identical on every run. Without
# it, each knit produces a different sample and the hand-picked axis limits
# and written interpretation no longer match the output.
set.seed(101)

# Draw 1,000 rows from the cleaned data, with replacement.
sample_one <- cleaned_data[sample(nrow(cleaned_data), 1000, replace = TRUE), ]

# Total revenue per institution within this sample, largest first.
sample_one |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum))
## # A tibble: 46 × 2
##    institution_name                                        sum
##    <chr>                                                 <dbl>
##  1 University of Notre Dame                          470876227
##  2 Purdue University-Main Campus                     196303544
##  3 Indiana University-Bloomington                    193237030
##  4 Butler University                                  52041679
##  5 Ball State University                              49174790
##  6 University of Evansville                           36008679
##  7 Indiana State University                           32410933
##  8 University of Indianapolis                         28979367
##  9 Indiana University-Purdue University-Indianapolis  27840735
## 10 University of Saint Francis-Fort Wayne             25953100
## # ℹ 36 more rows
# Names of the five institutions with the largest summed revenue in sample one.
top_5 <- sample_one |>
  group_by(institution_name) |>
  summarise(total_revenue = sum(total_rev_menwomen)) |>
  arrange(-total_revenue) |>
  head(5) |>
  pull(institution_name)

top_5
## [1] "University of Notre Dame"       "Purdue University-Main Campus" 
## [3] "Indiana University-Bloomington" "Butler University"             
## [5] "Ball State University"

Let’s visualize the first sample…

# Box plot of per-row revenue for the top five institutions of sample one.
#
# The axis labels are given as a NAMED vector (full name = short label).
# A discrete axis orders character values alphabetically, so an unnamed label
# vector is applied by position and attaches the wrong short name to a box
# whenever the top five change. With names, each label is matched to its own
# institution; entries not present in the plot are ignored, and institutions
# without an entry keep their full name.
sample_one |>
  filter(institution_name %in% top_5) |>
  ggplot() +
  geom_boxplot(mapping = aes(x = institution_name, y = total_rev_menwomen)) +
  labs(
    title = "Sample One Institutions and Their Revenue",
    x = "Institutions",
    y = "Revenue"
  ) +
  scale_x_discrete(
    labels = c(
      "Ball State University" = "Ball State",
      "Butler University" = "Butler",
      "Indiana State University" = "Indiana State Uni",
      "Indiana University-Bloomington" = "IU B",
      "Purdue University-Main Campus" = "Purdue Lafayette",
      "University of Evansville" = "Uni of Evansville",
      "University of Indianapolis" = "Uni of Indy",
      "University of Notre Dame" = "Notre Dame"
    )
  ) +
  # Zoom the y axis without dropping observations from the box statistics.
  coord_cartesian(ylim = c(10000, 5000000))

Second Sample

# Fix the RNG seed (a value specific to this sample) so that this draw, the
# table below and the plot built from it are identical on every run. Without
# it, each knit produces a different sample and the hand-picked axis limits
# and written interpretation no longer match the output.
set.seed(102)

# Draw 1,000 rows from the cleaned data, with replacement.
sample_two <- cleaned_data[sample(nrow(cleaned_data), 1000, replace = TRUE), ]

# Total revenue per institution within this sample, largest first.
sample_two |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum))
## # A tibble: 46 × 2
##    institution_name                             sum
##    <chr>                                      <dbl>
##  1 Indiana University-Bloomington         277759736
##  2 University of Notre Dame               221182201
##  3 Purdue University-Main Campus          169788899
##  4 Butler University                       56339439
##  5 Ball State University                   48095413
##  6 University of Saint Francis-Fort Wayne  29749806
##  7 Marian University                       27648010
##  8 University of Evansville                27279310
##  9 University of Indianapolis              26686548
## 10 Indiana State University                24455686
## # ℹ 36 more rows
# Names of the five institutions with the largest summed revenue in sample two.
top_5 <- sample_two |>
  group_by(institution_name) |>
  summarise(total_revenue = sum(total_rev_menwomen)) |>
  arrange(-total_revenue) |>
  head(5) |>
  pull(institution_name)

top_5
## [1] "Indiana University-Bloomington" "University of Notre Dame"      
## [3] "Purdue University-Main Campus"  "Butler University"             
## [5] "Ball State University"

Visualizing second sample…

# Box plot of per-row revenue for the top five institutions of sample two.
#
# The axis labels are given as a NAMED vector (full name = short label).
# A discrete axis orders character values alphabetically, so an unnamed label
# vector is applied by position and attaches the wrong short name to a box
# whenever the top five change. With names, each label is matched to its own
# institution; entries not present in the plot are ignored, and institutions
# without an entry keep their full name.
sample_two |>
  filter(institution_name %in% top_5) |>
  ggplot() +
  geom_boxplot(mapping = aes(x = institution_name, y = total_rev_menwomen)) +
  labs(
    title = "Sample Second: Institutions and Their Revenue",
    x = "Institutions",
    y = "Revenue"
  ) +
  scale_x_discrete(
    labels = c(
      "Ball State University" = "Ball State",
      "Butler University" = "Butler",
      "Indiana State University" = "Indiana State Uni",
      "Indiana University-Bloomington" = "IU B",
      "Purdue University-Main Campus" = "Purdue Lafayette",
      "University of Evansville" = "Uni of Evansville",
      "University of Indianapolis" = "Uni of Indy",
      "University of Notre Dame" = "Notre Dame"
    )
  ) +
  # Zoom the y axis without dropping observations from the box statistics.
  coord_cartesian(ylim = c(10000, 2500000))

In this sample’s visualization, none of the institutions shows the unusually high dispersion seen in some of the other samples.

Third Sample

# Fix the RNG seed (a value specific to this sample) so that this draw, the
# table below and the plot built from it are identical on every run. Without
# it, each knit produces a different sample and the hand-picked axis limits
# and written interpretation no longer match the output.
set.seed(103)

# Draw 1,000 rows from the cleaned data, with replacement.
sample_three <- cleaned_data[sample(nrow(cleaned_data), 1000, replace = TRUE), ]

# Total revenue per institution within this sample, largest first.
sample_three |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum))
## # A tibble: 45 × 2
##    institution_name                             sum
##    <chr>                                      <dbl>
##  1 University of Notre Dame               674721887
##  2 Indiana University-Bloomington         145203914
##  3 Purdue University-Main Campus           71034311
##  4 Indiana State University                48506301
##  5 Butler University                       42007109
##  6 University of Indianapolis              35004148
##  7 Ball State University                   26106485
##  8 Marian University                       23413226
##  9 Valparaiso University                   22888917
## 10 University of Saint Francis-Fort Wayne  22236507
## # ℹ 35 more rows
# Names of the five institutions with the largest summed revenue in sample
# three.
top_5 <- sample_three |>
  group_by(institution_name) |>
  summarise(total_revenue = sum(total_rev_menwomen)) |>
  arrange(-total_revenue) |>
  head(5) |>
  pull(institution_name)

top_5
## [1] "University of Notre Dame"       "Indiana University-Bloomington"
## [3] "Purdue University-Main Campus"  "Indiana State University"      
## [5] "Butler University"

Visualization

# Box plot of per-row revenue for the top five institutions of sample three.
#
# The axis labels are given as a NAMED vector (full name = short label).
# A discrete axis orders character values alphabetically, so an unnamed label
# vector is applied by position and attaches the wrong short name to a box
# whenever the top five change. With names, each label is matched to its own
# institution; entries not present in the plot are ignored, and institutions
# without an entry keep their full name.
sample_three |>
  filter(institution_name %in% top_5) |>
  ggplot() +
  geom_boxplot(mapping = aes(x = institution_name, y = total_rev_menwomen)) +
  labs(
    title = "Sample Third: Institutions and Their Revenue",
    x = "Institutions",
    y = "Revenue"
  ) +
  scale_x_discrete(
    labels = c(
      "Ball State University" = "Ball State",
      "Butler University" = "Butler",
      "Indiana State University" = "Indiana State Uni",
      "Indiana University-Bloomington" = "IU B",
      "Purdue University-Main Campus" = "Purdue Lafayette",
      "University of Evansville" = "Uni of Evansville",
      "University of Indianapolis" = "Uni of Indy",
      "University of Notre Dame" = "Notre Dame"
    )
  ) +
  # Zoom the y axis without dropping observations from the box statistics.
  coord_cartesian(ylim = c(10000, 3000000))

Fourth Sample

# Fix the RNG seed (a value specific to this sample) so that this draw, the
# table below and the plot built from it are identical on every run. Without
# it, each knit produces a different sample and the hand-picked axis limits
# and written interpretation no longer match the output.
set.seed(104)

# Draw 1,000 rows from the cleaned data, with replacement.
sample_four <- cleaned_data[sample(nrow(cleaned_data), 1000, replace = TRUE), ]

# Total revenue per institution within this sample, largest first.
sample_four |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum))
## # A tibble: 46 × 2
##    institution_name                     sum
##    <chr>                              <dbl>
##  1 University of Notre Dame       347051483
##  2 Indiana University-Bloomington 340859751
##  3 Purdue University-Main Campus  128403437
##  4 Butler University               65874017
##  5 Ball State University           54642492
##  6 Indiana State University        40435482
##  7 University of Evansville        39777140
##  8 Marian University               30131884
##  9 Valparaiso University           27032437
## 10 University of Indianapolis      21225757
## # ℹ 36 more rows
# Names of the five institutions with the largest summed revenue in sample
# four.
top_5 <- sample_four |>
  group_by(institution_name) |>
  summarise(total_revenue = sum(total_rev_menwomen)) |>
  arrange(-total_revenue) |>
  head(5) |>
  pull(institution_name)

top_5
## [1] "University of Notre Dame"       "Indiana University-Bloomington"
## [3] "Purdue University-Main Campus"  "Butler University"             
## [5] "Ball State University"
# Box plot of per-row revenue for the top five institutions of sample four.
#
# The axis labels are given as a NAMED vector (full name = short label).
# A discrete axis orders character values alphabetically, so an unnamed label
# vector is applied by position and attaches the wrong short name to a box
# whenever the top five change. With names, each label is matched to its own
# institution; entries not present in the plot are ignored, and institutions
# without an entry keep their full name.
sample_four |>
  filter(institution_name %in% top_5) |>
  ggplot() +
  geom_boxplot(mapping = aes(x = institution_name, y = total_rev_menwomen)) +
  labs(
    title = "Sample Fourth: Institutions and Their Revenue",
    x = "Institutions",
    y = "Revenue"
  ) +
  scale_x_discrete(
    labels = c(
      "Ball State University" = "Ball State",
      "Butler University" = "Butler",
      "Indiana State University" = "Indiana State Uni",
      "Indiana University-Bloomington" = "IU B",
      "Purdue University-Main Campus" = "Purdue Lafayette",
      "University of Evansville" = "Uni of Evansville",
      "University of Indianapolis" = "Uni of Indy",
      "University of Notre Dame" = "Notre Dame"
    )
  ) +
  # Zoom the y axis without dropping observations from the box statistics.
  coord_cartesian(ylim = c(10000, 5000000))

Fifth Sample

# Fix the RNG seed (a value specific to this sample) so that this draw, the
# table below and the plot built from it are identical on every run. Without
# it, each knit produces a different sample and the hand-picked axis limits
# and written interpretation no longer match the output.
set.seed(105)

# Draw 1,000 rows from the cleaned data, with replacement.
sample_five <- cleaned_data[sample(nrow(cleaned_data), 1000, replace = TRUE), ]

# Total revenue per institution within this sample, largest first.
sample_five |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum))
## # A tibble: 46 × 2
##    institution_name                             sum
##    <chr>                                      <dbl>
##  1 University of Notre Dame               440006898
##  2 Purdue University-Main Campus          142551664
##  3 Indiana University-Bloomington          93249333
##  4 Ball State University                   46715666
##  5 University of Evansville                29974757
##  6 Indiana State University                29170996
##  7 Butler University                       29031479
##  8 University of Saint Francis-Fort Wayne  27217253
##  9 University of Indianapolis              25929673
## 10 Marian University                       23276168
## # ℹ 36 more rows
# Names of the five institutions with the largest summed revenue in sample
# five.
top_5 <- sample_five |>
  group_by(institution_name) |>
  summarise(total_revenue = sum(total_rev_menwomen)) |>
  arrange(-total_revenue) |>
  head(5) |>
  pull(institution_name)

top_5
## [1] "University of Notre Dame"       "Purdue University-Main Campus" 
## [3] "Indiana University-Bloomington" "Ball State University"         
## [5] "University of Evansville"
# Box plot of per-row revenue for the top five institutions of sample five.
#
# The axis labels are given as a NAMED vector (full name = short label).
# A discrete axis orders character values alphabetically, so an unnamed label
# vector is applied by position and attaches the wrong short name to a box
# whenever the top five change. With names, each label is matched to its own
# institution; entries not present in the plot are ignored, and institutions
# without an entry keep their full name.
sample_five |>
  filter(institution_name %in% top_5) |>
  ggplot() +
  geom_boxplot(mapping = aes(x = institution_name, y = total_rev_menwomen)) +
  labs(
    title = "Sample Fifth: Institutions and Their Revenue",
    x = "Institutions",
    y = "Revenue"
  ) +
  scale_x_discrete(
    labels = c(
      "Ball State University" = "Ball State",
      "Butler University" = "Butler",
      "Indiana State University" = "Indiana State Uni",
      "Indiana University-Bloomington" = "IU B",
      "Purdue University-Main Campus" = "Purdue Lafayette",
      "University of Evansville" = "Uni of Evansville",
      "University of Indianapolis" = "Uni of Indy",
      "University of Notre Dame" = "Notre Dame"
    )
  ) +
  # Zoom the y axis without dropping observations from the box statistics.
  coord_cartesian(ylim = c(10000, 4500000))

Differences: Dispersion varies considerably across the samples. The box plots for each sample show that every institution has a different dispersion.

Consistencies: We are getting consistent institutions in all samples. For example, we’re seeing Bloomington and Purdue in pretty much all samples.

In some samples, the box plot for an institution shows unusually high dispersion, which could be considered an anomaly. However, in other samples that same institution might look completely normal.

In the future, I won’t trust just one sample. I would take multiple samples to get a better picture of population parameters.