START
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
# Load the sports dataset from a comma-delimited file in the working directory.
data <- read_delim(file = "./sports.csv", delim = ",")
## Rows: 2936 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): institution_name, city_txt, state_cd, classification_name, classif...
## dbl (21): year, unitid, zip_text, classification_code, ef_male_count, ef_fem...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Prepare the data for sampling: keep a handful of categorical and numeric
# columns from the original dataset, then drop every row containing an NA.
cleaned_data <- data[, c(
  "institution_name",
  "city_txt",
  "classification_name",
  "total_rev_menwomen",
  "total_exp_menwomen"
)] |>
  na.omit()
cleaned_data
## # A tibble: 1,978 × 5
## institution_name city_txt classification_name total_rev_menwomen
## <chr> <chr> <chr> <dbl>
## 1 Ancilla College Donaldson NJCAA Division II 148226
## 2 Ancilla College Donaldson NJCAA Division II 232988
## 3 Ancilla College Donaldson NJCAA Division II 56770
## 4 Ancilla College Donaldson NJCAA Division II 226856
## 5 Ancilla College Donaldson NJCAA Division II 126341
## 6 Ancilla College Donaldson NJCAA Division II 80311
## 7 Anderson University Anderson NCAA Division III with foot… 160295
## 8 Anderson University Anderson NCAA Division III with foot… 246495
## 9 Anderson University Anderson NCAA Division III with foot… 188718
## 10 Anderson University Anderson NCAA Division III with foot… 286347
## # ℹ 1,968 more rows
## # ℹ 1 more variable: total_exp_menwomen <dbl>
First Sample
# Draw 1,000 rows from cleaned_data with replacement.
# NOTE(review): no set.seed() call is visible here, so each run presumably
# draws a different sample -- confirm whether that is intended.
sample_one <- cleaned_data[
  sample(nrow(cleaned_data), size = 1000, replace = TRUE),
]
# Summed revenue per institution within this sample, largest first.
sample_one |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum))
## # A tibble: 46 × 2
## institution_name sum
## <chr> <dbl>
## 1 University of Notre Dame 470876227
## 2 Purdue University-Main Campus 196303544
## 3 Indiana University-Bloomington 193237030
## 4 Butler University 52041679
## 5 Ball State University 49174790
## 6 University of Evansville 36008679
## 7 Indiana State University 32410933
## 8 University of Indianapolis 28979367
## 9 Indiana University-Purdue University-Indianapolis 27840735
## 10 University of Saint Francis-Fort Wayne 25953100
## # ℹ 36 more rows
# Names of the five institutions with the largest summed revenue in sample one.
top_5 <- sample_one |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum)) |>
  slice_head(n = 5) |>
  pull(institution_name)
top_5
## [1] "University of Notre Dame" "Purdue University-Main Campus"
## [3] "Indiana University-Bloomington" "Butler University"
## [5] "Ball State University"
Let’s visualize the first sample…
# Box plot of revenue for the top-5 institutions of sample one.
# Axis labels are given as a named vector (full name = short label) so ggplot2
# matches each label to its institution by value. The previous unnamed vector
# was applied positionally to the alphabetically ordered institution names,
# which attached the wrong label to several boxes. Institutions that are not in
# the map keep their full name on the axis.
sample_one |>
  filter(institution_name %in% top_5) |>
  ggplot() +
  geom_boxplot(mapping = aes(x = institution_name, y = total_rev_menwomen)) +
  labs(
    title = "Sample One Institutions and Their Revenue",
    x = "Institutions",
    y = "Revenue"
  ) +
  scale_x_discrete(labels = c(
    "Ball State University" = "Ball State",
    "Butler University" = "Butler",
    "Indiana State University" = "Indiana State Uni",
    "Indiana University-Bloomington" = "IU B",
    "Purdue University-Main Campus" = "Purdue Lafayette",
    "University of Evansville" = "Evansville",
    "University of Indianapolis" = "Uni of Indy",
    "University of Notre Dame" = "Notre Dame"
  )) +
  # coord_cartesian() only zooms the view; no rows are removed from the data.
  coord_cartesian(ylim = c(10000, 5000000))

Second Sample
# Second draw of 1,000 rows from cleaned_data, with replacement.
# NOTE(review): no set.seed() call is visible here, so each run presumably
# draws a different sample -- confirm whether that is intended.
sample_two <- cleaned_data[
  sample(nrow(cleaned_data), size = 1000, replace = TRUE),
]
# Summed revenue per institution within this sample, largest first.
sample_two |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum))
## # A tibble: 46 × 2
## institution_name sum
## <chr> <dbl>
## 1 Indiana University-Bloomington 277759736
## 2 University of Notre Dame 221182201
## 3 Purdue University-Main Campus 169788899
## 4 Butler University 56339439
## 5 Ball State University 48095413
## 6 University of Saint Francis-Fort Wayne 29749806
## 7 Marian University 27648010
## 8 University of Evansville 27279310
## 9 University of Indianapolis 26686548
## 10 Indiana State University 24455686
## # ℹ 36 more rows
# Names of the five institutions with the largest summed revenue in sample two.
top_5 <- sample_two |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum)) |>
  slice_head(n = 5) |>
  pull(institution_name)
top_5
## [1] "Indiana University-Bloomington" "University of Notre Dame"
## [3] "Purdue University-Main Campus" "Butler University"
## [5] "Ball State University"
Visualizing second sample…
# Box plot of revenue for the top-5 institutions of sample two.
# Axis labels are given as a named vector (full name = short label) so ggplot2
# matches each label to its institution by value. The previous unnamed vector
# was applied positionally to the alphabetically ordered institution names,
# which attached the wrong label to several boxes. Institutions that are not in
# the map keep their full name on the axis.
sample_two |>
  filter(institution_name %in% top_5) |>
  ggplot() +
  geom_boxplot(mapping = aes(x = institution_name, y = total_rev_menwomen)) +
  labs(
    title = "Sample Second: Institutions and Their Revenue",
    x = "Institutions",
    y = "Revenue"
  ) +
  scale_x_discrete(labels = c(
    "Ball State University" = "Ball State",
    "Butler University" = "Butler",
    "Indiana State University" = "Indiana State Uni",
    "Indiana University-Bloomington" = "IU B",
    "Purdue University-Main Campus" = "Purdue Lafayette",
    "University of Evansville" = "Evansville",
    "University of Indianapolis" = "Uni of Indy",
    "University of Notre Dame" = "Notre Dame"
  )) +
  # coord_cartesian() only zooms the view; no rows are removed from the data.
  coord_cartesian(ylim = c(10000, 2500000))

In this sample’s visualization, we did not see the unusually high
dispersion that shows up in some of the other samples.
Third Sample
# Third draw of 1,000 rows from cleaned_data, with replacement.
# NOTE(review): no set.seed() call is visible here, so each run presumably
# draws a different sample -- confirm whether that is intended.
sample_three <- cleaned_data[
  sample(nrow(cleaned_data), size = 1000, replace = TRUE),
]
# Summed revenue per institution within this sample, largest first.
sample_three |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum))
## # A tibble: 45 × 2
## institution_name sum
## <chr> <dbl>
## 1 University of Notre Dame 674721887
## 2 Indiana University-Bloomington 145203914
## 3 Purdue University-Main Campus 71034311
## 4 Indiana State University 48506301
## 5 Butler University 42007109
## 6 University of Indianapolis 35004148
## 7 Ball State University 26106485
## 8 Marian University 23413226
## 9 Valparaiso University 22888917
## 10 University of Saint Francis-Fort Wayne 22236507
## # ℹ 35 more rows
# Names of the five institutions with the largest summed revenue in sample
# three.
top_5 <- sample_three |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum)) |>
  slice_head(n = 5) |>
  pull(institution_name)
top_5
## [1] "University of Notre Dame" "Indiana University-Bloomington"
## [3] "Purdue University-Main Campus" "Indiana State University"
## [5] "Butler University"
Visualization
# Box plot of revenue for the top-5 institutions of sample three.
# Axis labels are given as a named vector (full name = short label) so ggplot2
# matches each label to its institution by value. The previous unnamed vector
# was applied positionally to the alphabetically ordered institution names,
# which attached the wrong label to several boxes. Institutions that are not in
# the map keep their full name on the axis.
sample_three |>
  filter(institution_name %in% top_5) |>
  ggplot() +
  geom_boxplot(mapping = aes(x = institution_name, y = total_rev_menwomen)) +
  labs(
    title = "Sample Third: Institutions and Their Revenue",
    x = "Institutions",
    y = "Revenue"
  ) +
  scale_x_discrete(labels = c(
    "Ball State University" = "Ball State",
    "Butler University" = "Butler",
    "Indiana State University" = "Indiana State Uni",
    "Indiana University-Bloomington" = "IU B",
    "Purdue University-Main Campus" = "Purdue Lafayette",
    "University of Evansville" = "Evansville",
    "University of Indianapolis" = "Uni of Indy",
    "University of Notre Dame" = "Notre Dame"
  )) +
  # coord_cartesian() only zooms the view; no rows are removed from the data.
  coord_cartesian(ylim = c(10000, 3000000))

Fourth Sample
# Fourth draw of 1,000 rows from cleaned_data, with replacement.
# NOTE(review): no set.seed() call is visible here, so each run presumably
# draws a different sample -- confirm whether that is intended.
sample_four <- cleaned_data[
  sample(nrow(cleaned_data), size = 1000, replace = TRUE),
]
# Summed revenue per institution within this sample, largest first.
sample_four |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum))
## # A tibble: 46 × 2
## institution_name sum
## <chr> <dbl>
## 1 University of Notre Dame 347051483
## 2 Indiana University-Bloomington 340859751
## 3 Purdue University-Main Campus 128403437
## 4 Butler University 65874017
## 5 Ball State University 54642492
## 6 Indiana State University 40435482
## 7 University of Evansville 39777140
## 8 Marian University 30131884
## 9 Valparaiso University 27032437
## 10 University of Indianapolis 21225757
## # ℹ 36 more rows
# Names of the five institutions with the largest summed revenue in sample four.
top_5 <- sample_four |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum)) |>
  slice_head(n = 5) |>
  pull(institution_name)
top_5
## [1] "University of Notre Dame" "Indiana University-Bloomington"
## [3] "Purdue University-Main Campus" "Butler University"
## [5] "Ball State University"
# Box plot of revenue for the top-5 institutions of sample four.
# Axis labels are given as a named vector (full name = short label) so ggplot2
# matches each label to its institution by value. The previous unnamed vector
# was applied positionally to the alphabetically ordered institution names,
# which attached the wrong label to at least one box. Institutions that are not
# in the map keep their full name on the axis.
sample_four |>
  filter(institution_name %in% top_5) |>
  ggplot() +
  geom_boxplot(mapping = aes(x = institution_name, y = total_rev_menwomen)) +
  labs(
    title = "Sample Fourth: Institutions and Their Revenue",
    x = "Institutions",
    y = "Revenue"
  ) +
  scale_x_discrete(labels = c(
    "Ball State University" = "Ball State",
    "Butler University" = "Butler",
    "Indiana State University" = "Indiana State Uni",
    "Indiana University-Bloomington" = "IU B",
    "Purdue University-Main Campus" = "Purdue Lafayette",
    "University of Evansville" = "Evansville",
    "University of Indianapolis" = "Uni of Indy",
    "University of Notre Dame" = "Notre Dame"
  )) +
  # coord_cartesian() only zooms the view; no rows are removed from the data.
  coord_cartesian(ylim = c(10000, 5000000))

Fifth Sample
# Fifth draw of 1,000 rows from cleaned_data, with replacement.
# NOTE(review): no set.seed() call is visible here, so each run presumably
# draws a different sample -- confirm whether that is intended.
sample_five <- cleaned_data[
  sample(nrow(cleaned_data), size = 1000, replace = TRUE),
]
# Summed revenue per institution within this sample, largest first.
sample_five |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum))
## # A tibble: 46 × 2
## institution_name sum
## <chr> <dbl>
## 1 University of Notre Dame 440006898
## 2 Purdue University-Main Campus 142551664
## 3 Indiana University-Bloomington 93249333
## 4 Ball State University 46715666
## 5 University of Evansville 29974757
## 6 Indiana State University 29170996
## 7 Butler University 29031479
## 8 University of Saint Francis-Fort Wayne 27217253
## 9 University of Indianapolis 25929673
## 10 Marian University 23276168
## # ℹ 36 more rows
# Names of the five institutions with the largest summed revenue in sample five.
top_5 <- sample_five |>
  group_by(institution_name) |>
  summarise(sum = sum(total_rev_menwomen)) |>
  arrange(desc(sum)) |>
  slice_head(n = 5) |>
  pull(institution_name)
top_5
## [1] "University of Notre Dame" "Purdue University-Main Campus"
## [3] "Indiana University-Bloomington" "Ball State University"
## [5] "University of Evansville"
# Box plot of revenue for the top-5 institutions of sample five.
# Axis labels are given as a named vector (full name = short label) so ggplot2
# matches each label to its institution by value. The previous unnamed vector
# was applied positionally to the alphabetically ordered institution names,
# which attached the wrong label to at least one box. Institutions that are not
# in the map keep their full name on the axis.
sample_five |>
  filter(institution_name %in% top_5) |>
  ggplot() +
  geom_boxplot(mapping = aes(x = institution_name, y = total_rev_menwomen)) +
  labs(
    title = "Sample Fifth: Institutions and Their Revenue",
    x = "Institutions",
    y = "Revenue"
  ) +
  scale_x_discrete(labels = c(
    "Ball State University" = "Ball State",
    "Butler University" = "Butler",
    "Indiana State University" = "Indiana State Uni",
    "Indiana University-Bloomington" = "IU B",
    "Purdue University-Main Campus" = "Purdue Lafayette",
    "University of Evansville" = "Evansville",
    "University of Indianapolis" = "Uni of Indy",
    "University of Notre Dame" = "Notre Dame"
  )) +
  # coord_cartesian() only zooms the view; no rows are removed from the data.
  coord_cartesian(ylim = c(10000, 4500000))

Differences: The dispersion is quite different from sample to sample. The
box plots for each sample show that the institutions all have different
dispersion.
Consistencies: The same institutions keep appearing across the
samples. For example, we see Bloomington and Purdue in the top five of
pretty much every sample.
In some samples, the box plots show unusually high dispersion for an
institution, which could be considered an anomaly. However, in other
samples that same institution might look completely normal.
In the future, I won’t trust just one sample. I would take multiple
samples to get a better picture of population parameters.