library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data <- read_delim("./sports.csv", delim =",")
## Rows: 2936 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): institution_name, city_txt, state_cd, classification_name, classif...
## dbl (21): year, unitid, zip_text, classification_code, ef_male_count, ef_fem...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data |>
group_by(institution_name) |>
summarise(unique_values = table(institution_name))
## # A tibble: 46 × 2
## institution_name unique_values
## <chr> <table[1d]>
## 1 Ancilla College 57
## 2 Anderson University 76
## 3 Ball State University 86
## 4 Bethel College-Indiana 40
## 5 Bethel University 12
## 6 Butler University 81
## 7 Calumet College of Saint Joseph 77
## 8 DePauw University 87
## 9 Earlham College 52
## 10 Franklin College 85
## # ℹ 36 more rows
data |>
group_by(city_txt) |>
summarise(city_count = table(city_txt))
## # A tibble: 34 × 2
## city_txt city_count
## <chr> <table[1d]>
## 1 Anderson 76
## 2 Angola 94
## 3 Bloomington 94
## 4 Donaldson 57
## 5 Evansville 147
## 6 Fort Wayne 238
## 7 Franklin 85
## 8 Gary 54
## 9 Goshen 66
## 10 Greencastle 87
## # ℹ 24 more rows
min(data$rev_men, na.rm = TRUE)
## [1] 5911
max(data$rev_men, na.rm = TRUE)
## [1] 115510518
min(data$rev_women, na.rm = TRUE)
## [1] 2212
max(data$rev_women, na.rm = TRUE)
## [1] 6843854
mean(data$rev_men, na.rm = TRUE)
## [1] 1157502
mean(data$rev_women, na.rm = TRUE)
## [1] 278681.4
quantile(data$rev_men, probs = c(0.25,0.50,0.75), na.rm = TRUE)
## 25% 50% 75%
## 90074.5 198110.0 429557.5
quantile(data$rev_women, probs = c(0.25,0.50,0.75), na.rm = TRUE)
## 25% 50% 75%
## 80436 160575 332543
data %>%
aggregate(total_rev_menwomen ~ sector_name, data = ., FUN = mean)
## sector_name total_rev_menwomen
## 1 Private nonprofit, 2-year 121968.3
## 2 Private nonprofit, 4-year or above 878372.5
## 3 Public, 4-year or above 1675965.1
top_5 <- data |>
group_by(institution_name) |>
summarise(visualize = sum(total_rev_menwomen,na.rm = TRUE)) |>
arrange(desc(visualize)) |>
filter(visualize >= 90000000) |>
pluck("institution_name")
top_5
## [1] "University of Notre Dame" "Indiana University-Bloomington"
## [3] "Purdue University-Main Campus" "Ball State University"
## [5] "Butler University"
p <- data |>
filter(institution_name %in% top_5) |>
ggplot(aes(x = institution_name, y = total_rev_menwomen, fill = institution_name)) +
geom_bar(stat = "identity") +
theme_light() +
scale_fill_brewer(palette = "Dark2") +
scale_x_discrete(labels = c("Ball State","Butler", "IU B", "Purdue", "Notre Dame")) +
labs(
x = "Institutions",
y = "Total Revenue (men and women) in dollars",
title = "Top Indiana Universities' Sports Revenue"
)
p
## Warning: Removed 99 rows containing missing values or values outside the scale range
## (`geom_bar()`).
#### I used the top 5 colleges I gathered earlier and ran this bar plot
to see total revenue of each. Notre Dame generates the most revenue of
all according to our dataset. It is to be noted that the total revenue
is added together over the years.
data |>
filter(institution_name %in% top_5) |>
ggplot() +
geom_boxplot(mapping = aes(x= institution_name, y = total_rev_menwomen)) +
scale_x_discrete(labels = c("Ball State","Butler", "IU B", "Purdue", "Notre Dame")) +
labs(
title = "Distribution of top 5 colleges' sports revenue",
x = "Universities",
y = "Total Revenue"
) +
coord_cartesian(ylim = c(2000,2500000)) + # we needed to constrict y axis to show the plot because data is large
theme_minimal()
## Warning: Removed 99 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
#### I used this box plot to show the general view of the data. It is
interesting to see that Notre Dame’s median or even 75th percentile is
not that high compared to others. Yes, the total revenue is still the
highest for Notre Dame and that could be due to super high outliers (all
outliers couldn’t be shown in this plot). But, it is interesting to see
how bulk of the data is laid out for all.
library(ggthemes)
data |>
filter(institution_name == "Purdue University-Main Campus") |>
group_by(year)|>
summarise(total_revenue = sum(total_rev_menwomen, na.rm = TRUE)) |>
ggplot(aes(x = year, y = total_revenue)) +
geom_line() +
geom_point() +
theme_minimal() +
labs(
x = "Year",
y= "Total Revenue",
title = "Purdue University Revenue Over Time",
color = "Institution"
)