Installing tidyverse
# This works to get rid of errors
library(conflicted)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
conflict_prefer("lag", "dplyr")
## [conflicted] Will prefer dplyr::lag over any other package.
Link: https://github.com/rfordatascience/tidytuesday/blob/master/data/2022/2022-03-29/readme.md
ncaa <- read.csv("./ncaa_sports_1.csv", header = TRUE)
nrow(ncaa)
## [1] 78595
colnames(ncaa)
## [1] "year" "unitid" "institution_name"
## [4] "city_txt" "state_cd" "zip_text"
## [7] "classification_code" "classification_name" "classification_other"
## [10] "ef_male_count" "ef_female_count" "ef_total_count"
## [13] "sector_cd" "sector_name" "sportscode"
## [16] "partic_men" "partic_women" "partic_coed_men"
## [19] "partic_coed_women" "sum_partic_men" "sum_partic_women"
## [22] "rev_men" "rev_women" "total_rev_menwomen"
## [25] "exp_men" "exp_women" "total_exp_menwomen"
## [28] "sports" "pct_men"
Remove empty rows (21,133)
ncaa <- ncaa |>
filter(sum_partic_men + sum_partic_women > 0)
nrow(ncaa)
## [1] 57462
Separate mens and womens, then combine
2.a -> filter for men, remove all instances of women
men_ncaa <- ncaa |>
filter(sum_partic_men > 0)
men_ncaa[1:nrow(men_ncaa),'partic_women'] <- NA
men_ncaa[1:nrow(men_ncaa),'partic_coed_women'] <- NA
men_ncaa[1:nrow(men_ncaa),'sum_partic_women'] <- NA
men_ncaa[1:nrow(men_ncaa),'rev_women'] <- NA
men_ncaa[1:nrow(men_ncaa),'exp_women'] <- NA
2.b -> filter for women, remove all instances of men
women_ncaa <- ncaa |>
filter(sum_partic_women > 0)
women_ncaa[1:nrow(women_ncaa),'partic_men'] <- NA
women_ncaa[1:nrow(women_ncaa),'partic_coed_men'] <- NA
women_ncaa[1:nrow(women_ncaa),'sum_partic_men'] <- NA
women_ncaa[1:nrow(women_ncaa),'rev_men'] <- NA
women_ncaa[1:nrow(women_ncaa),'exp_men'] <- NA
print(nrow(men_ncaa))
## [1] 40418
print(nrow(women_ncaa))
## [1] 46454
2.c -> combine data frames together
ncaa_clean <- rbind(men_ncaa, women_ncaa)
path = './ncaa_clean.csv'
write.csv(ncaa_clean, path)
nrow(ncaa_clean)
## [1] 86872