Import data
# Read the NCAA workbook (.xls) from the sibling 00_data directory.
# NOTE(review): the printed tibble shows several numeric-looking columns
# (e.g. rev_women, partic_men) arriving as <chr>; check whether `na =` or
# `col_types =` should be set for this workbook.
NCAA <- read_excel(path = "../00_data/myData_charts.xls")
## New names:
## • `` -> `...1`
# Show the first rows and the column types of the imported tibble.
print(NCAA)
## # A tibble: 16,383 × 29
## ...1 year unitid institution_name city_txt state_cd zip_text
## <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 1 2015 100654 Alabama A & M University Normal AL 35762
## 2 2 2015 100654 Alabama A & M University Normal AL 35762
## 3 3 2015 100654 Alabama A & M University Normal AL 35762
## 4 4 2015 100654 Alabama A & M University Normal AL 35762
## 5 5 2015 100654 Alabama A & M University Normal AL 35762
## 6 6 2015 100654 Alabama A & M University Normal AL 35762
## 7 7 2015 100654 Alabama A & M University Normal AL 35762
## 8 8 2015 100654 Alabama A & M University Normal AL 35762
## 9 9 2015 100654 Alabama A & M University Normal AL 35762
## 10 10 2015 100654 Alabama A & M University Normal AL 35762
## # ℹ 16,373 more rows
## # ℹ 22 more variables: classification_code <dbl>, classification_name <chr>,
## # classification_other <chr>, ef_male_count <dbl>, ef_female_count <dbl>,
## # ef_total_count <dbl>, sector_cd <dbl>, sector_name <chr>, sportscode <dbl>,
## # partic_men <chr>, partic_women <chr>, partic_coed_men <chr>,
## # partic_coed_women <chr>, sum_partic_men <dbl>, sum_partic_women <dbl>,
## # rev_men <chr>, rev_women <chr>, total_rev_menwomen <chr>, exp_men <chr>, …
Introduction
Questions
Variation
Visualizing distributions
# Women's revenue per sport at the University of Denver.
# rev_women is stored as character, which ggplot would place on a discrete
# axis; convert it to numeric so bar heights reflect the actual amounts.
# geom_col() is the direct spelling of geom_bar(stat = "identity"); rows that
# share a sport are stacked into one bar.
NCAA %>%
  filter(institution_name == "University of Denver") %>%
  mutate(rev_women = as.numeric(rev_women)) %>%
  ggplot(aes(x = sports, y = rev_women)) +
  geom_col()

# Tabulate how often each distinct rev_women value occurs.
count(NCAA, rev_women)
## # A tibble: 12,524 × 2
## rev_women n
## <chr> <int>
## 1 1000010 1
## 2 1000045 1
## 3 100025 1
## 4 100032 1
## 5 100040 1
## 6 1000445 1
## 7 100093 1
## 8 100099 1
## 9 1001 4
## 10 100137 1
## # ℹ 12,514 more rows
# Distribution of women's revenue across University of Denver rows.
# The earlier form passed stat = "identity" to geom_histogram(), which turns
# it into a bar chart and makes ggplot discard `binwidth` (hence the warning).
# A histogram needs a single numeric x, so rev_women (stored as character) is
# converted first and rows without a revenue figure are dropped.
NCAA %>%
  filter(institution_name == "University of Denver") %>%
  mutate(rev_women = as.numeric(rev_women)) %>%
  filter(!is.na(rev_women)) %>%
  ggplot(aes(x = rev_women)) +
  geom_histogram(bins = 30)
## Warning in geom_histogram(stat = "identity", binwidth = 0.25): Ignoring unknown
## parameters: `binwidth` and `bins`

# Histogram of sportscode, one bin per unit of the code.
NCAA %>%
  ggplot(aes(x = sportscode)) +
  geom_histogram(binwidth = 1)

# Frequency polygons of female enrolment, one line per sport, restricted to
# the University of Denver.
# The filtered data must reach ggplot() through the pipe: also passing
# `data = NCAA` overrides it, so the filter above would be silently ignored.
NCAA %>%
  filter(institution_name == "University of Denver") %>%
  ggplot(mapping = aes(x = ef_female_count, colour = sports)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Typical values
# Histogram of the number of women participants.
# NOTE(review): sum_partic_women looks like a whole-number count; a bin width
# below 1 (previously 0.8) leaves periodic empty bins between integers, so one
# bin per count is used instead. Confirm the column holds whole numbers.
ggplot(data = NCAA, mapping = aes(x = sum_partic_women)) +
  geom_histogram(binwidth = 1)

Unusual values
# Zoom the y-axis to 0-50 so that sparsely populated bins of ef_female_count
# become visible; coord_cartesian() zooms without dropping any data.
ggplot(data = NCAA, mapping = aes(x = ef_female_count)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 50))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Missing values
# Replace implausible participant counts (< 5 or > 200) with NA, then plot.
# The masked values must overwrite sum_partic_women itself: writing them to a
# separate column while still plotting sum_partic_women leaves the plot
# unchanged. geom_point() will warn about the rows it drops because of NA.
NCAA %>%
  mutate(
    sum_partic_women = ifelse(
      sum_partic_women < 5 | sum_partic_women > 200,
      NA,
      sum_partic_women
    )
  ) %>%
  ggplot(aes(x = sports, y = sum_partic_women)) +
  geom_point()

Covariation
A categorical and continuous variable
# Distribution of women participants within each sport.
ggplot(data = NCAA, mapping = aes(x = sports, y = sum_partic_women)) +
  geom_boxplot()

Two categorical variables
# Heat map of how many rows fall into each sport / sector_cd combination;
# count() supplies the `n` column used for the fill.
ggplot(
  data = count(NCAA, sports, sector_cd),
  mapping = aes(x = sports, y = sector_cd, fill = n)
) +
  geom_tile()

Two continuous variables
# geom_hex() relies on the hexbin package for its binning.
library(hexbin)

# Hexagonal 2-D binning of men's versus women's participant counts.
ggplot(data = NCAA, mapping = aes(x = sum_partic_men, y = sum_partic_women)) +
  geom_hex()

# Box plots of women's participation within bins of men's participation.
# NOTE(review): with a bin width of 0.1 every distinct participant count got
# its own box, so nothing was actually binned; a width of 10 groups the
# 0-200 range into roughly twenty readable boxes. Adjust the width to taste.
NCAA %>%
  filter(sum_partic_men < 200) %>%
  ggplot(aes(x = sum_partic_men, y = sum_partic_women)) +
  geom_boxplot(aes(group = cut_width(sum_partic_men, 10)))

Patterns and models
library(modelr)

# Keep only rows where both participant counts are positive, so that the
# logarithms in the model below are finite.
NCAA_clean <- filter(NCAA, sum_partic_men > 0 & sum_partic_women > 0)

# Log-log linear model of women's participation on men's participation.
mod <- lm(log(sum_partic_women) ~ log(sum_partic_men), data = NCAA_clean)

# Attach the model residuals (column `resid`) and exponentiate them to undo
# the log on the response.
NCAA_resid <- NCAA_clean %>%
  add_residuals(mod) %>%
  mutate(resid = exp(resid))
# Residuals against the observed number of women participants.
ggplot(data = NCAA_resid, mapping = aes(x = sum_partic_women, y = resid)) +
  geom_point()

# Residuals broken down by sport.
ggplot(data = NCAA_resid, mapping = aes(x = sports, y = resid)) +
  geom_boxplot()
