Import data

# Read the Excel workbook into the tibble `NCAA`.
NCAA <- readxl::read_excel(path = "../00_data/myData_charts.xls")
## New names:
## • `` -> `...1`
# Print the imported tibble to inspect its dimensions and column types.
NCAA
## # A tibble: 16,383 × 29
##     ...1  year unitid institution_name         city_txt state_cd zip_text
##    <dbl> <dbl>  <dbl> <chr>                    <chr>    <chr>       <dbl>
##  1     1  2015 100654 Alabama A & M University Normal   AL          35762
##  2     2  2015 100654 Alabama A & M University Normal   AL          35762
##  3     3  2015 100654 Alabama A & M University Normal   AL          35762
##  4     4  2015 100654 Alabama A & M University Normal   AL          35762
##  5     5  2015 100654 Alabama A & M University Normal   AL          35762
##  6     6  2015 100654 Alabama A & M University Normal   AL          35762
##  7     7  2015 100654 Alabama A & M University Normal   AL          35762
##  8     8  2015 100654 Alabama A & M University Normal   AL          35762
##  9     9  2015 100654 Alabama A & M University Normal   AL          35762
## 10    10  2015 100654 Alabama A & M University Normal   AL          35762
## # ℹ 16,373 more rows
## # ℹ 22 more variables: classification_code <dbl>, classification_name <chr>,
## #   classification_other <chr>, ef_male_count <dbl>, ef_female_count <dbl>,
## #   ef_total_count <dbl>, sector_cd <dbl>, sector_name <chr>, sportscode <dbl>,
## #   partic_men <chr>, partic_women <chr>, partic_coed_men <chr>,
## #   partic_coed_women <chr>, sum_partic_men <dbl>, sum_partic_women <dbl>,
## #   rev_men <chr>, rev_women <chr>, total_rev_menwomen <chr>, exp_men <chr>, …

Introduction

Questions

Variation

Visualizing distributions

# Bar chart of `rev_women` per sport for the University of Denver.
# `rev_women` is imported as character (see the `<chr>` type in the tibble
# printout), which would make ggplot2 treat the y axis as discrete labels
# instead of amounts, so it is converted to numeric before plotting.
NCAA %>%
  filter(institution_name == "University of Denver") %>%
  # NOTE(review): presumably the literal text "NA" is what forced this column
  # to character on import -- confirm against the spreadsheet. Any other
  # non-numeric text will surface as a coercion warning.
  mutate(rev_women = as.numeric(na_if(rev_women, "NA"))) %>%
  # Drop rows without a revenue figure so nothing is silently removed later.
  filter(!is.na(rev_women)) %>%
  ggplot(aes(x = sports, y = rev_women)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col()

# Tabulate how often each distinct `rev_women` value occurs.
count(NCAA, rev_women)
## # A tibble: 12,524 × 2
##    rev_women     n
##    <chr>     <int>
##  1 1000010       1
##  2 1000045       1
##  3 100025        1
##  4 100032        1
##  5 100040        1
##  6 1000445       1
##  7 100093        1
##  8 100099        1
##  9 1001          4
## 10 100137        1
## # ℹ 12,514 more rows
# Bars of `rev_women` per sport for the University of Denver, drawn from the
# values as given (no binning).
# `rev_women` is stored as character, so it is converted to numeric first;
# otherwise the y axis would be discrete text rather than amounts.
# `binwidth` has no effect with `stat = "identity"` (ggplot2 warns that it is
# ignored), so it is dropped and geom_col() is used instead.
NCAA %>%
  filter(institution_name == "University of Denver") %>%
  # NOTE(review): presumably the literal text "NA" is what forced this column
  # to character on import -- confirm against the spreadsheet.
  mutate(rev_women = as.numeric(na_if(rev_women, "NA"))) %>%
  filter(!is.na(rev_women)) %>%
  ggplot(aes(x = sports, y = rev_women)) +
  geom_col()

# Histogram of `sportscode` over all rows, one bin per unit of the code.
NCAA %>%
  ggplot(aes(x = sportscode)) +
  geom_histogram(binwidth = 1)

# Frequency polygons of `ef_female_count` for the University of Denver,
# one coloured line per sport.
NCAA %>%
  filter(institution_name == "University of Denver") %>%
  # The filtered rows arrive through the pipe as ggplot()'s `data`. Passing
  # `data = NCAA` here as well would override them and plot every
  # institution, so only the mapping is given.
  ggplot(mapping = aes(x = ef_female_count, colour = sports)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Typical values

# Histogram of `sum_partic_women` over all rows, with narrow 0.8-wide bins.
NCAA %>%
  ggplot(aes(x = sum_partic_women)) +
  geom_histogram(binwidth = 0.8)

Unusual values

# Histogram of `ef_female_count`, zoomed to counts of 0-50 on the y axis so
# that sparsely populated bins become visible. coord_cartesian() zooms
# without dropping any data from the binning.
ggplot(data = NCAA, mapping = aes(x = ef_female_count)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 50))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Missing values

# Scatter of `sum_partic_women` per sport, with unusual values (< 5 or > 200)
# replaced by NA instead of dropping the whole row.
NCAA %>%
  # Overwrite the column in place: writing the cleaned values to a separate
  # column would leave the plot below showing the raw, uncleaned data.
  mutate(
    sum_partic_women = ifelse(
      sum_partic_women < 5 | sum_partic_women > 200,
      NA,
      sum_partic_women
    )
  ) %>%
  # Plot
  ggplot(aes(x = sports, y = sum_partic_women)) +
  geom_point()

Covariation

A categorical and continuous variable

# Box plot of `sum_partic_women` for each value of `sports`.
ggplot(data = NCAA, mapping = aes(x = sports, y = sum_partic_women)) +
  geom_boxplot()

Two categorical variables

# Tile plot of how many rows fall into each (sports, sector_cd) combination;
# the fill colour encodes the row count `n` produced by count().
ggplot(
  data = count(NCAA, sports, sector_cd),
  mapping = aes(x = sports, y = sector_cd, fill = n)
) +
  geom_tile()

Two continuous variables

library(hexbin)

# Hexagonal-bin plot of `sum_partic_women` against `sum_partic_men`.
ggplot(data = NCAA, mapping = aes(x = sum_partic_men, y = sum_partic_women)) +
  geom_hex()

# Box plots of `sum_partic_women` within 0.1-wide bins of `sum_partic_men`,
# restricted to rows where `sum_partic_men` is below 200.
ggplot(
  data = filter(NCAA, sum_partic_men < 200),
  mapping = aes(x = sum_partic_men, y = sum_partic_women)
) +
  geom_boxplot(aes(group = cut_width(sum_partic_men, 0.1)))

Patterns and models

library(modelr)

# Keep only rows where both participation sums are strictly positive, so that
# the logarithms used in the model below are defined.
NCAA_clean <- filter(NCAA, sum_partic_men > 0, sum_partic_women > 0)

# Log-log linear model of `sum_partic_women` on `sum_partic_men`.
mod <- lm(log(sum_partic_women) ~ log(sum_partic_men), data = NCAA_clean)

# Attach the model residuals and exponentiate them to undo the log transform
# applied to the response.
NCAA_resid <- mutate(
  modelr::add_residuals(NCAA_clean, mod),
  resid = exp(resid)
)

# Scatter of the back-transformed residuals against `sum_partic_women`.
ggplot(data = NCAA_resid, mapping = aes(x = sum_partic_women, y = resid)) +
  geom_point()

# Box plot of the back-transformed residuals for each value of `sports`.
ggplot(data = NCAA_resid, mapping = aes(x = sports, y = resid)) +
  geom_boxplot()