Import your data

# excel file
NCAA <- read_excel("../00_data/myData_charts.xls")
## New names:
## • `` -> `...1`
NCAA
## # A tibble: 16,383 × 29
##     ...1  year unitid institution_name         city_txt state_cd zip_text
##    <dbl> <dbl>  <dbl> <chr>                    <chr>    <chr>       <dbl>
##  1     1  2015 100654 Alabama A & M University Normal   AL          35762
##  2     2  2015 100654 Alabama A & M University Normal   AL          35762
##  3     3  2015 100654 Alabama A & M University Normal   AL          35762
##  4     4  2015 100654 Alabama A & M University Normal   AL          35762
##  5     5  2015 100654 Alabama A & M University Normal   AL          35762
##  6     6  2015 100654 Alabama A & M University Normal   AL          35762
##  7     7  2015 100654 Alabama A & M University Normal   AL          35762
##  8     8  2015 100654 Alabama A & M University Normal   AL          35762
##  9     9  2015 100654 Alabama A & M University Normal   AL          35762
## 10    10  2015 100654 Alabama A & M University Normal   AL          35762
## # ℹ 16,373 more rows
## # ℹ 22 more variables: classification_code <dbl>, classification_name <chr>,
## #   classification_other <chr>, ef_male_count <dbl>, ef_female_count <dbl>,
## #   ef_total_count <dbl>, sector_cd <dbl>, sector_name <chr>, sportscode <dbl>,
## #   partic_men <chr>, partic_women <chr>, partic_coed_men <chr>,
## #   partic_coed_women <chr>, sum_partic_men <dbl>, sum_partic_women <dbl>,
## #   rev_men <chr>, rev_women <chr>, total_rev_menwomen <chr>, exp_men <chr>, …
NCAA_small <- NCAA %>%
    
    select(institution_name, sports, rev_women, sum_partic_men, sum_partic_women) %>%
    
    filter(institution_name %in% c("University of Denver"))

Chapter 15

Create a factor

I’ll use sports

NCAA_small %>% count(sports)
## # A tibble: 10 × 2
##    sports                  n
##    <chr>               <int>
##  1 Basketball              1
##  2 Golf                    1
##  3 Gymnastics              1
##  4 Ice Hockey              1
##  5 Lacrosse                1
##  6 Skiing                  1
##  7 Soccer                  1
##  8 Swimming and Diving     1
##  9 Tennis                  1
## 10 Volleyball              1
x1 <- c("Lacrosse", "Swimming and Diving", "Ice Hockey", "Soccer", "Basketball", "Tenis", "Golf", "Gymnastic", "Volleyball")

sports_levels <- c("Lacrosse", "Swimming and Diving", "Ice Hockey", "Soccer", "Basketball", "Tenis", "Golf", "Gymnastic", "Volleyball")

NCAA_rev <- NCAA_small %>%
    mutate(sports = sports %>% factor(levels = sports_levels))
           
NCAA_rev
## # A tibble: 10 × 5
##    institution_name     sports         rev_women sum_partic_men sum_partic_women
##    <chr>                <fct>          <chr>              <dbl>            <dbl>
##  1 University of Denver Basketball     1729128               17               18
##  2 University of Denver Golf           617182                 7                7
##  3 University of Denver <NA>           1388115                0               14
##  4 University of Denver Ice Hockey     NA                    27                0
##  5 University of Denver Lacrosse       1271011               51               32
##  6 University of Denver <NA>           812527                 9               14
##  7 University of Denver Soccer         1356455               25               24
##  8 University of Denver Swimming and … 1122277               28               27
##  9 University of Denver <NA>           796467                10                8
## 10 University of Denver Volleyball     1349643                0               15
y1 <- factor(x1, levels = sports_levels)
y1
## [1] Lacrosse            Swimming and Diving Ice Hockey         
## [4] Soccer              Basketball          Tenis              
## [7] Golf                Gymnastic           Volleyball         
## 9 Levels: Lacrosse Swimming and Diving Ice Hockey Soccer Basketball ... Volleyball

Modify factor order

Make two bar charts here - one before ordering another after

Unordered

# Transform data: calculate average revenue in women sports by sports
partic_women_by_sports <- NCAA %>%
    group_by(sports) %>%
    summarise(
        avg_partic_women = mean(sum_partic_women, na.rm = TRUE)
        )

partic_women_by_sports
## # A tibble: 36 × 2
##    sports             avg_partic_women
##    <chr>                         <dbl>
##  1 All Track Combined            77.1 
##  2 Archery                        8.82
##  3 Badminton                      9   
##  4 Baseball                       0   
##  5 Basketball                    14.4 
##  6 Beach Volleyball              15.5 
##  7 Bowling                        9.35
##  8 Diving                         3.14
##  9 Equestrian                    31.0 
## 10 Fencing                       16.5 
## # ℹ 26 more rows
ggplot(partic_women_by_sports, aes(avg_partic_women, sports)) + geom_point()

ggplot(partic_women_by_sports, aes(avg_partic_women, fct_reorder(sports, avg_partic_women))) + geom_point()

Modify factor levels

Show examples of three functions:

  • fct_recode
NCAA %>%
  mutate(sports = fct_recode(sports,
    "basketball"          = "Basketball",
    "golf"                = "Golf",
    "gymnastics"          = "Gymnastics",
    "ice hockey"          = "Ice Hockey",
    "lacrosse"            = "Lacrosse",
    "skiing"              = "Skiing",
    "soccer"              = "Soccer",
    "swimming and diving" = "Swimming and Diving",
    "tennis"              = "Tennis",
    "volleyball"          = "Volleyball",
    "wrestling"           = "Wrestling",
    "all track combined"  = "All Track Combined",
    "badminton"           = "Badminton",
    "baseball"            = "Baseball",
    "beach volleyball"    = "Beach Volleyball",
    "bowling"             = "Bowling",
    "diving"              = "Diving",
    "equestrian"          = "Equestrian",
    "fencing"             = "Fencing",
    "archery"             = "Archery",
    "rowing"              = "Rowing",
    "track and field, outdoor" = "Track and Field, Outdoor",
    "field hockey"        = "Field Hockey",
    "softball"            = "Softball",
    "sailing"             = "Sailing",
    "rodeo"               = "Rodeo",
    "other sports"        = "Other Sports",
    "squash"              = "Squash",
    "rifle"               = "Rifle",
    "table tennis"        = "Table Tennis",
    "water polo"          = "Water Polo",
    "swimming"            = "Swimming",
    "track and field, x-country" = "Track and Field, X-Country",
    "football"            = "Football"
    )) %>%
  count(sports)
## # A tibble: 36 × 2
##    sports                 n
##    <fct>              <int>
##  1 all track combined   733
##  2 archery               11
##  3 badminton             14
##  4 baseball            1583
##  5 basketball          1878
##  6 beach volleyball      77
##  7 bowling              133
##  8 diving                 7
##  9 equestrian            67
## 10 fencing               41
## # ℹ 26 more rows
  • fct_collapse
NCAA_small %>%
  mutate(sports = fct_collapse(sports,
                               Lacrosse = "Lacrosse",
                               Soccer = "Soccer",
                               Other = c("Basketball", "Volleyball", "Gymnastics", "Skiing", "Tennis", "Golf", "Ice Hockey", "Swimming and Diving"))) %>%
             count(sports)
## # A tibble: 3 × 2
##   sports       n
##   <fct>    <int>
## 1 Other        8
## 2 Lacrosse     1
## 3 Soccer       1
  • fct_lump
NCAA_small %>%
    mutate(sports = fct_lump(sports)) %>%
    count(sports)
## # A tibble: 10 × 2
##    sports                  n
##    <fct>               <int>
##  1 Basketball              1
##  2 Golf                    1
##  3 Gymnastics              1
##  4 Ice Hockey              1
##  5 Lacrosse                1
##  6 Skiing                  1
##  7 Soccer                  1
##  8 Swimming and Diving     1
##  9 Tennis                  1
## 10 Volleyball              1

Chapter 16

No need to do anything here.