Instal and load packages

       library(tidyverse)
## -- Attaching packages ------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ---------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
       library(skimr)
       library(knitr)
       library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

Get Data

load(url("https://github.com/robertwwalker/DADMStuff/raw/master/Discrimination2020.RData"))
str(Discrimination)
## 'data.frame':    1000 obs. of  7 variables:
##  $ ID            : num  10210 10409 10486 10538 10568 ...
##  $ Age.Cohort    : Factor w/ 6 levels " 6 - 12","0 - 5",..: 3 5 2 4 3 3 3 3 3 3 ...
##  $ Age           : num  17 37 3 19 13 15 13 17 14 13 ...
##  $ Gender        : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 1 1 2 1 2 ...
##  $ Expenditures  : num  2113 41924 1454 6400 4412 ...
##  $ Ethnicity     : Factor w/ 8 levels "American Indian",..: 8 8 4 4 8 4 8 3 8 4 ...
##  $ Ethnicity.Char: chr  "White not Hispanic" "White not Hispanic" "Hispanic" "Hispanic" ...
str(UCB.Micro)
## 'data.frame':    4526 obs. of  3 variables:
##  $ Admit: Factor w/ 2 levels "Admitted","Rejected": 1 1 1 1 1 1 1 1 1 1 ...
##  $ M.F  : Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Dept : Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "out.attrs")=List of 2
##   ..$ dim     : Named int [1:3] 2 2 6
##   .. ..- attr(*, "names")= chr [1:3] "Admit" "Gender" "Dept"
##   ..$ dimnames:List of 3
##   .. ..$ Admit : chr [1:2] "Admit=Admitted" "Admit=Rejected"
##   .. ..$ Gender: chr [1:2] "Gender=Male" "Gender=Female"
##   .. ..$ Dept  : chr [1:6] "Dept=A" "Dept=B" "Dept=C" "Dept=D" ...

#Just the Questions

Questions for Discrimination:

  1. What is the average of expenditures for: (a) all males vs. all females, (b) all Hispanics and all White, non-Hispanics, (c) all 22-50 year olds, (d) all male, White non-Hispanics, and (e) all Asian, 22-50 year olds?
Discrimination %>% group_by(Gender) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 5
##   Gender  Means Medians    Q1     Q3
##   <fct>   <dbl>   <dbl> <dbl>  <dbl>
## 1 Female 18130.    6400 2872. 39488.
## 2 Male   18001.    7219 2954  37201
Discrimination %>% filter(Ethnicity %in% c("White not Hispanic","Hispanic")) %>% group_by(Ethnicity) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 5
##   Ethnicity           Means Medians    Q1     Q3
##   <fct>               <dbl>   <dbl> <dbl>  <dbl>
## 1 Hispanic           11066.    3952 2331. 10292.
## 2 White not Hispanic 24698.   15718 3977  43134
Discrimination %>% filter(Age > 21, Age < 51) %>% group_by(Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 1 x 5
##   Age.Cohort  Means Medians     Q1     Q3
##   <fct>       <dbl>   <dbl>  <dbl>  <dbl>
## 1 22-50      40209.  40456. 36447. 44721.
# White Males
Discrimination %>% filter(Gender == "Male", Ethnicity=="White not Hispanic") %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
##     Means Medians      Q1      Q3
## 1 24573.8 27390.5 4195.25 40817.5
Discrimination %>% filter(Age > 21, Age < 51, Ethnicity=="Asian") %>% group_by(Ethnicity, Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` regrouping output by 'Ethnicity' (override with `.groups` argument)
## # A tibble: 1 x 6
## # Groups:   Ethnicity [1]
##   Ethnicity Age.Cohort  Means Medians    Q1    Q3
##   <fct>     <fct>       <dbl>   <dbl> <dbl> <dbl>
## 1 Asian     22-50      39581.   40240 33634 44293
  1. What is the median of expenditures for: (a) all males vs. all females, (b) all Hispanics and all White, non-Hispanics, (c) all 13-17 year olds, (d) all male, White non-Hispanics, and (e) all Asian, 13-17 year olds?
Discrimination %>% group_by(Gender) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 5
##   Gender  Means Medians    Q1     Q3
##   <fct>   <dbl>   <dbl> <dbl>  <dbl>
## 1 Female 18130.    6400 2872. 39488.
## 2 Male   18001.    7219 2954  37201
Discrimination %>% filter(Ethnicity %in% c("White not Hispanic","Hispanic")) %>% group_by(Ethnicity) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 5
##   Ethnicity           Means Medians    Q1     Q3
##   <fct>               <dbl>   <dbl> <dbl>  <dbl>
## 1 Hispanic           11066.    3952 2331. 10292.
## 2 White not Hispanic 24698.   15718 3977  43134
Discrimination %>% filter(Age > 12, Age < 18) %>% group_by(Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 1 x 5
##   Age.Cohort Means Medians    Q1    Q3
##   <fct>      <dbl>   <dbl> <dbl> <dbl>
## 1 13-17      3923.    3952 3306. 4666.
# White Males
Discrimination %>% filter(Gender == "Male", Ethnicity=="White not Hispanic") %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
##     Means Medians      Q1      Q3
## 1 24573.8 27390.5 4195.25 40817.5
Discrimination %>% filter(Age > 12, Age < 18, Ethnicity=="Asian") %>% group_by(Ethnicity, Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` regrouping output by 'Ethnicity' (override with `.groups` argument)
## # A tibble: 1 x 6
## # Groups:   Ethnicity [1]
##   Ethnicity Age.Cohort Means Medians    Q1    Q3
##   <fct>     <fct>      <dbl>   <dbl> <dbl> <dbl>
## 1 Asian     13-17      3509.   3628. 3071. 4104.
  1. What is the range of the middle 50% of expenditures for: (a) all males vs. all females, (b) all Hispanics and all White, non-Hispanics, (c) all 13-17 year olds, (d) all male, White non-Hispanics, and (e) all Asian, 18-21 year olds?
Discrimination %>% group_by(Gender) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 5
##   Gender  Means Medians    Q1     Q3
##   <fct>   <dbl>   <dbl> <dbl>  <dbl>
## 1 Female 18130.    6400 2872. 39488.
## 2 Male   18001.    7219 2954  37201
Discrimination %>% filter(Ethnicity %in% c("White not Hispanic","Hispanic")) %>% group_by(Ethnicity) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 5
##   Ethnicity           Means Medians    Q1     Q3
##   <fct>               <dbl>   <dbl> <dbl>  <dbl>
## 1 Hispanic           11066.    3952 2331. 10292.
## 2 White not Hispanic 24698.   15718 3977  43134
Discrimination %>% filter(Age > 12, Age < 18) %>% group_by(Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 1 x 5
##   Age.Cohort Means Medians    Q1    Q3
##   <fct>      <dbl>   <dbl> <dbl> <dbl>
## 1 13-17      3923.    3952 3306. 4666.
# White Males
Discrimination %>% filter(Gender == "Male", Ethnicity=="White not Hispanic") %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
##     Means Medians      Q1      Q3
## 1 24573.8 27390.5 4195.25 40817.5
Discrimination %>% filter(Age > 17, Age < 22, Ethnicity=="Asian") %>% group_by(Ethnicity, Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` regrouping output by 'Ethnicity' (override with `.groups` argument)
## # A tibble: 1 x 6
## # Groups:   Ethnicity [1]
##   Ethnicity Age.Cohort Means Medians    Q1    Q3
##   <fct>     <fct>      <dbl>   <dbl> <dbl> <dbl>
## 1 Asian     18-21      9598.    9846  7683 11282
  1. Does discrimination in expenditures exist? There are two relevant potential forms of actionable discrimination: gender and ethnicity. Evaluate these two questions with the appropriate pivot table equivalent defined for each.

We already saw Gender. No real difference For ethnicity, White not Hispanic and Hispanic are quite different.

Discrimination %>% group_by(Ethnicity) %>% skim(Expenditures)
Data summary
Name Piped data
Number of rows 1000
Number of columns 7
_______________________
Column type frequency:
numeric 1
________________________
Group variables Ethnicity

Variable type: numeric

skim_variable Ethnicity n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Expenditures American Indian 0 1 36438.25 25693.91 3726 22085.25 41817.5 56170.50 58392 ▃▁▃▁▇
Expenditures Asian 0 1 18392.37 19209.22 374 3382.00 9369.0 34274.00 75098 ▇▁▂▂▁
Expenditures Black 0 1 20884.59 20549.27 240 3870.00 8687.0 41857.00 60808 ▇▁▁▃▂
Expenditures Hispanic 0 1 11065.57 15629.85 222 2331.25 3952.0 10292.50 65581 ▇▁▁▁▁
Expenditures Multi Race 0 1 4456.73 7332.14 669 1689.75 2622.0 3749.50 38619 ▇▁▁▁▁
Expenditures Native Hawaiian 0 1 42782.33 6576.46 37479 39103.00 40727.0 45434.00 50141 ▇▇▁▁▇
Expenditures Other 0 1 3316.50 1836.36 2018 2667.25 3316.5 3965.75 4615 ▇▁▁▁▇
Expenditures White not Hispanic 0 1 24697.55 20604.38 340 3977.00 15718.0 43134.00 68890 ▇▁▃▃▁

We should wonder if there is enough data for some groups.

Discrimination %>% ggplot() + aes(x=Ethnicity) + geom_bar() + coord_flip()

  1. It is claimed that there are important disparities between Hispanic and White, not Hispanic. Is this true? Appears to be

  2. Use group_by and skim for expenditures after filtering only Hispanics and White, not Hispanic.

Discrimination %>% filter(Ethnicity == "White not Hispanic"| Ethnicity == "Hispanic") %>% group_by(Ethnicity) %>% skim(Expenditures)
Data summary
Name Piped data
Number of rows 777
Number of columns 7
_______________________
Column type frequency:
numeric 1
________________________
Group variables Ethnicity

Variable type: numeric

skim_variable Ethnicity n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Expenditures Hispanic 0 1 11065.57 15629.85 222 2331.25 3952 10292.5 65581 ▇▁▁▁▁
Expenditures White not Hispanic 0 1 24697.55 20604.38 340 3977.00 15718 43134.0 68890 ▇▁▃▃▁
  1. Provide a plot of Expenditures by age cohort and ethnicity after filtering only Hispanics and White, not Hispanic.
Discrimination %>% filter(Ethnicity %in% c("White not Hispanic","Hispanic")) %>% ggplot() + aes(x=Ethnicity, y=Expenditures) + geom_boxplot() + facet_wrap(vars(Age.Cohort), scales = "free_y")

Unfortunately, they are out of order.

Summary

Consider the following additional issues keeping in mind that the data are a random sample from a broader population. (1) What is the relationship between age cohort and ethnicity for the comparison of White and Hispanic? (2) Are expenditures different by age cohorts? (3) How do the previous two answers interact to influence claims of discrimination?

UCB.Micro

Provide a table of M.F and Admit. Provide the two relevant tables of conditional probabilities for that table.

  1. Are men and women admitted to graduate programs at the same rate?
library(janitor)
UCB.Micro %>% tabyl(M.F,Admit) %>% adorn_percentages("row")
##     M.F  Admitted  Rejected
##    Male 0.4451877 0.5548123
##  Female 0.3035422 0.6964578
  1. Display and tabulate applications by department.
UCB.Micro %>% tabyl(Dept)
##  Dept   n   percent
##     A 933 0.2061423
##     B 585 0.1292532
##     C 918 0.2028281
##     D 792 0.1749890
##     E 584 0.1290323
##     F 714 0.1577552
UCB.Micro %>% tabyl(Dept,Admit)
##  Dept Admitted Rejected
##     A      601      332
##     B      370      215
##     C      322      596
##     D      269      523
##     E      147      437
##     F       46      668
  1. Do any departments show differences in admissions rates by gender?
UCB.Micro %>% tabyl(M.F,Admit,Dept) %>% adorn_percentages("row")
## $A
##     M.F  Admitted  Rejected
##    Male 0.6206061 0.3793939
##  Female 0.8240741 0.1759259
## 
## $B
##     M.F  Admitted  Rejected
##    Male 0.6303571 0.3696429
##  Female 0.6800000 0.3200000
## 
## $C
##     M.F  Admitted  Rejected
##    Male 0.3692308 0.6307692
##  Female 0.3406408 0.6593592
## 
## $D
##     M.F  Admitted  Rejected
##    Male 0.3309353 0.6690647
##  Female 0.3493333 0.6506667
## 
## $E
##     M.F  Admitted  Rejected
##    Male 0.2774869 0.7225131
##  Female 0.2391858 0.7608142
## 
## $F
##     M.F   Admitted  Rejected
##    Male 0.05898123 0.9410188
##  Female 0.07038123 0.9296188
  1. Discuss 1 and 2 in light of claims of discrimination.

The departments are of different size and have different rates of acceptance.

  1. Provide a graphic that showcases the apparent disparities and a graphic that dispels them that combines all three dimensions of the table.
ggplot(UCB.Micro) +
 aes(x = M.F, fill = Admit) +
 geom_bar(position = "fill") +
 scale_fill_viridis_d(option = "cividis") +
 labs(title = "Discrimination in Admission at Berkeley?") +
 coord_flip() +
 theme_minimal() +
 facet_wrap(vars(Dept))

```