Get Data

load(url("https://github.com/robertwwalker/DADMStuff/raw/master/Discrimination2020.RData"))
str(Discrimination)

## 'data.frame':    1000 obs. of  7 variables:
##  $ ID            : num  10210 10409 10486 10538 10568 ...
##  $ Age.Cohort    : Factor w/ 6 levels " 6 - 12","0 - 5",..: 3 5 2 4 3 3 3 3 3 3 ...
##  $ Age           : num  17 37 3 19 13 15 13 17 14 13 ...
##  $ Gender        : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 1 1 2 1 2 ...
##  $ Expenditures  : num  2113 41924 1454 6400 4412 ...
##  $ Ethnicity     : Factor w/ 8 levels "American Indian",..: 8 8 4 4 8 4 8 3 8 4 ...
##  $ Ethnicity.Char: chr  "White not Hispanic" "White not Hispanic" "Hispanic" "Hispanic" ...

str(UCB.Micro)

## 'data.frame':    4526 obs. of  3 variables:
##  $ Admit: Factor w/ 2 levels "Admitted","Rejected": 1 1 1 1 1 1 1 1 1 1 ...
##  $ M.F  : Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Dept : Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "out.attrs")=List of 2
##   ..$ dim     : Named int [1:3] 2 2 6
##   .. ..- attr(*, "names")= chr [1:3] "Admit" "Gender" "Dept"
##   ..$ dimnames:List of 3
##   .. ..$ Admit : chr [1:2] "Admit=Admitted" "Admit=Rejected"
##   .. ..$ Gender: chr [1:2] "Gender=Male" "Gender=Female"
##   .. ..$ Dept  : chr [1:6] "Dept=A" "Dept=B" "Dept=C" "Dept=D" ...

#Just the Questions

Questions for Discrimination:

What is the average of expenditures for: (a) all males vs. all females, (b) all Hispanics and all White, non-Hispanics, (c) all 22-50 year olds, (d) all male, White non-Hispanics, and (e) all Asian, 22-50 year olds?

Discrimination %>% group_by(Gender) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 5
##   Gender  Means Medians    Q1     Q3
##   <fct>   <dbl>   <dbl> <dbl>  <dbl>
## 1 Female 18130.    6400 2872. 39488.
## 2 Male   18001.    7219 2954  37201

Discrimination %>% filter(Ethnicity %in% c("White not Hispanic","Hispanic")) %>% group_by(Ethnicity) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 5
##   Ethnicity           Means Medians    Q1     Q3
##   <fct>               <dbl>   <dbl> <dbl>  <dbl>
## 1 Hispanic           11066.    3952 2331. 10292.
## 2 White not Hispanic 24698.   15718 3977  43134

Discrimination %>% filter(Age > 21, Age < 51) %>% group_by(Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 1 x 5
##   Age.Cohort  Means Medians     Q1     Q3
##   <fct>       <dbl>   <dbl>  <dbl>  <dbl>
## 1 22-50      40209.  40456. 36447. 44721.

# White Males
Discrimination %>% filter(Gender == "Male", Ethnicity=="White not Hispanic") %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

##     Means Medians      Q1      Q3
## 1 24573.8 27390.5 4195.25 40817.5

Discrimination %>% filter(Age > 21, Age < 51, Ethnicity=="Asian") %>% group_by(Ethnicity, Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

## `summarise()` regrouping output by 'Ethnicity' (override with `.groups` argument)

## # A tibble: 1 x 6
## # Groups:   Ethnicity [1]
##   Ethnicity Age.Cohort  Means Medians    Q1    Q3
##   <fct>     <fct>       <dbl>   <dbl> <dbl> <dbl>
## 1 Asian     22-50      39581.   40240 33634 44293

What is the median of expenditures for: (a) all males vs. all females, (b) all Hispanics and all White, non-Hispanics, (c) all 13-17 year olds, (d) all male, White non-Hispanics, and (e) all Asian, 13-17 year olds?

Discrimination %>% group_by(Gender) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 5
##   Gender  Means Medians    Q1     Q3
##   <fct>   <dbl>   <dbl> <dbl>  <dbl>
## 1 Female 18130.    6400 2872. 39488.
## 2 Male   18001.    7219 2954  37201

Discrimination %>% filter(Ethnicity %in% c("White not Hispanic","Hispanic")) %>% group_by(Ethnicity) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 5
##   Ethnicity           Means Medians    Q1     Q3
##   <fct>               <dbl>   <dbl> <dbl>  <dbl>
## 1 Hispanic           11066.    3952 2331. 10292.
## 2 White not Hispanic 24698.   15718 3977  43134

Discrimination %>% filter(Age > 12, Age < 18) %>% group_by(Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 1 x 5
##   Age.Cohort Means Medians    Q1    Q3
##   <fct>      <dbl>   <dbl> <dbl> <dbl>
## 1 13-17      3923.    3952 3306. 4666.

# White Males
Discrimination %>% filter(Gender == "Male", Ethnicity=="White not Hispanic") %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

##     Means Medians      Q1      Q3
## 1 24573.8 27390.5 4195.25 40817.5

Discrimination %>% filter(Age > 12, Age < 18, Ethnicity=="Asian") %>% group_by(Ethnicity, Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

## `summarise()` regrouping output by 'Ethnicity' (override with `.groups` argument)

## # A tibble: 1 x 6
## # Groups:   Ethnicity [1]
##   Ethnicity Age.Cohort Means Medians    Q1    Q3
##   <fct>     <fct>      <dbl>   <dbl> <dbl> <dbl>
## 1 Asian     13-17      3509.   3628. 3071. 4104.

What is the range of the middle 50% of expenditures for: (a) all males vs. all females, (b) all Hispanics and all White, non-Hispanics, (c) all 13-17 year olds, (d) all male, White non-Hispanics, and (e) all Asian, 18-21 year olds?

Discrimination %>% group_by(Gender) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 5
##   Gender  Means Medians    Q1     Q3
##   <fct>   <dbl>   <dbl> <dbl>  <dbl>
## 1 Female 18130.    6400 2872. 39488.
## 2 Male   18001.    7219 2954  37201

Discrimination %>% filter(Ethnicity %in% c("White not Hispanic","Hispanic")) %>% group_by(Ethnicity) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 5
##   Ethnicity           Means Medians    Q1     Q3
##   <fct>               <dbl>   <dbl> <dbl>  <dbl>
## 1 Hispanic           11066.    3952 2331. 10292.
## 2 White not Hispanic 24698.   15718 3977  43134

Discrimination %>% filter(Age > 12, Age < 18) %>% group_by(Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 1 x 5
##   Age.Cohort Means Medians    Q1    Q3
##   <fct>      <dbl>   <dbl> <dbl> <dbl>
## 1 13-17      3923.    3952 3306. 4666.

# White Males
Discrimination %>% filter(Gender == "Male", Ethnicity=="White not Hispanic") %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

##     Means Medians      Q1      Q3
## 1 24573.8 27390.5 4195.25 40817.5

Discrimination %>% filter(Age > 17, Age < 22, Ethnicity=="Asian") %>% group_by(Ethnicity, Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))

## `summarise()` regrouping output by 'Ethnicity' (override with `.groups` argument)

## # A tibble: 1 x 6
## # Groups:   Ethnicity [1]
##   Ethnicity Age.Cohort Means Medians    Q1    Q3
##   <fct>     <fct>      <dbl>   <dbl> <dbl> <dbl>
## 1 Asian     18-21      9598.    9846  7683 11282

Does discrimination in expenditures exist? There are two relevant potential forms of actionable discrimination: gender and ethnicity. Evaluate these two questions with the appropriate pivot table equivalent defined for each.

We already saw Gender. No real difference For ethnicity, White not Hispanic and Hispanic are quite different.

Discrimination %>% group_by(Ethnicity) %>% skim(Expenditures)

Data summary
Name	Piped data
Number of rows	1000
Number of columns	7
_______________________
Column type frequency:
numeric	1
________________________
Group variables	Ethnicity

Variable type: numeric

skim_variable	Ethnicity	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Expenditures	American Indian	1	36438.25	25693.91	3726	22085.25	41817.5	56170.50	58392	▃▁▃▁▇
Expenditures	Asian	1	18392.37	19209.22	374	3382.00	9369.0	34274.00	75098	▇▁▂▂▁
Expenditures	Black	1	20884.59	20549.27	240	3870.00	8687.0	41857.00	60808	▇▁▁▃▂
Expenditures	Hispanic	1	11065.57	15629.85	222	2331.25	3952.0	10292.50	65581	▇▁▁▁▁
Expenditures	Multi Race	1	4456.73	7332.14	669	1689.75	2622.0	3749.50	38619	▇▁▁▁▁
Expenditures	Native Hawaiian	1	42782.33	6576.46	37479	39103.00	40727.0	45434.00	50141	▇▇▁▁▇
Expenditures	Other	1	3316.50	1836.36	2018	2667.25	3316.5	3965.75	4615	▇▁▁▁▇
Expenditures	White not Hispanic	1	24697.55	20604.38	340	3977.00	15718.0	43134.00	68890	▇▁▃▃▁

We should wonder if there is enough data for some groups.

Discrimination %>% ggplot() + aes(x=Ethnicity) + geom_bar() + coord_flip()

It is claimed that there are important disparities between Hispanic and White, not Hispanic. Is this true? Appears to be
Use group_by and skim for expenditures after filtering only Hispanics and White, not Hispanic.

Discrimination %>% filter(Ethnicity == "White not Hispanic"| Ethnicity == "Hispanic") %>% group_by(Ethnicity) %>% skim(Expenditures)

Data summary
Name	Piped data
Number of rows	777
Number of columns	7
_______________________
Column type frequency:
numeric	1
________________________
Group variables	Ethnicity

Variable type: numeric

skim_variable	Ethnicity	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Expenditures	Hispanic	0	1	11065.57	15629.85	222	2331.25	3952	10292.5	65581	▇▁▁▁▁
Expenditures	White not Hispanic	0	1	24697.55	20604.38	340	3977.00	15718	43134.0	68890	▇▁▃▃▁

Provide a plot of Expenditures by age cohort and ethnicity after filtering only Hispanics and White, not Hispanic.

Discrimination %>% filter(Ethnicity %in% c("White not Hispanic","Hispanic")) %>% ggplot() + aes(x=Ethnicity, y=Expenditures) + geom_boxplot() + facet_wrap(vars(Age.Cohort), scales = "free_y")

Unfortunately, they are out of order.

Summary

Consider the following additional issues keeping in mind that the data are a random sample from a broader population. (1) What is the relationship between age cohort and ethnicity for the comparison of White and Hispanic? (2) Are expenditures different by age cohorts? (3) How do the previous two answers interact to influence claims of discrimination?

UCB.Micro

Provide a table of M.F and Admit. Provide the two relevant tables of conditional probabilities for that table.

Are men and women admitted to graduate programs at the same rate?

library(janitor)
UCB.Micro %>% tabyl(M.F,Admit) %>% adorn_percentages("row")

##     M.F  Admitted  Rejected
##    Male 0.4451877 0.5548123
##  Female 0.3035422 0.6964578

Display and tabulate applications by department.

UCB.Micro %>% tabyl(Dept)

##  Dept   n   percent
##     A 933 0.2061423
##     B 585 0.1292532
##     C 918 0.2028281
##     D 792 0.1749890
##     E 584 0.1290323
##     F 714 0.1577552

UCB.Micro %>% tabyl(Dept,Admit)

##  Dept Admitted Rejected
##     A      601      332
##     B      370      215
##     C      322      596
##     D      269      523
##     E      147      437
##     F       46      668

Do any departments show differences in admissions rates by gender?

UCB.Micro %>% tabyl(M.F,Admit,Dept) %>% adorn_percentages("row")

## $A
##     M.F  Admitted  Rejected
##    Male 0.6206061 0.3793939
##  Female 0.8240741 0.1759259
## 
## $B
##     M.F  Admitted  Rejected
##    Male 0.6303571 0.3696429
##  Female 0.6800000 0.3200000
## 
## $C
##     M.F  Admitted  Rejected
##    Male 0.3692308 0.6307692
##  Female 0.3406408 0.6593592
## 
## $D
##     M.F  Admitted  Rejected
##    Male 0.3309353 0.6690647
##  Female 0.3493333 0.6506667
## 
## $E
##     M.F  Admitted  Rejected
##    Male 0.2774869 0.7225131
##  Female 0.2391858 0.7608142
## 
## $F
##     M.F   Admitted  Rejected
##    Male 0.05898123 0.9410188
##  Female 0.07038123 0.9296188

Discuss 1 and 2 in light of claims of discrimination.

The departments are of different size and have different rates of acceptance.

Provide a graphic that showcases the apparent disparities and a graphic that dispels them that combines all three dimensions of the table.

ggplot(UCB.Micro) +
 aes(x = M.F, fill = Admit) +
 geom_bar(position = "fill") +
 scale_fill_viridis_d(option = "cividis") +
 labs(title = "Discrimination in Admission at Berkeley?") +
 coord_flip() +
 theme_minimal() +
 facet_wrap(vars(Dept))

```

Discrimination stepby step

KAROL OROZCO

9/28/2020

Instal and load packages

Get Data

Summary

UCB.Micro