library(tidyverse)
## -- Attaching packages ------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ---------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(skimr)
library(knitr)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
load(url("https://github.com/robertwwalker/DADMStuff/raw/master/Discrimination2020.RData"))
str(Discrimination)
## 'data.frame': 1000 obs. of 7 variables:
## $ ID : num 10210 10409 10486 10538 10568 ...
## $ Age.Cohort : Factor w/ 6 levels " 6 - 12","0 - 5",..: 3 5 2 4 3 3 3 3 3 3 ...
## $ Age : num 17 37 3 19 13 15 13 17 14 13 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 1 1 2 1 2 ...
## $ Expenditures : num 2113 41924 1454 6400 4412 ...
## $ Ethnicity : Factor w/ 8 levels "American Indian",..: 8 8 4 4 8 4 8 3 8 4 ...
## $ Ethnicity.Char: chr "White not Hispanic" "White not Hispanic" "Hispanic" "Hispanic" ...
str(UCB.Micro)
## 'data.frame': 4526 obs. of 3 variables:
## $ Admit: Factor w/ 2 levels "Admitted","Rejected": 1 1 1 1 1 1 1 1 1 1 ...
## $ M.F : Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "out.attrs")=List of 2
## ..$ dim : Named int [1:3] 2 2 6
## .. ..- attr(*, "names")= chr [1:3] "Admit" "Gender" "Dept"
## ..$ dimnames:List of 3
## .. ..$ Admit : chr [1:2] "Admit=Admitted" "Admit=Rejected"
## .. ..$ Gender: chr [1:2] "Gender=Male" "Gender=Female"
## .. ..$ Dept : chr [1:6] "Dept=A" "Dept=B" "Dept=C" "Dept=D" ...
#Just the Questions
Questions for Discrimination:
Discrimination %>% group_by(Gender) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 5
## Gender Means Medians Q1 Q3
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Female 18130. 6400 2872. 39488.
## 2 Male 18001. 7219 2954 37201
Discrimination %>% filter(Ethnicity %in% c("White not Hispanic","Hispanic")) %>% group_by(Ethnicity) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 5
## Ethnicity Means Medians Q1 Q3
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Hispanic 11066. 3952 2331. 10292.
## 2 White not Hispanic 24698. 15718 3977 43134
Discrimination %>% filter(Age > 21, Age < 51) %>% group_by(Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 1 x 5
## Age.Cohort Means Medians Q1 Q3
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 22-50 40209. 40456. 36447. 44721.
# White Males
Discrimination %>% filter(Gender == "Male", Ethnicity=="White not Hispanic") %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## Means Medians Q1 Q3
## 1 24573.8 27390.5 4195.25 40817.5
Discrimination %>% filter(Age > 21, Age < 51, Ethnicity=="Asian") %>% group_by(Ethnicity, Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` regrouping output by 'Ethnicity' (override with `.groups` argument)
## # A tibble: 1 x 6
## # Groups: Ethnicity [1]
## Ethnicity Age.Cohort Means Medians Q1 Q3
## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Asian 22-50 39581. 40240 33634 44293
Discrimination %>% group_by(Gender) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 5
## Gender Means Medians Q1 Q3
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Female 18130. 6400 2872. 39488.
## 2 Male 18001. 7219 2954 37201
Discrimination %>% filter(Ethnicity %in% c("White not Hispanic","Hispanic")) %>% group_by(Ethnicity) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 5
## Ethnicity Means Medians Q1 Q3
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Hispanic 11066. 3952 2331. 10292.
## 2 White not Hispanic 24698. 15718 3977 43134
Discrimination %>% filter(Age > 12, Age < 18) %>% group_by(Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 1 x 5
## Age.Cohort Means Medians Q1 Q3
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 13-17 3923. 3952 3306. 4666.
# White Males
Discrimination %>% filter(Gender == "Male", Ethnicity=="White not Hispanic") %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## Means Medians Q1 Q3
## 1 24573.8 27390.5 4195.25 40817.5
Discrimination %>% filter(Age > 12, Age < 18, Ethnicity=="Asian") %>% group_by(Ethnicity, Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` regrouping output by 'Ethnicity' (override with `.groups` argument)
## # A tibble: 1 x 6
## # Groups: Ethnicity [1]
## Ethnicity Age.Cohort Means Medians Q1 Q3
## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Asian 13-17 3509. 3628. 3071. 4104.
Discrimination %>% group_by(Gender) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 5
## Gender Means Medians Q1 Q3
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Female 18130. 6400 2872. 39488.
## 2 Male 18001. 7219 2954 37201
Discrimination %>% filter(Ethnicity %in% c("White not Hispanic","Hispanic")) %>% group_by(Ethnicity) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 5
## Ethnicity Means Medians Q1 Q3
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Hispanic 11066. 3952 2331. 10292.
## 2 White not Hispanic 24698. 15718 3977 43134
Discrimination %>% filter(Age > 12, Age < 18) %>% group_by(Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 1 x 5
## Age.Cohort Means Medians Q1 Q3
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 13-17 3923. 3952 3306. 4666.
# White Males
Discrimination %>% filter(Gender == "Male", Ethnicity=="White not Hispanic") %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## Means Medians Q1 Q3
## 1 24573.8 27390.5 4195.25 40817.5
Discrimination %>% filter(Age > 17, Age < 22, Ethnicity=="Asian") %>% group_by(Ethnicity, Age.Cohort) %>% summarise(Means = mean(Expenditures), Medians = median(Expenditures), Q1 = quantile(Expenditures, probs=0.25), Q3 = quantile(Expenditures, probs=0.75))
## `summarise()` regrouping output by 'Ethnicity' (override with `.groups` argument)
## # A tibble: 1 x 6
## # Groups: Ethnicity [1]
## Ethnicity Age.Cohort Means Medians Q1 Q3
## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Asian 18-21 9598. 9846 7683 11282
We already saw Gender. No real difference For ethnicity, White not Hispanic and Hispanic are quite different.
Discrimination %>% group_by(Ethnicity) %>% skim(Expenditures)
| Name | Piped data |
| Number of rows | 1000 |
| Number of columns | 7 |
| _______________________ | |
| Column type frequency: | |
| numeric | 1 |
| ________________________ | |
| Group variables | Ethnicity |
Variable type: numeric
| skim_variable | Ethnicity | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Expenditures | American Indian | 0 | 1 | 36438.25 | 25693.91 | 3726 | 22085.25 | 41817.5 | 56170.50 | 58392 | ▃▁▃▁▇ |
| Expenditures | Asian | 0 | 1 | 18392.37 | 19209.22 | 374 | 3382.00 | 9369.0 | 34274.00 | 75098 | ▇▁▂▂▁ |
| Expenditures | Black | 0 | 1 | 20884.59 | 20549.27 | 240 | 3870.00 | 8687.0 | 41857.00 | 60808 | ▇▁▁▃▂ |
| Expenditures | Hispanic | 0 | 1 | 11065.57 | 15629.85 | 222 | 2331.25 | 3952.0 | 10292.50 | 65581 | ▇▁▁▁▁ |
| Expenditures | Multi Race | 0 | 1 | 4456.73 | 7332.14 | 669 | 1689.75 | 2622.0 | 3749.50 | 38619 | ▇▁▁▁▁ |
| Expenditures | Native Hawaiian | 0 | 1 | 42782.33 | 6576.46 | 37479 | 39103.00 | 40727.0 | 45434.00 | 50141 | ▇▇▁▁▇ |
| Expenditures | Other | 0 | 1 | 3316.50 | 1836.36 | 2018 | 2667.25 | 3316.5 | 3965.75 | 4615 | ▇▁▁▁▇ |
| Expenditures | White not Hispanic | 0 | 1 | 24697.55 | 20604.38 | 340 | 3977.00 | 15718.0 | 43134.00 | 68890 | ▇▁▃▃▁ |
We should wonder if there is enough data for some groups.
Discrimination %>% ggplot() + aes(x=Ethnicity) + geom_bar() + coord_flip()
It is claimed that there are important disparities between Hispanic and White, not Hispanic. Is this true? Appears to be
Use group_by and skim for expenditures after filtering only Hispanics and White, not Hispanic.
Discrimination %>% filter(Ethnicity == "White not Hispanic"| Ethnicity == "Hispanic") %>% group_by(Ethnicity) %>% skim(Expenditures)
| Name | Piped data |
| Number of rows | 777 |
| Number of columns | 7 |
| _______________________ | |
| Column type frequency: | |
| numeric | 1 |
| ________________________ | |
| Group variables | Ethnicity |
Variable type: numeric
| skim_variable | Ethnicity | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Expenditures | Hispanic | 0 | 1 | 11065.57 | 15629.85 | 222 | 2331.25 | 3952 | 10292.5 | 65581 | ▇▁▁▁▁ |
| Expenditures | White not Hispanic | 0 | 1 | 24697.55 | 20604.38 | 340 | 3977.00 | 15718 | 43134.0 | 68890 | ▇▁▃▃▁ |
Discrimination %>% filter(Ethnicity %in% c("White not Hispanic","Hispanic")) %>% ggplot() + aes(x=Ethnicity, y=Expenditures) + geom_boxplot() + facet_wrap(vars(Age.Cohort), scales = "free_y")
Unfortunately, they are out of order.
Consider the following additional issues keeping in mind that the data are a random sample from a broader population. (1) What is the relationship between age cohort and ethnicity for the comparison of White and Hispanic? (2) Are expenditures different by age cohorts? (3) How do the previous two answers interact to influence claims of discrimination?
Provide a table of M.F and Admit. Provide the two relevant tables of conditional probabilities for that table.
library(janitor)
UCB.Micro %>% tabyl(M.F,Admit) %>% adorn_percentages("row")
## M.F Admitted Rejected
## Male 0.4451877 0.5548123
## Female 0.3035422 0.6964578
UCB.Micro %>% tabyl(Dept)
## Dept n percent
## A 933 0.2061423
## B 585 0.1292532
## C 918 0.2028281
## D 792 0.1749890
## E 584 0.1290323
## F 714 0.1577552
UCB.Micro %>% tabyl(Dept,Admit)
## Dept Admitted Rejected
## A 601 332
## B 370 215
## C 322 596
## D 269 523
## E 147 437
## F 46 668
UCB.Micro %>% tabyl(M.F,Admit,Dept) %>% adorn_percentages("row")
## $A
## M.F Admitted Rejected
## Male 0.6206061 0.3793939
## Female 0.8240741 0.1759259
##
## $B
## M.F Admitted Rejected
## Male 0.6303571 0.3696429
## Female 0.6800000 0.3200000
##
## $C
## M.F Admitted Rejected
## Male 0.3692308 0.6307692
## Female 0.3406408 0.6593592
##
## $D
## M.F Admitted Rejected
## Male 0.3309353 0.6690647
## Female 0.3493333 0.6506667
##
## $E
## M.F Admitted Rejected
## Male 0.2774869 0.7225131
## Female 0.2391858 0.7608142
##
## $F
## M.F Admitted Rejected
## Male 0.05898123 0.9410188
## Female 0.07038123 0.9296188
The departments are of different size and have different rates of acceptance.
ggplot(UCB.Micro) +
aes(x = M.F, fill = Admit) +
geom_bar(position = "fill") +
scale_fill_viridis_d(option = "cividis") +
labs(title = "Discrimination in Admission at Berkeley?") +
coord_flip() +
theme_minimal() +
facet_wrap(vars(Dept))
```