Instal and load packages
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(skimr)
library(knitr)
#Discrimination
LOAD AND PREPARE DATA
library(readxl)
Discrimination <- read_excel("Discrimination.xlsx")
ANALYZE DATA
1. What is the average of expenditures for:
- All males vs. all females
Discrimination %>%
group_by(Gender) %>%
skim(Expenditures)
Data summary
| Name |
Piped data |
| Number of rows |
1000 |
| Number of columns |
6 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
1 |
| ________________________ |
|
| Group variables |
Gender |
Variable type: numeric
| Expenditures |
Female |
0 |
1 |
18129.61 |
20019.71 |
222 |
2872.5 |
6400 |
39487.5 |
75098 |
▇▁▂▂▁ |
| Expenditures |
Male |
0 |
1 |
18001.20 |
19068.01 |
386 |
2954.0 |
7219 |
37201.0 |
68890 |
▇▁▂▂▁ |
- Hispanics and all White not Hispanics
Discrimination %>%
select(Gender, Ethnicity, Expenditures)%>%
filter(Ethnicity == "White not Hispanic"|Ethnicity == "Hispanic")%>%
group_by(Ethnicity)%>%
skim(Expenditures)
Data summary
| Name |
Piped data |
| Number of rows |
777 |
| Number of columns |
3 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
1 |
| ________________________ |
|
| Group variables |
Ethnicity |
Variable type: numeric
| Expenditures |
Hispanic |
0 |
1 |
11065.57 |
15629.85 |
222 |
2331.25 |
3952 |
10292.5 |
65581 |
▇▁▁▁▁ |
| Expenditures |
White not Hispanic |
0 |
1 |
24697.55 |
20604.38 |
340 |
3977.00 |
15718 |
43134.0 |
68890 |
▇▁▃▃▁ |
- All 22-50 year old
Discrimination %>%
select(Age, Expenditures)%>%
filter(Age>21,Age<51)%>%
group_by(Age)%>%
skim(Expenditures)
Data summary
| Name |
Piped data |
| Number of rows |
226 |
| Number of columns |
2 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
1 |
| ________________________ |
|
| Group variables |
Age |
Variable type: numeric
| Expenditures |
22 |
0 |
1 |
39088.28 |
7428.45 |
27058 |
33062.50 |
40444.0 |
43966.00 |
50795 |
▆▂▇▆▆ |
| Expenditures |
23 |
0 |
1 |
39657.91 |
7262.15 |
25348 |
36265.25 |
40276.5 |
43997.50 |
50542 |
▃▂▇▆▆ |
| Expenditures |
24 |
0 |
1 |
41715.00 |
6256.03 |
30803 |
37191.50 |
41586.0 |
46859.00 |
51392 |
▃▇▅▅▇ |
| Expenditures |
25 |
0 |
1 |
41253.63 |
5058.41 |
31943 |
38352.00 |
40999.0 |
45676.50 |
49142 |
▃▅▇▁▇ |
| Expenditures |
26 |
0 |
1 |
38700.58 |
4566.97 |
30504 |
36058.75 |
38595.5 |
41635.25 |
45507 |
▂▇▅▇▇ |
| Expenditures |
27 |
0 |
1 |
40239.52 |
6389.62 |
28581 |
35888.50 |
38331.0 |
45657.00 |
50353 |
▃▇▇▃▇ |
| Expenditures |
28 |
0 |
1 |
36582.56 |
7214.79 |
26178 |
31027.00 |
37247.0 |
41464.00 |
48968 |
▇▇▇▇▃ |
| Expenditures |
29 |
0 |
1 |
40469.70 |
3291.66 |
34708 |
38830.50 |
40884.0 |
41847.75 |
46356 |
▃▂▇▃▂ |
| Expenditures |
30 |
0 |
1 |
41699.59 |
4835.71 |
33721 |
40125.00 |
40927.0 |
44910.00 |
51486 |
▇▇▇▆▃ |
| Expenditures |
31 |
0 |
1 |
39863.29 |
5592.99 |
33051 |
36514.00 |
39172.0 |
43007.00 |
47778 |
▅▁▇▁▅ |
| Expenditures |
32 |
0 |
1 |
37234.38 |
5118.98 |
28205 |
33630.00 |
37637.0 |
40587.00 |
45017 |
▃▃▇▃▆ |
| Expenditures |
33 |
0 |
1 |
39533.58 |
5205.37 |
27090 |
38117.50 |
40649.0 |
42313.25 |
47204 |
▂▂▅▇▃ |
| Expenditures |
34 |
0 |
1 |
42266.67 |
2876.00 |
39521 |
40344.25 |
41034.0 |
44437.25 |
46325 |
▇▂▁▁▅ |
| Expenditures |
35 |
0 |
1 |
34390.40 |
5517.50 |
27417 |
32529.00 |
33070.0 |
36607.00 |
42329 |
▃▇▁▃▃ |
| Expenditures |
36 |
0 |
1 |
35263.33 |
5300.06 |
29144 |
33695.00 |
38246.0 |
38323.00 |
38400 |
▃▁▁▁▇ |
| Expenditures |
37 |
0 |
1 |
38900.80 |
4953.63 |
30627 |
37948.00 |
41550.0 |
41924.00 |
42455 |
▂▁▁▂▇ |
| Expenditures |
38 |
0 |
1 |
39091.50 |
10567.09 |
29709 |
33743.25 |
36233.0 |
41581.25 |
54191 |
▃▇▁▁▃ |
| Expenditures |
39 |
0 |
1 |
38849.60 |
10259.48 |
31076 |
33259.00 |
36576.0 |
36621.00 |
56716 |
▇▇▁▁▃ |
| Expenditures |
40 |
0 |
1 |
44693.00 |
2924.17 |
42840 |
43007.50 |
43175.0 |
45619.50 |
48064 |
▇▁▁▁▃ |
| Expenditures |
41 |
0 |
1 |
48919.75 |
4990.17 |
42139 |
47452.00 |
49712.5 |
51180.25 |
54115 |
▇▁▇▇▇ |
| Expenditures |
42 |
0 |
1 |
44500.00 |
9311.18 |
37916 |
41208.00 |
44500.0 |
47792.00 |
51084 |
▇▁▁▁▇ |
| Expenditures |
44 |
0 |
1 |
46144.00 |
NA |
46144 |
46144.00 |
46144.0 |
46144.00 |
46144 |
▁▁▇▁▁ |
| Expenditures |
45 |
0 |
1 |
48218.00 |
2950.05 |
46132 |
47175.00 |
48218.0 |
49261.00 |
50304 |
▇▁▁▁▇ |
| Expenditures |
46 |
0 |
1 |
52222.00 |
NA |
52222 |
52222.00 |
52222.0 |
52222.00 |
52222 |
▁▁▇▁▁ |
| Expenditures |
48 |
0 |
1 |
49424.00 |
NA |
49424 |
49424.00 |
49424.0 |
49424.00 |
49424 |
▁▁▇▁▁ |
- All male, White not Hispanics
allmwnh <- Discrimination %>%
select(Gender, Ethnicity, Expenditures)%>%
filter(Gender == "Male", Ethnicity == "White not Hispanic")%>%
skim(Expenditures)
print(allmwnh)
## -- Data Summary ------------------------
## Values
## Name Piped data
## Number of rows 196
## Number of columns 3
## _______________________
## Column type frequency:
## numeric 1
## ________________________
## Group variables None
##
## -- Variable type: numeric ------------------------------------------------------
## # A tibble: 1 x 11
## skim_variable n_missing complete_rate mean sd p0 p25 p50 p75
## * <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Expenditures 0 1 24574. 19828. 650 4195. 27390. 40818.
## p100 hist
## * <dbl> <chr>
## 1 68890 <U+2587><U+2581><U+2585><U+2583><U+2581>
- All Asian, 22-50 year old
Discrimination %>%
select(Age, Ethnicity, Expenditures)%>%
filter(Age>21, Age<51, Ethnicity == "Asian")%>%
group_by(Age)%>%
skim(Expenditures)
Data summary
| Name |
Piped data |
| Number of rows |
29 |
| Number of columns |
3 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
1 |
| ________________________ |
|
| Group variables |
Age |
Variable type: numeric
| Expenditures |
22 |
0 |
1 |
40240.00 |
NA |
40240 |
40240.00 |
40240.0 |
40240.00 |
40240 |
▁▁▇▁▁ |
| Expenditures |
23 |
0 |
1 |
37698.50 |
11102.48 |
25348 |
30595.00 |
37452.0 |
44555.50 |
50542 |
▇▇▁▇▇ |
| Expenditures |
24 |
0 |
1 |
40359.25 |
9199.66 |
30803 |
33406.25 |
40592.5 |
47545.50 |
49449 |
▇▁▁▁▇ |
| Expenditures |
25 |
0 |
1 |
41491.50 |
696.50 |
40999 |
41245.25 |
41491.5 |
41737.75 |
41984 |
▇▁▁▁▇ |
| Expenditures |
26 |
0 |
1 |
36281.50 |
5057.14 |
30504 |
32851.50 |
36739.0 |
40169.00 |
41144 |
▃▃▁▁▇ |
| Expenditures |
27 |
0 |
1 |
36617.33 |
5418.93 |
28581 |
33842.50 |
37245.0 |
39035.00 |
44293 |
▃▃▇▃▃ |
| Expenditures |
28 |
0 |
1 |
41123.00 |
11094.51 |
33278 |
37200.50 |
41123.0 |
45045.50 |
48968 |
▇▁▁▁▇ |
| Expenditures |
29 |
0 |
1 |
40813.50 |
282.14 |
40614 |
40713.75 |
40813.5 |
40913.25 |
41013 |
▇▁▁▁▇ |
| Expenditures |
30 |
0 |
1 |
33721.00 |
NA |
33721 |
33721.00 |
33721.0 |
33721.00 |
33721 |
▁▁▇▁▁ |
| Expenditures |
31 |
0 |
1 |
47778.00 |
NA |
47778 |
47778.00 |
47778.0 |
47778.00 |
47778 |
▁▁▇▁▁ |
| Expenditures |
40 |
0 |
1 |
48064.00 |
NA |
48064 |
48064.00 |
48064.0 |
48064.00 |
48064 |
▁▁▇▁▁ |
| Expenditures |
41 |
0 |
1 |
54115.00 |
NA |
54115 |
54115.00 |
54115.0 |
54115.00 |
54115 |
▁▁▇▁▁ |
3. What is the range of the middle 50% (IQR = Q3 - Q1) of expenditures for:
- all males vs. all females,
skim_variable Gender n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist * 1 Expenditures Female 0 1 18130. 20020. 222 2872. 6400 39488. 75098 ▇▁▂▂▁ 2 Expenditures Male 0 1 18001. 19068. 386 2954 7219 37201 68890 ▇▁▂▂▁
IQR_FEM <- 39488 - 2872
IQR_MALE <- 37201 -2954
IQR_FEM
## [1] 36616
IQR_MALE
## [1] 34247
- all Hispanics and all White non-Hispanics
– Variable type: numeric ——————————————————————————————————– # A tibble: 2 x 12 skim_variable Ethnicity n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist * 1 Expenditures Hispanic 0 1 11066. 15630. 222 2331. 3952 10292. 65581 ▇▁▁▁▁ 2 Expenditures White not Hispanic 0 1 24698. 20604. 340 3977 15718 43134 68890 ▇▁▃▃▁
IQR_HISP <- 10292 - 2331
IQR_WHITENH <- 43134 -3977
IQR_HISP
## [1] 7961
IQR_WHITENH
## [1] 39157
- all 13-17 year old
Discrimination%>%
select(Age, Expenditures)%>%
filter(Age>12,Age<18)%>%
group_by(Age)%>%
skim(Expenditures)
Data summary
| Name |
Piped data |
| Number of rows |
212 |
| Number of columns |
2 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
1 |
| ________________________ |
|
| Group variables |
Age |
Variable type: numeric
| Expenditures |
13 |
0 |
1 |
3918.67 |
980.86 |
1195 |
3406.00 |
4003.0 |
4518.50 |
5872 |
▁▂▆▇▃ |
| Expenditures |
14 |
0 |
1 |
3974.82 |
1196.83 |
386 |
3223.00 |
4058.0 |
4808.00 |
5920 |
▁▂▇▇▆ |
| Expenditures |
15 |
0 |
1 |
3827.16 |
817.12 |
2029 |
3205.25 |
3715.0 |
4531.50 |
5304 |
▂▇▇▇▅ |
| Expenditures |
16 |
0 |
1 |
3762.86 |
1038.39 |
1432 |
3158.75 |
3817.5 |
4466.25 |
5718 |
▃▃▇▆▅ |
| Expenditures |
17 |
0 |
1 |
4134.20 |
989.45 |
2015 |
3640.00 |
4218.5 |
4886.25 |
6798 |
▃▇▇▆▁ |
IQR13 <- 4518 - 3406
IQR14 <- 4808 - 3223
IQR15 <- 4532 - 3205
IQR16 <- 4466 - 3159
IQR17 <- 4886 - 3640
IQR13
## [1] 1112
IQR14
## [1] 1585
IQR15
## [1] 1327
IQR16
## [1] 1307
IQR17
## [1] 1246
- all male, White non-Hispanics
– Variable type: numeric ——————————————————————————————————————- # A tibble: 1 x 11 skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist * 1 Expenditures 0 1 24574. 19828. 650 4195. 27390. 40818. 68890 ▇▁▅▃▁
IQR <- 40818-4195
IQR
## [1] 36623
- all Asian, 18-21 year old
Discrimination%>%
select(Age, Ethnicity, Expenditures)%>%
filter(Age>17, Age<22, Ethnicity == "Asian")%>%
group_by(Age)%>%
skim(Expenditures)
Data summary
| Name |
Piped data |
| Number of rows |
41 |
| Number of columns |
3 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
1 |
| ________________________ |
|
| Group variables |
Age |
Variable type: numeric
| Expenditures |
18 |
0 |
1 |
9368.07 |
3799.71 |
3153 |
7374.75 |
9523.0 |
10977.50 |
18435 |
▃▇▇▂▂ |
| Expenditures |
19 |
0 |
1 |
9374.00 |
1485.70 |
7475 |
8657.50 |
8767.0 |
10117.00 |
12243 |
▂▇▂▂▂ |
| Expenditures |
20 |
0 |
1 |
10554.83 |
2538.50 |
6255 |
8848.75 |
11389.5 |
12296.75 |
13345 |
▅▁▂▅▇ |
| Expenditures |
21 |
0 |
1 |
8674.57 |
2526.45 |
3988 |
7627.00 |
9504.0 |
10154.00 |
11668 |
▃▁▇▇▇ |
IQR18 <- 10978 - 7375
IQR18
## [1] 3603
IQR19 <- 10117 - 8658
IQR19
## [1] 1459
IQR20 <- 12297 - 8849
IQR20
## [1] 3448
IQR21 <- 10154 - 7627
IQR21
## [1] 2527
5. It is claimed that there are important disparities between Hispanic and White not Hispanic. Is this true?
White_not_Hispanic <- 24697.549
Hispanic <- 11065.569
x <- White_not_Hispanic - Hispanic
x
## [1] 13631.98
6. Use group_by and skim for expenditures after filtering only Hispanics and White not Hispanic.
Discrimination %>%
select(Gender, Ethnicity, Expenditures)%>%
filter(Ethnicity == "White not Hispanic"|Ethnicity == "Hispanic")%>%
group_by(Ethnicity)%>%
skim(Expenditures)
Data summary
| Name |
Piped data |
| Number of rows |
777 |
| Number of columns |
3 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
1 |
| ________________________ |
|
| Group variables |
Ethnicity |
Variable type: numeric
| Expenditures |
Hispanic |
0 |
1 |
11065.57 |
15629.85 |
222 |
2331.25 |
3952 |
10292.5 |
65581 |
▇▁▁▁▁ |
| Expenditures |
White not Hispanic |
0 |
1 |
24697.55 |
20604.38 |
340 |
3977.00 |
15718 |
43134.0 |
68890 |
▇▁▃▃▁ |
7. Provide a plot of Expenditures by age cohort and ethnicity after filtering only Hispanics and White not Hispanic.
WNH <- Discrimination %>%
filter(Ethnicity %in% c("White not Hispanic", "Hispanic"))
ggplot(WNH) +
aes(x = Expenditures, y = Age.Cohort, fill = Ethnicity) +
geom_boxplot(adjust = 1L) +
scale_fill_hue() +
theme_minimal()
## Warning: Ignoring unknown parameters: adjust

Clear mind :)