Instal and load packages

       library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
       library(dplyr)
       library(skimr)
       library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
       library(knitr)

#Discrimination

LOAD AND PREPARE DATA

library(readxl)
Discrimination <- read_excel("Discrimination.xlsx")

ANALYZE DATA

1. What is the average of expenditures for:

  1. All males vs. all females
Discrimination %>%
  group_by(Gender) %>%
  skim(Expenditures)
Data summary
Name Piped data
Number of rows 1000
Number of columns 6
_______________________
Column type frequency:
numeric 1
________________________
Group variables Gender

Variable type: numeric

skim_variable Gender n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Expenditures Female 0 1 18129.61 20019.71 222 2872.5 6400 39487.5 75098 ▇▁▂▂▁
Expenditures Male 0 1 18001.20 19068.01 386 2954.0 7219 37201.0 68890 ▇▁▂▂▁
  1. Hispanics and all White not Hispanics
Discrimination %>% 
  select(Gender, Ethnicity, Expenditures)%>% 
  filter(Ethnicity == "White not Hispanic"|Ethnicity == "Hispanic")%>%
  group_by(Ethnicity)%>%

skim(Expenditures)
Data summary
Name Piped data
Number of rows 777
Number of columns 3
_______________________
Column type frequency:
numeric 1
________________________
Group variables Ethnicity

Variable type: numeric

skim_variable Ethnicity n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Expenditures Hispanic 0 1 11065.57 15629.85 222 2331.25 3952 10292.5 65581 ▇▁▁▁▁
Expenditures White not Hispanic 0 1 24697.55 20604.38 340 3977.00 15718 43134.0 68890 ▇▁▃▃▁
  1. All 22-50 year old
Discrimination %>% 
  select(Age, Expenditures)%>% 
  filter(Age>21,Age<51)%>%
  group_by(Age)%>%
  
  skim(Expenditures)
Data summary
Name Piped data
Number of rows 226
Number of columns 2
_______________________
Column type frequency:
numeric 1
________________________
Group variables Age

Variable type: numeric

skim_variable Age n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Expenditures 22 0 1 39088.28 7428.45 27058 33062.50 40444.0 43966.00 50795 ▆▂▇▆▆
Expenditures 23 0 1 39657.91 7262.15 25348 36265.25 40276.5 43997.50 50542 ▃▂▇▆▆
Expenditures 24 0 1 41715.00 6256.03 30803 37191.50 41586.0 46859.00 51392 ▃▇▅▅▇
Expenditures 25 0 1 41253.63 5058.41 31943 38352.00 40999.0 45676.50 49142 ▃▅▇▁▇
Expenditures 26 0 1 38700.58 4566.97 30504 36058.75 38595.5 41635.25 45507 ▂▇▅▇▇
Expenditures 27 0 1 40239.52 6389.62 28581 35888.50 38331.0 45657.00 50353 ▃▇▇▃▇
Expenditures 28 0 1 36582.56 7214.79 26178 31027.00 37247.0 41464.00 48968 ▇▇▇▇▃
Expenditures 29 0 1 40469.70 3291.66 34708 38830.50 40884.0 41847.75 46356 ▃▂▇▃▂
Expenditures 30 0 1 41699.59 4835.71 33721 40125.00 40927.0 44910.00 51486 ▇▇▇▆▃
Expenditures 31 0 1 39863.29 5592.99 33051 36514.00 39172.0 43007.00 47778 ▅▁▇▁▅
Expenditures 32 0 1 37234.38 5118.98 28205 33630.00 37637.0 40587.00 45017 ▃▃▇▃▆
Expenditures 33 0 1 39533.58 5205.37 27090 38117.50 40649.0 42313.25 47204 ▂▂▅▇▃
Expenditures 34 0 1 42266.67 2876.00 39521 40344.25 41034.0 44437.25 46325 ▇▂▁▁▅
Expenditures 35 0 1 34390.40 5517.50 27417 32529.00 33070.0 36607.00 42329 ▃▇▁▃▃
Expenditures 36 0 1 35263.33 5300.06 29144 33695.00 38246.0 38323.00 38400 ▃▁▁▁▇
Expenditures 37 0 1 38900.80 4953.63 30627 37948.00 41550.0 41924.00 42455 ▂▁▁▂▇
Expenditures 38 0 1 39091.50 10567.09 29709 33743.25 36233.0 41581.25 54191 ▃▇▁▁▃
Expenditures 39 0 1 38849.60 10259.48 31076 33259.00 36576.0 36621.00 56716 ▇▇▁▁▃
Expenditures 40 0 1 44693.00 2924.17 42840 43007.50 43175.0 45619.50 48064 ▇▁▁▁▃
Expenditures 41 0 1 48919.75 4990.17 42139 47452.00 49712.5 51180.25 54115 ▇▁▇▇▇
Expenditures 42 0 1 44500.00 9311.18 37916 41208.00 44500.0 47792.00 51084 ▇▁▁▁▇
Expenditures 44 0 1 46144.00 NA 46144 46144.00 46144.0 46144.00 46144 ▁▁▇▁▁
Expenditures 45 0 1 48218.00 2950.05 46132 47175.00 48218.0 49261.00 50304 ▇▁▁▁▇
Expenditures 46 0 1 52222.00 NA 52222 52222.00 52222.0 52222.00 52222 ▁▁▇▁▁
Expenditures 48 0 1 49424.00 NA 49424 49424.00 49424.0 49424.00 49424 ▁▁▇▁▁
  1. All male, White not Hispanics
allmwnh <- Discrimination %>% 
  select(Gender, Ethnicity, Expenditures)%>% 
  filter(Gender == "Male", Ethnicity == "White not Hispanic")%>%
   skim(Expenditures)

print(allmwnh)
## -- Data Summary ------------------------
##                            Values    
## Name                       Piped data
## Number of rows             196       
## Number of columns          3         
## _______________________              
## Column type frequency:               
##   numeric                  1         
## ________________________             
## Group variables            None      
## 
## -- Variable type: numeric ------------------------------------------------------
## # A tibble: 1 x 11
##   skim_variable n_missing complete_rate   mean     sd    p0   p25    p50    p75
## * <chr>             <int>         <dbl>  <dbl>  <dbl> <dbl> <dbl>  <dbl>  <dbl>
## 1 Expenditures          0             1 24574. 19828.   650 4195. 27390. 40818.
##    p100 hist 
## * <dbl> <chr>
## 1 68890 <U+2587><U+2581><U+2585><U+2583><U+2581>
  1. All Asian, 22-50 year old
Discrimination %>% 
  select(Age, Ethnicity, Expenditures)%>% 
  filter(Age>21, Age<51, Ethnicity == "Asian")%>%
    group_by(Age)%>%
  
  skim(Expenditures)
Data summary
Name Piped data
Number of rows 29
Number of columns 3
_______________________
Column type frequency:
numeric 1
________________________
Group variables Age

Variable type: numeric

skim_variable Age n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Expenditures 22 0 1 40240.00 NA 40240 40240.00 40240.0 40240.00 40240 ▁▁▇▁▁
Expenditures 23 0 1 37698.50 11102.48 25348 30595.00 37452.0 44555.50 50542 ▇▇▁▇▇
Expenditures 24 0 1 40359.25 9199.66 30803 33406.25 40592.5 47545.50 49449 ▇▁▁▁▇
Expenditures 25 0 1 41491.50 696.50 40999 41245.25 41491.5 41737.75 41984 ▇▁▁▁▇
Expenditures 26 0 1 36281.50 5057.14 30504 32851.50 36739.0 40169.00 41144 ▃▃▁▁▇
Expenditures 27 0 1 36617.33 5418.93 28581 33842.50 37245.0 39035.00 44293 ▃▃▇▃▃
Expenditures 28 0 1 41123.00 11094.51 33278 37200.50 41123.0 45045.50 48968 ▇▁▁▁▇
Expenditures 29 0 1 40813.50 282.14 40614 40713.75 40813.5 40913.25 41013 ▇▁▁▁▇
Expenditures 30 0 1 33721.00 NA 33721 33721.00 33721.0 33721.00 33721 ▁▁▇▁▁
Expenditures 31 0 1 47778.00 NA 47778 47778.00 47778.0 47778.00 47778 ▁▁▇▁▁
Expenditures 40 0 1 48064.00 NA 48064 48064.00 48064.0 48064.00 48064 ▁▁▇▁▁
Expenditures 41 0 1 54115.00 NA 54115 54115.00 54115.0 54115.00 54115 ▁▁▇▁▁

2. What is the median of expenditures for:

  1. all males vs. all females
Discrimination %>% 
    group_by(Gender) %>% 
  summarise(Median_Expenditures = median(Expenditures))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
##   Gender Median_Expenditures
##   <chr>                <dbl>
## 1 Female                6400
## 2 Male                  7219
  1. all Hispanics and all White non-Hispanics
Discrimination %>%
  filter(Ethnicity == "Hispanic"| Ethnicity =="White not Hispanic")%>%
   group_by(Ethnicity)%>%
  summarise(Median_Expenditures = median(Expenditures))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
##   Ethnicity          Median_Expenditures
##   <chr>                            <dbl>
## 1 Hispanic                          3952
## 2 White not Hispanic               15718
  1. all 13-17 year old
Discrimination%>% 
  select(Age, Expenditures)%>% 
  filter(Age>12,Age<18)%>%
     group_by(Age)%>%
  summarise(Median_Expenditures = median(Expenditures))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 2
##     Age Median_Expenditures
##   <dbl>               <dbl>
## 1    13               4003 
## 2    14               4058 
## 3    15               3715 
## 4    16               3818.
## 5    17               4218.
  1. all male, White non Hispanics
Discrimination%>% 
  filter(Gender == "Male", Ethnicity == "White not Hispanic")%>%
   summarise(Median_Expenditures = median(Expenditures))
## # A tibble: 1 x 1
##   Median_Expenditures
##                 <dbl>
## 1              27390.
  1. all Asian, 13-17 year old
Discrimination%>% 
  filter(Age>13,Age<18, Ethnicity == "Asian")%>%
  summarise(Median_Expenditures = median(Expenditures))
## # A tibble: 1 x 1
##   Median_Expenditures
##                 <dbl>
## 1               3460.

3. What is the range of the middle 50% (IQR = Q3 - Q1) of expenditures for:

  1. all males vs. all females,

skim_variable Gender n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist * 1 Expenditures Female 0 1 18130. 20020. 222 2872. 6400 39488. 75098 ▇▁▂▂▁ 2 Expenditures Male 0 1 18001. 19068. 386 2954 7219 37201 68890 ▇▁▂▂▁

IQR_FEM  <- 39488 - 2872
IQR_MALE <- 37201 -2954
IQR_FEM
## [1] 36616
IQR_MALE
## [1] 34247
  1. all Hispanics and all White non-Hispanics

– Variable type: numeric ——————————————————————————————————– # A tibble: 2 x 12 skim_variable Ethnicity n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist * 1 Expenditures Hispanic 0 1 11066. 15630. 222 2331. 3952 10292. 65581 ▇▁▁▁▁ 2 Expenditures White not Hispanic 0 1 24698. 20604. 340 3977 15718 43134 68890 ▇▁▃▃▁

IQR_HISP  <- 10292 - 2331
IQR_WHITENH <- 43134 -3977

IQR_HISP
## [1] 7961
IQR_WHITENH
## [1] 39157
  1. all 13-17 year old
Discrimination%>% 
  select(Age, Expenditures)%>% 
  filter(Age>12,Age<18)%>%
  group_by(Age)%>%
  
  skim(Expenditures)
Data summary
Name Piped data
Number of rows 212
Number of columns 2
_______________________
Column type frequency:
numeric 1
________________________
Group variables Age

Variable type: numeric

skim_variable Age n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Expenditures 13 0 1 3918.67 980.86 1195 3406.00 4003.0 4518.50 5872 ▁▂▆▇▃
Expenditures 14 0 1 3974.82 1196.83 386 3223.00 4058.0 4808.00 5920 ▁▂▇▇▆
Expenditures 15 0 1 3827.16 817.12 2029 3205.25 3715.0 4531.50 5304 ▂▇▇▇▅
Expenditures 16 0 1 3762.86 1038.39 1432 3158.75 3817.5 4466.25 5718 ▃▃▇▆▅
Expenditures 17 0 1 4134.20 989.45 2015 3640.00 4218.5 4886.25 6798 ▃▇▇▆▁
IQR13 <- 4518 - 3406 
IQR14 <- 4808 - 3223
IQR15 <- 4532 - 3205
IQR16 <- 4466 - 3159
IQR17 <- 4886 - 3640

IQR13
## [1] 1112
IQR14 
## [1] 1585
IQR15
## [1] 1327
IQR16 
## [1] 1307
IQR17
## [1] 1246
  1. all male, White non-Hispanics

– Variable type: numeric ——————————————————————————————————————- # A tibble: 1 x 11 skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist * 1 Expenditures 0 1 24574. 19828. 650 4195. 27390. 40818. 68890 ▇▁▅▃▁

IQR <- 40818-4195
IQR
## [1] 36623
  1. all Asian, 18-21 year old
Discrimination%>% 
  select(Age, Ethnicity, Expenditures)%>% 
  filter(Age>17, Age<22, Ethnicity == "Asian")%>%
    group_by(Age)%>%
  
  skim(Expenditures)
Data summary
Name Piped data
Number of rows 41
Number of columns 3
_______________________
Column type frequency:
numeric 1
________________________
Group variables Age

Variable type: numeric

skim_variable Age n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Expenditures 18 0 1 9368.07 3799.71 3153 7374.75 9523.0 10977.50 18435 ▃▇▇▂▂
Expenditures 19 0 1 9374.00 1485.70 7475 8657.50 8767.0 10117.00 12243 ▂▇▂▂▂
Expenditures 20 0 1 10554.83 2538.50 6255 8848.75 11389.5 12296.75 13345 ▅▁▂▅▇
Expenditures 21 0 1 8674.57 2526.45 3988 7627.00 9504.0 10154.00 11668 ▃▁▇▇▇
IQR18 <- 10978 - 7375
IQR18
## [1] 3603
IQR19 <- 10117 - 8658
IQR19
## [1] 1459
IQR20 <- 12297 - 8849
IQR20
## [1] 3448
IQR21 <- 10154 - 7627
IQR21
## [1] 2527

4. Does discrimination in expenditures exist? There are two relevant potential forms of actionable discrimination: gender and ethnicity. Evaluate these two questions with the appropriate pivot table equivalent defined for each.

Discrimination %>%
  group_by(Gender)%>%
summarise(Expenditures_by_gender = mean(Expenditures))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
##   Gender Expenditures_by_gender
##   <chr>                   <dbl>
## 1 Female                 18130.
## 2 Male                   18001.
Discrimination %>%
  group_by(Ethnicity)%>%
summarise(Expenditures_by_ethn = mean(Expenditures))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 8 x 2
##   Ethnicity          Expenditures_by_ethn
##   <chr>                             <dbl>
## 1 American Indian                  36438.
## 2 Asian                            18392.
## 3 Black                            20885.
## 4 Hispanic                         11066.
## 5 Multi Race                        4457.
## 6 Native Hawaiian                  42782.
## 7 Other                             3316.
## 8 White not Hispanic               24698.

5. It is claimed that there are important disparities between Hispanic and White not Hispanic. Is this true?

White_not_Hispanic <-   24697.549   
Hispanic <- 11065.569   
x <- White_not_Hispanic - Hispanic
x
## [1] 13631.98

6. Use group_by and skim for expenditures after filtering only Hispanics and White not Hispanic.

Discrimination %>% 
  select(Gender, Ethnicity, Expenditures)%>% 
  filter(Ethnicity == "White not Hispanic"|Ethnicity == "Hispanic")%>%
  group_by(Ethnicity)%>%

skim(Expenditures)
Data summary
Name Piped data
Number of rows 777
Number of columns 3
_______________________
Column type frequency:
numeric 1
________________________
Group variables Ethnicity

Variable type: numeric

skim_variable Ethnicity n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Expenditures Hispanic 0 1 11065.57 15629.85 222 2331.25 3952 10292.5 65581 ▇▁▁▁▁
Expenditures White not Hispanic 0 1 24697.55 20604.38 340 3977.00 15718 43134.0 68890 ▇▁▃▃▁

7. Provide a plot of Expenditures by age cohort and ethnicity after filtering only Hispanics and White not Hispanic.

WNH <- Discrimination %>%
  filter(Ethnicity %in% c("White not Hispanic", "Hispanic"))
ggplot(WNH) +
  aes(x = Expenditures, y = Age.Cohort, fill = Ethnicity) +
  geom_boxplot(adjust = 1L) +
  scale_fill_hue() +
  theme_minimal()
## Warning: Ignoring unknown parameters: adjust

# CLEAN UP #################################################

Clear environment

rm(list = ls())

Clear packages

p_unload(all) # Remove all add-ons

Clear console

cat(“\014”) # ctrl+L

Clear mind :)