## ── Attaching packages ────────
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ─────────────────
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

To begin, I just wanted to point out that I unintentionally set up some categories that were not inclusive of all the individuals that participated in the survey. Apologies if that caused any confusion. Below are the answers for the questions as stated. In Question 1a, I omitted the category nh other. If you included that in your code, that’s fine. Let me know if you have any problems –scott

The first section is done with the reduced data frame. After that I showed the code and answers utilizing the full data frame

Exercises

1. What if we were interested in examining the education of individuals from the Northern Triangle countries of Guatemala, Honduras, and El Salvador as compared to other Hispanic individuals, non-Hispanic white, non-Hispanic black, and non-Hispanic other individuals. Create a data frame that has the following variables:

a. Race/Ethnicity variable with the following categories: 1) nhw 2) nhb 3) hisp northern triangle origin 4) hisp other
b. Education variable with the following categories: 1) less than a GED or High School diploma 2) GED or High School Diploma 4) Bachelor’s Degree or higher
c. Age only between 18 and 65 inclusive
dat <- read_csv("2015_5yr_ACS_4oct.csv")
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.
set.seed(1230)
datred <-dat[sample(nrow(dat),100000),]

dat5 <- datred %>% 
  transmute(
    race_eth = ifelse(RACE == 1 & HISPAN == 0, "nhw",
                      ifelse(RACE == 2 & HISPAN == 0, "nhb",
                             ifelse(HISPAND == 412 | HISPAND == 413 | HISPAND == 416, "hisp_nt",
                                    ifelse((HISPAN >= 1 & HISPAN <= 4) & (HISPAND != 412 | HISPAND != 413 | HISPAND != 416), "hisp_othr",NA)))),
    educ = ifelse(EDUCD <= 61, 'lths',
                  ifelse(EDUCD >= 62 & EDUCD <=64, 'hs',
                         ifelse(EDUCD >= 101, "bach+", NA))),
    age = AGE
    ) %>% 
  filter(age >=18 & age <= 65)
head(dat5)
## # A tibble: 6 x 3
##   race_eth  educ    age
##   <chr>     <chr> <dbl>
## 1 nhw       <NA>     21
## 2 nhw       hs       39
## 3 hisp_othr lths     62
## 4 nhw       bach+    59
## 5 nhb       bach+    41
## 6 nhw       bach+    61
dat5 <- na.omit(dat5)
head(dat5)
## # A tibble: 6 x 3
##   race_eth  educ    age
##   <chr>     <chr> <dbl>
## 1 nhw       hs       39
## 2 hisp_othr lths     62
## 3 nhw       bach+    59
## 4 nhb       bach+    41
## 5 nhw       bach+    61
## 6 nhw       hs       57
#table(dat5$race_eth)

2. Determine the percentage by race/ethnicity of each of these education categories

tabq2 <- table(dat5$race_eth,dat5$educ)
kable(tabq2)
bach+ hs lths
hisp_nt 43 168 296
hisp_othr 1072 2234 2272
nhb 1163 2182 1029
nhw 13542 11418 2950
tabq2a<- round(prop.table(tabq2,2), digits = 4)*100
kable(tabq2a)
bach+ hs lths
hisp_nt 0.27 1.05 4.52
hisp_othr 6.78 13.96 34.70
nhb 7.35 13.64 15.72
nhw 85.60 71.35 45.06

3. Given that Texas is FIPS code 48 and Bexar county is 29, Compare the income, utilizing the same categories as example two, amongst the following race ethnicities: 1) non-Hispanic white 2) non Hispanic black 3)non-Hispanic other and 4) Hispanic in the following locations:

dat6 <- datred %>% 
   filter(INCTOT > 0 & INCTOT < 9000000) %>% 
  transmute(
    sex = ifelse(SEX == 1, "m","f"),
    statefip = STATEFIP,
    countyfip = COUNTYFIP,
    inc = ifelse(INCTOT <= 20000, "<$20k",
                        ifelse(INCTOT >= 20001 & INCTOT <= 60000, "$20k-$60k",
                               ifelse(INCTOT >= 60001 & INCTOT <= 120000, "$60k-$120k",
                                      ifelse(INCTOT >= 120001,">$120k", NA)))),
    age = AGE,
    race = ifelse(RACE == 1 & HISPAN == 0, 'nhw',
                  ifelse(RACE == 2 & HISPAN == 0, 'nhb',
                         ifelse(RACE >=3 & HISPAN == 0,'nh_othr',
                                ifelse(HISPAN >= 1 & HISPAN <= 4, 'hisp', NA)))),
    educ = ifelse(EDUCD <= 61, 'lths',
                  ifelse(EDUCD >= 62 & EDUCD <=64, 'hs',
                         ifelse(EDUCD >= 101, "bach+", NA)))
  ) %>% 
      filter(age >= 18) 
dat6 <- na.omit(dat6)
table(dat6$race)
## 
##    hisp nh_othr     nhb     nhw 
##    5724    3600    4533   34692

a. The United States

tabq3 <- table(dat6$race,dat6$inc)
tabq3 <- tabq3[,c(1,3,4,2)]
kable(tabq3)
<$20k $20k-$60k $60k-$120k >$120k
hisp 2905 2238 455 126
nh_othr 1418 1243 664 275
nhb 2348 1613 468 104
nhw 12058 14107 5870 2657
tabq3a <- round(prop.table(tabq3,2), digits = 4)*100
kable(tabq3a)
<$20k $20k-$60k $60k-$120k >$120k
hisp 15.51 11.66 6.10 3.98
nh_othr 7.57 6.47 8.90 8.70
nhb 12.54 8.40 6.28 3.29
nhw 64.38 73.47 78.72 84.03

b. Texas

dat6st <- dat6 %>% 
  filter(statefip == 48)
tabq3b <- table(dat6st$race,dat6st$inc)
tabq3b <- tabq3b[,c(1,3,4,2)]
kable(tabq3b)
<$20k $20k-$60k $60k-$120k >$120k
hisp 534 439 84 21
nh_othr 87 74 51 18
nhb 176 117 35 5
nhw 649 779 391 219
tabq3bb <- round(prop.table(tabq3b,2), digits = 4)*100
kable(tabq3bb)
<$20k $20k-$60k $60k-$120k >$120k
hisp 36.93 31.16 14.97 7.98
nh_othr 6.02 5.25 9.09 6.84
nhb 12.17 8.30 6.24 1.90
nhw 44.88 55.29 69.70 83.27

c. Bexar county

dat6cty <- dat6 %>% 
  filter(statefip == 48, countyfip == 29)
tabq3c <- table(dat6cty$race,dat6cty$inc)
tabq3c <- tabq3c[,c(1,3,4,2)]
kable(tabq3c)
<$20k $20k-$60k $60k-$120k >$120k
hisp 61 47 11 0
nh_othr 2 3 1 1
nhb 6 5 1 0
nhw 31 35 14 11
tabq3cb <- round(prop.table(tabq3c,2), digits = 4)*100
kable(tabq3cb)
<$20k $20k-$60k $60k-$120k >$120k
hisp 61 52.22 40.74 0.00
nh_othr 2 3.33 3.70 8.33
nhb 6 5.56 3.70 0.00
nhw 31 38.89 51.85 91.67

4. Using the Education categories from exercise 1b, determine the percentages by the race ethnicity categories in 3 in the following locations:

a. The United States

tabq4 <- table(dat6$race,dat6$educ)
tabq4 <- tabq4[,c(3,2,1)]
kable(tabq4)
lths hs bach+
hisp 2445 2155 1124
nh_othr 647 972 1981
nhb 1121 2138 1274
nhw 4236 14517 15939
tabq4a <- round(prop.table(tabq4,2), digits = 4)*100
kable(tabq4a)
lths hs bach+
hisp 28.94 10.89 5.53
nh_othr 7.66 4.91 9.75
nhb 13.27 10.81 6.27
nhw 50.14 73.38 78.45

b. Texas

tabq4b <- table(dat6st$race,dat6st$educ)
tabq4b <- tabq4[,c(3,2,1)]
kable(tabq4)
lths hs bach+
hisp 2445 2155 1124
nh_othr 647 972 1981
nhb 1121 2138 1274
nhw 4236 14517 15939
tabq4ba <- round(prop.table(tabq4b,2), digits = 4)*100
kable(tabq4a)
lths hs bach+
hisp 28.94 10.89 5.53
nh_othr 7.66 4.91 9.75
nhb 13.27 10.81 6.27
nhw 50.14 73.38 78.45

c. Bexar county

tabq4c <- table(dat6cty$race,dat6cty$educ)
tabq4c <- tabq4c[,c(3,2,1)]
kable(tabq4c)
lths hs bach+
hisp 35 57 27
nh_othr 1 1 5
nhb 1 7 4
nhw 7 28 56
tabq4cb <- round(prop.table(tabq4c,2), digits = 4)*100
kable(tabq4cb)
lths hs bach+
hisp 79.55 61.29 29.35
nh_othr 2.27 1.08 5.43
nhb 2.27 7.53 4.35
nhw 15.91 30.11 60.87

The above answers were done utilizing the reduced data frame with the set.seed = 1230. Below I will show the answers for the full 15.6 million data frame

1 a,b,&c

datf <- dat %>% 
  transmute(
    race_eth = ifelse(RACE == 1 & HISPAN == 0, "nhw",
                      ifelse(RACE == 2 & HISPAN == 0, "nhb",
                             ifelse(HISPAND == 412 | HISPAND == 413 | HISPAND == 416, "hisp_nt",
                                    ifelse((HISPAN >= 1 & HISPAN <= 4) & (HISPAND != 412 | HISPAND != 413 | HISPAND != 416), "hisp_othr",NA)))),
    educ = ifelse(EDUCD <= 61, 'lths',
                  ifelse(EDUCD >= 62 & EDUCD <=64, 'hs',
                         ifelse(EDUCD >= 101, "bach+", NA))),
    age = AGE
    ) %>% 
  filter(age >=18 & age <= 65)
head(datf)
## # A tibble: 6 x 3
##   race_eth educ    age
##   <chr>    <chr> <dbl>
## 1 nhb      <NA>     19
## 2 nhw      <NA>     57
## 3 nhw      hs       63
## 4 nhw      <NA>     53
## 5 nhw      <NA>     24
## 6 nhw      <NA>     19
dat5 <- na.omit(datf)
head(datf)
## # A tibble: 6 x 3
##   race_eth educ    age
##   <chr>    <chr> <dbl>
## 1 nhb      <NA>     19
## 2 nhw      <NA>     57
## 3 nhw      hs       63
## 4 nhw      <NA>     53
## 5 nhw      <NA>     24
## 6 nhw      <NA>     19

2.

tabq2 <- table(datf$race_eth,datf$educ)
kable(tabq2)
bach+ hs lths
hisp_nt 8924 24723 44743
hisp_othr 175021 344346 362629
nhb 178808 332891 168760
nhw 2103427 1772274 464750
tabq2a<- round(prop.table(tabq2,2), digits = 4)*100
kable(tabq2a)
bach+ hs lths
hisp_nt 0.36 1.00 4.30
hisp_othr 7.10 13.92 34.84
nhb 7.25 13.45 16.21
nhw 85.29 71.63 44.65
## 3.
dat6f <- dat %>% 
   filter(INCTOT > 0 & INCTOT < 9000000) %>% 
  transmute(
    sex = ifelse(SEX == 1, "m","f"),
    statefip = STATEFIP,
    countyfip = COUNTYFIP,
    inc = ifelse(INCTOT <= 20000, "<$20k",
                        ifelse(INCTOT >= 20001 & INCTOT <= 60000, "$20k-$60k",
                               ifelse(INCTOT >= 60001 & INCTOT <= 120000, "$60k-$120k",
                                      ifelse(INCTOT >= 120001,">$120k", NA)))),
    age = AGE,
    race = ifelse(RACE == 1 & HISPAN == 0, 'nhw',
                  ifelse(RACE == 2 & HISPAN == 0, 'nhb',
                         ifelse(RACE >=3 & HISPAN == 0,'nh_othr',
                                ifelse(HISPAN >= 1 & HISPAN <= 4, 'hisp', NA)))),
    educ = ifelse(EDUCD <= 61, 'lths',
                  ifelse(EDUCD >= 62 & EDUCD <=64, 'hs',
                         ifelse(EDUCD >= 101, "bach+", NA)))
  ) %>% 
      filter(age >= 18) 
dat6f <- na.omit(dat6f)

3a.

tabq3 <- table(dat6f$race,dat6f$inc)
tabq3 <- tabq3[,c(1,3,4,2)]
kable(tabq3)
<$20k $20k-$60k $60k-$120k >$120k
hisp 453342 349335 75838 20481
nh_othr 214099 187799 105954 48055
nhb 366529 250461 72162 16088
nhw 1891496 2191779 932920 418208
tabq3a <- round(prop.table(tabq3,2), digits = 4)*100
kable(tabq3a)
<$20k $20k-$60k $60k-$120k >$120k
hisp 15.50 11.73 6.39 4.07
nh_othr 7.32 6.30 8.93 9.56
nhb 12.53 8.41 6.08 3.20
nhw 64.66 73.57 78.60 83.17

3b.

dat6st <- dat6f %>% 
  filter(statefip == 48)
tabq3b <- table(dat6st$race,dat6st$inc)
tabq3b <- tabq3b[,c(1,3,4,2)]
kable(tabq3b)
<$20k $20k-$60k $60k-$120k >$120k
hisp 90551 68405 13260 3348
nh_othr 11222 10821 7337 3295
nhb 26318 18778 5759 1234
nhw 101796 123869 59614 32112
tabq3bb <- round(prop.table(tabq3b,2), digits = 4)*100
kable(tabq3bb)
<$20k $20k-$60k $60k-$120k >$120k
hisp 39.39 30.83 15.42 8.37
nh_othr 4.88 4.88 8.53 8.24
nhb 11.45 8.46 6.70 3.09
nhw 44.28 55.83 69.34 80.30

3c.

dat6cty <- dat6f %>% 
  filter(statefip == 48, countyfip == 29)
tabq3c <- table(dat6cty$race,dat6cty$inc)
tabq3c <- tabq3c[,c(1,3,4,2)]
kable(tabq3c)
<$20k $20k-$60k $60k-$120k >$120k
hisp 10129 7790 1640 416
nh_othr 600 544 315 125
nhb 1038 754 298 80
nhw 3910 5079 2965 1551
tabq3cb <- round(prop.table(tabq3c,2), digits = 4)*100
kable(tabq3cb)
<$20k $20k-$60k $60k-$120k >$120k
hisp 64.61 54.99 31.43 19.15
nh_othr 3.83 3.84 6.04 5.76
nhb 6.62 5.32 5.71 3.68
nhw 24.94 35.85 56.82 71.41

4a

tabq4 <- table(dat6f$race,dat6f$educ)
tabq4 <- tabq4[,c(3,2,1)]
kable(tabq4)
lths hs bach+
hisp 381474 333958 183564
nh_othr 98460 149201 308246
nhb 179637 328704 196899
nhw 662036 2283014 2489353
tabq4a <- round(prop.table(tabq4,2), digits = 4)*100
kable(tabq4a)
lths hs bach+
hisp 28.86 10.79 5.78
nh_othr 7.45 4.82 9.70
nhb 13.59 10.62 6.20
nhw 50.09 73.77 78.33

4b

tabq4b <- table(dat6st$race,dat6st$educ)
tabq4b <- tabq4[,c(3,2,1)]
kable(tabq4)
lths hs bach+
hisp 381474 333958 183564
nh_othr 98460 149201 308246
nhb 179637 328704 196899
nhw 662036 2283014 2489353
tabq4ba <- round(prop.table(tabq4b,2), digits = 4)*100
kable(tabq4a)
lths hs bach+
hisp 28.86 10.79 5.78
nh_othr 7.45 4.82 9.70
nhb 13.59 10.62 6.20
nhw 50.09 73.77 78.33

4c

tabq4c <- table(dat6cty$race,dat6cty$educ)
tabq4c <- tabq4c[,c(3,2,1)]
kable(tabq4c)
lths hs bach+
hisp 6990 8461 4524
nh_othr 230 420 934
nhb 362 904 904
nhw 909 4123 8473
tabq4cb <- round(prop.table(tabq4c,2), digits = 4)*100
kable(tabq4cb)
lths hs bach+
hisp 82.32 60.84 30.50
nh_othr 2.71 3.02 6.30
nhb 4.26 6.50 6.09
nhw 10.71 29.64 57.11