## ── Attaching packages ────────
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ─────────────────
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
To begin, I just wanted to point out that I unintentionally set up some categories that were not inclusive of all the individuals that participated in the survey. Apologies if that caused any confusion. Below are the answers for the questions as stated. In Question 1a, I omitted the category nh other. If you included that in your code, that’s fine. Let me know if you have any problems –scott
The first section is done with the reduced data frame. After that I showed the code and answers utilizing the full data frame
Exercises
1. What if we were interested in examining the education of individuals from the Northern Triangle countries of Guatemala, Honduras, and El Salvador as compared to other Hispanic individuals, non-Hispanic white, non-Hispanic black, and non-Hispanic other individuals. Create a data frame that has the following variables:
a. Race/Ethnicity variable with the following categories: 1) nhw 2) nhb 3) hisp northern triangle origin 4) hisp other
b. Education variable with the following categories: 1) less than a GED or High School diploma 2) GED or High School Diploma 4) Bachelor’s Degree or higher
c. Age only between 18 and 65 inclusive
dat <- read_csv("2015_5yr_ACS_4oct.csv")
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
set.seed(1230)
datred <-dat[sample(nrow(dat),100000),]
dat5 <- datred %>%
transmute(
race_eth = ifelse(RACE == 1 & HISPAN == 0, "nhw",
ifelse(RACE == 2 & HISPAN == 0, "nhb",
ifelse(HISPAND == 412 | HISPAND == 413 | HISPAND == 416, "hisp_nt",
ifelse((HISPAN >= 1 & HISPAN <= 4) & (HISPAND != 412 | HISPAND != 413 | HISPAND != 416), "hisp_othr",NA)))),
educ = ifelse(EDUCD <= 61, 'lths',
ifelse(EDUCD >= 62 & EDUCD <=64, 'hs',
ifelse(EDUCD >= 101, "bach+", NA))),
age = AGE
) %>%
filter(age >=18 & age <= 65)
head(dat5)
## # A tibble: 6 x 3
## race_eth educ age
## <chr> <chr> <dbl>
## 1 nhw <NA> 21
## 2 nhw hs 39
## 3 hisp_othr lths 62
## 4 nhw bach+ 59
## 5 nhb bach+ 41
## 6 nhw bach+ 61
dat5 <- na.omit(dat5)
head(dat5)
## # A tibble: 6 x 3
## race_eth educ age
## <chr> <chr> <dbl>
## 1 nhw hs 39
## 2 hisp_othr lths 62
## 3 nhw bach+ 59
## 4 nhb bach+ 41
## 5 nhw bach+ 61
## 6 nhw hs 57
#table(dat5$race_eth)
2. Determine the percentage by race/ethnicity of each of these education categories
tabq2 <- table(dat5$race_eth,dat5$educ)
kable(tabq2)
| hisp_nt |
43 |
168 |
296 |
| hisp_othr |
1072 |
2234 |
2272 |
| nhb |
1163 |
2182 |
1029 |
| nhw |
13542 |
11418 |
2950 |
tabq2a<- round(prop.table(tabq2,2), digits = 4)*100
kable(tabq2a)
| hisp_nt |
0.27 |
1.05 |
4.52 |
| hisp_othr |
6.78 |
13.96 |
34.70 |
| nhb |
7.35 |
13.64 |
15.72 |
| nhw |
85.60 |
71.35 |
45.06 |
3. Given that Texas is FIPS code 48 and Bexar county is 29, Compare the income, utilizing the same categories as example two, amongst the following race ethnicities: 1) non-Hispanic white 2) non Hispanic black 3)non-Hispanic other and 4) Hispanic in the following locations:
dat6 <- datred %>%
filter(INCTOT > 0 & INCTOT < 9000000) %>%
transmute(
sex = ifelse(SEX == 1, "m","f"),
statefip = STATEFIP,
countyfip = COUNTYFIP,
inc = ifelse(INCTOT <= 20000, "<$20k",
ifelse(INCTOT >= 20001 & INCTOT <= 60000, "$20k-$60k",
ifelse(INCTOT >= 60001 & INCTOT <= 120000, "$60k-$120k",
ifelse(INCTOT >= 120001,">$120k", NA)))),
age = AGE,
race = ifelse(RACE == 1 & HISPAN == 0, 'nhw',
ifelse(RACE == 2 & HISPAN == 0, 'nhb',
ifelse(RACE >=3 & HISPAN == 0,'nh_othr',
ifelse(HISPAN >= 1 & HISPAN <= 4, 'hisp', NA)))),
educ = ifelse(EDUCD <= 61, 'lths',
ifelse(EDUCD >= 62 & EDUCD <=64, 'hs',
ifelse(EDUCD >= 101, "bach+", NA)))
) %>%
filter(age >= 18)
dat6 <- na.omit(dat6)
table(dat6$race)
##
## hisp nh_othr nhb nhw
## 5724 3600 4533 34692
a. The United States
tabq3 <- table(dat6$race,dat6$inc)
tabq3 <- tabq3[,c(1,3,4,2)]
kable(tabq3)
| hisp |
2905 |
2238 |
455 |
126 |
| nh_othr |
1418 |
1243 |
664 |
275 |
| nhb |
2348 |
1613 |
468 |
104 |
| nhw |
12058 |
14107 |
5870 |
2657 |
tabq3a <- round(prop.table(tabq3,2), digits = 4)*100
kable(tabq3a)
| hisp |
15.51 |
11.66 |
6.10 |
3.98 |
| nh_othr |
7.57 |
6.47 |
8.90 |
8.70 |
| nhb |
12.54 |
8.40 |
6.28 |
3.29 |
| nhw |
64.38 |
73.47 |
78.72 |
84.03 |
b. Texas
dat6st <- dat6 %>%
filter(statefip == 48)
tabq3b <- table(dat6st$race,dat6st$inc)
tabq3b <- tabq3b[,c(1,3,4,2)]
kable(tabq3b)
| hisp |
534 |
439 |
84 |
21 |
| nh_othr |
87 |
74 |
51 |
18 |
| nhb |
176 |
117 |
35 |
5 |
| nhw |
649 |
779 |
391 |
219 |
tabq3bb <- round(prop.table(tabq3b,2), digits = 4)*100
kable(tabq3bb)
| hisp |
36.93 |
31.16 |
14.97 |
7.98 |
| nh_othr |
6.02 |
5.25 |
9.09 |
6.84 |
| nhb |
12.17 |
8.30 |
6.24 |
1.90 |
| nhw |
44.88 |
55.29 |
69.70 |
83.27 |
c. Bexar county
dat6cty <- dat6 %>%
filter(statefip == 48, countyfip == 29)
tabq3c <- table(dat6cty$race,dat6cty$inc)
tabq3c <- tabq3c[,c(1,3,4,2)]
kable(tabq3c)
| hisp |
61 |
47 |
11 |
0 |
| nh_othr |
2 |
3 |
1 |
1 |
| nhb |
6 |
5 |
1 |
0 |
| nhw |
31 |
35 |
14 |
11 |
tabq3cb <- round(prop.table(tabq3c,2), digits = 4)*100
kable(tabq3cb)
| hisp |
61 |
52.22 |
40.74 |
0.00 |
| nh_othr |
2 |
3.33 |
3.70 |
8.33 |
| nhb |
6 |
5.56 |
3.70 |
0.00 |
| nhw |
31 |
38.89 |
51.85 |
91.67 |
4. Using the Education categories from exercise 1b, determine the percentages by the race ethnicity categories in 3 in the following locations:
a. The United States
tabq4 <- table(dat6$race,dat6$educ)
tabq4 <- tabq4[,c(3,2,1)]
kable(tabq4)
| hisp |
2445 |
2155 |
1124 |
| nh_othr |
647 |
972 |
1981 |
| nhb |
1121 |
2138 |
1274 |
| nhw |
4236 |
14517 |
15939 |
tabq4a <- round(prop.table(tabq4,2), digits = 4)*100
kable(tabq4a)
| hisp |
28.94 |
10.89 |
5.53 |
| nh_othr |
7.66 |
4.91 |
9.75 |
| nhb |
13.27 |
10.81 |
6.27 |
| nhw |
50.14 |
73.38 |
78.45 |
b. Texas
tabq4b <- table(dat6st$race,dat6st$educ)
tabq4b <- tabq4[,c(3,2,1)]
kable(tabq4)
| hisp |
2445 |
2155 |
1124 |
| nh_othr |
647 |
972 |
1981 |
| nhb |
1121 |
2138 |
1274 |
| nhw |
4236 |
14517 |
15939 |
tabq4ba <- round(prop.table(tabq4b,2), digits = 4)*100
kable(tabq4a)
| hisp |
28.94 |
10.89 |
5.53 |
| nh_othr |
7.66 |
4.91 |
9.75 |
| nhb |
13.27 |
10.81 |
6.27 |
| nhw |
50.14 |
73.38 |
78.45 |
c. Bexar county
tabq4c <- table(dat6cty$race,dat6cty$educ)
tabq4c <- tabq4c[,c(3,2,1)]
kable(tabq4c)
| hisp |
35 |
57 |
27 |
| nh_othr |
1 |
1 |
5 |
| nhb |
1 |
7 |
4 |
| nhw |
7 |
28 |
56 |
tabq4cb <- round(prop.table(tabq4c,2), digits = 4)*100
kable(tabq4cb)
| hisp |
79.55 |
61.29 |
29.35 |
| nh_othr |
2.27 |
1.08 |
5.43 |
| nhb |
2.27 |
7.53 |
4.35 |
| nhw |
15.91 |
30.11 |
60.87 |
The above answers were done utilizing the reduced data frame with the set.seed = 1230. Below I will show the answers for the full 15.6 million data frame
1 a,b,&c
datf <- dat %>%
transmute(
race_eth = ifelse(RACE == 1 & HISPAN == 0, "nhw",
ifelse(RACE == 2 & HISPAN == 0, "nhb",
ifelse(HISPAND == 412 | HISPAND == 413 | HISPAND == 416, "hisp_nt",
ifelse((HISPAN >= 1 & HISPAN <= 4) & (HISPAND != 412 | HISPAND != 413 | HISPAND != 416), "hisp_othr",NA)))),
educ = ifelse(EDUCD <= 61, 'lths',
ifelse(EDUCD >= 62 & EDUCD <=64, 'hs',
ifelse(EDUCD >= 101, "bach+", NA))),
age = AGE
) %>%
filter(age >=18 & age <= 65)
head(datf)
## # A tibble: 6 x 3
## race_eth educ age
## <chr> <chr> <dbl>
## 1 nhb <NA> 19
## 2 nhw <NA> 57
## 3 nhw hs 63
## 4 nhw <NA> 53
## 5 nhw <NA> 24
## 6 nhw <NA> 19
dat5 <- na.omit(datf)
head(datf)
## # A tibble: 6 x 3
## race_eth educ age
## <chr> <chr> <dbl>
## 1 nhb <NA> 19
## 2 nhw <NA> 57
## 3 nhw hs 63
## 4 nhw <NA> 53
## 5 nhw <NA> 24
## 6 nhw <NA> 19
2.
tabq2 <- table(datf$race_eth,datf$educ)
kable(tabq2)
| hisp_nt |
8924 |
24723 |
44743 |
| hisp_othr |
175021 |
344346 |
362629 |
| nhb |
178808 |
332891 |
168760 |
| nhw |
2103427 |
1772274 |
464750 |
tabq2a<- round(prop.table(tabq2,2), digits = 4)*100
kable(tabq2a)
| hisp_nt |
0.36 |
1.00 |
4.30 |
| hisp_othr |
7.10 |
13.92 |
34.84 |
| nhb |
7.25 |
13.45 |
16.21 |
| nhw |
85.29 |
71.63 |
44.65 |
| ## 3. |
|
|
|
dat6f <- dat %>%
filter(INCTOT > 0 & INCTOT < 9000000) %>%
transmute(
sex = ifelse(SEX == 1, "m","f"),
statefip = STATEFIP,
countyfip = COUNTYFIP,
inc = ifelse(INCTOT <= 20000, "<$20k",
ifelse(INCTOT >= 20001 & INCTOT <= 60000, "$20k-$60k",
ifelse(INCTOT >= 60001 & INCTOT <= 120000, "$60k-$120k",
ifelse(INCTOT >= 120001,">$120k", NA)))),
age = AGE,
race = ifelse(RACE == 1 & HISPAN == 0, 'nhw',
ifelse(RACE == 2 & HISPAN == 0, 'nhb',
ifelse(RACE >=3 & HISPAN == 0,'nh_othr',
ifelse(HISPAN >= 1 & HISPAN <= 4, 'hisp', NA)))),
educ = ifelse(EDUCD <= 61, 'lths',
ifelse(EDUCD >= 62 & EDUCD <=64, 'hs',
ifelse(EDUCD >= 101, "bach+", NA)))
) %>%
filter(age >= 18)
dat6f <- na.omit(dat6f)
3a.
tabq3 <- table(dat6f$race,dat6f$inc)
tabq3 <- tabq3[,c(1,3,4,2)]
kable(tabq3)
| hisp |
453342 |
349335 |
75838 |
20481 |
| nh_othr |
214099 |
187799 |
105954 |
48055 |
| nhb |
366529 |
250461 |
72162 |
16088 |
| nhw |
1891496 |
2191779 |
932920 |
418208 |
tabq3a <- round(prop.table(tabq3,2), digits = 4)*100
kable(tabq3a)
| hisp |
15.50 |
11.73 |
6.39 |
4.07 |
| nh_othr |
7.32 |
6.30 |
8.93 |
9.56 |
| nhb |
12.53 |
8.41 |
6.08 |
3.20 |
| nhw |
64.66 |
73.57 |
78.60 |
83.17 |
3b.
dat6st <- dat6f %>%
filter(statefip == 48)
tabq3b <- table(dat6st$race,dat6st$inc)
tabq3b <- tabq3b[,c(1,3,4,2)]
kable(tabq3b)
| hisp |
90551 |
68405 |
13260 |
3348 |
| nh_othr |
11222 |
10821 |
7337 |
3295 |
| nhb |
26318 |
18778 |
5759 |
1234 |
| nhw |
101796 |
123869 |
59614 |
32112 |
tabq3bb <- round(prop.table(tabq3b,2), digits = 4)*100
kable(tabq3bb)
| hisp |
39.39 |
30.83 |
15.42 |
8.37 |
| nh_othr |
4.88 |
4.88 |
8.53 |
8.24 |
| nhb |
11.45 |
8.46 |
6.70 |
3.09 |
| nhw |
44.28 |
55.83 |
69.34 |
80.30 |
3c.
dat6cty <- dat6f %>%
filter(statefip == 48, countyfip == 29)
tabq3c <- table(dat6cty$race,dat6cty$inc)
tabq3c <- tabq3c[,c(1,3,4,2)]
kable(tabq3c)
| hisp |
10129 |
7790 |
1640 |
416 |
| nh_othr |
600 |
544 |
315 |
125 |
| nhb |
1038 |
754 |
298 |
80 |
| nhw |
3910 |
5079 |
2965 |
1551 |
tabq3cb <- round(prop.table(tabq3c,2), digits = 4)*100
kable(tabq3cb)
| hisp |
64.61 |
54.99 |
31.43 |
19.15 |
| nh_othr |
3.83 |
3.84 |
6.04 |
5.76 |
| nhb |
6.62 |
5.32 |
5.71 |
3.68 |
| nhw |
24.94 |
35.85 |
56.82 |
71.41 |
4a
tabq4 <- table(dat6f$race,dat6f$educ)
tabq4 <- tabq4[,c(3,2,1)]
kable(tabq4)
| hisp |
381474 |
333958 |
183564 |
| nh_othr |
98460 |
149201 |
308246 |
| nhb |
179637 |
328704 |
196899 |
| nhw |
662036 |
2283014 |
2489353 |
tabq4a <- round(prop.table(tabq4,2), digits = 4)*100
kable(tabq4a)
| hisp |
28.86 |
10.79 |
5.78 |
| nh_othr |
7.45 |
4.82 |
9.70 |
| nhb |
13.59 |
10.62 |
6.20 |
| nhw |
50.09 |
73.77 |
78.33 |
4b
tabq4b <- table(dat6st$race,dat6st$educ)
tabq4b <- tabq4[,c(3,2,1)]
kable(tabq4)
| hisp |
381474 |
333958 |
183564 |
| nh_othr |
98460 |
149201 |
308246 |
| nhb |
179637 |
328704 |
196899 |
| nhw |
662036 |
2283014 |
2489353 |
tabq4ba <- round(prop.table(tabq4b,2), digits = 4)*100
kable(tabq4a)
| hisp |
28.86 |
10.79 |
5.78 |
| nh_othr |
7.45 |
4.82 |
9.70 |
| nhb |
13.59 |
10.62 |
6.20 |
| nhw |
50.09 |
73.77 |
78.33 |
4c
tabq4c <- table(dat6cty$race,dat6cty$educ)
tabq4c <- tabq4c[,c(3,2,1)]
kable(tabq4c)
| hisp |
6990 |
8461 |
4524 |
| nh_othr |
230 |
420 |
934 |
| nhb |
362 |
904 |
904 |
| nhw |
909 |
4123 |
8473 |
tabq4cb <- round(prop.table(tabq4c,2), digits = 4)*100
kable(tabq4cb)
| hisp |
82.32 |
60.84 |
30.50 |
| nh_othr |
2.71 |
3.02 |
6.30 |
| nhb |
4.26 |
6.50 |
6.09 |
| nhw |
10.71 |
29.64 |
57.11 |