# Packages ----
library(tidyverse)
library(tidycensus)
library(scales)
library(viridis)

# Strongly prefer fixed notation over scientific notation when printing numbers
options(scipen = 999)

# load a list of all decennial census data from the Redistricting File
pl_vars_2020 <- load_variables(2020, "pl", cache = TRUE)

# load a list of all acs variables
acs_vars_2020 <- load_variables(2020, "acs5", cache = TRUE)

# Variables used below, and how the shares are derived from them:
# Total Population - P1_001N
# Percent Hispanic or Latino - P2_002N/P1_001N
# Percent White alone, not Hispanic or Latino - P2_005N/P1_001N
# Percent Black alone, not Hispanic or Latino - P2_006N/P1_001N
# Percent Asian alone, not Hispanic or Latino - P2_008N/P1_001N

# import population by race for each county in Mississippi
# (output = "wide" returns one row per county and one column per variable)
raw_race_2020 <- get_decennial(
  geography = "county",
  variables = c(
    "P1_001N", "P2_002N", "P2_005N", "P2_006N",
    "P2_007N", "P2_008N", "P2_009N", "P2_010N"
  ),
  state = "MS",
  geometry = FALSE,
  year = 2020,
  output = "wide"
)
Homework: tidy race/ethnicity data
######### Create Tidy Data ############
# Step 1: swap the census variable codes for readable column names.
# Step 2: derive the BIPOC count (everyone who is not White alone,
#         not Hispanic or Latino).
# Step 3: express each group as a share of total population, rounded to
#         three decimal places.
pop20 <- raw_race_2020 |>
  rename(
    pop20 = P1_001N, # !!Total:, universe = RACE
    hisp_pop20 = P2_002N, # Hispanic or Latino
    # The remaining groups are all "Not Hispanic or Latino:!!Population of
    # one race:"
    white_pop20 = P2_005N, # White alone
    black_pop20 = P2_006N, # Black or African American alone
    native_pop20 = P2_007N, # American Indian and Alaska Native alone
    asian_pop20 = P2_008N, # Asian alone
    haw_pi_pop20 = P2_009N, # Native Hawaiian and Other Pacific Islander alone
    other_pop20 = P2_010N # Some Other Race alone
  ) |>
  mutate(bipoc_pop20 = pop20 - white_pop20) |>
  mutate(
    pct_hisp = round(hisp_pop20 / pop20, 3),
    pct_white_alone_not_hisp = round(white_pop20 / pop20, 3),
    pct_black_alone_not_hisp = round(black_pop20 / pop20, 3),
    pct_asian_alone_not_hisp = round(asian_pop20 / pop20, 3),
    pct_indiginous_alone_not_hisp = round(native_pop20 / pop20, 3),
    pct_haw_pi_alone_not_hisp = round(haw_pi_pop20 / pop20, 3),
    pct_other_alone_not_hisp = round(other_pop20 / pop20, 3),
    pct_bipoc_20 = round(bipoc_pop20 / pop20, 3)
  )
Homework: import ACS data
get_acs() defaults to 5-year data
# import mhi estimate for Mississippi counties
# (wide output gives an estimate column ending in E and a margin-of-error
# column ending in M)
raw_mhi_2020 <- get_acs(
  geography = "county",
  state = "MS",
  variables = "B19013_001",
  year = 2020,
  output = "wide"
)

## create data frame with estimate, margin of error, and GEOID for joining
## (select() renames and keeps the columns in a single step)
mhi <- select(
  raw_mhi_2020,
  GEOID,
  mhi20 = B19013_001E,
  mhi_moe = B19013_001M
)

# join data: keep every county in pop20 and attach its income columns
miss_income_race <- left_join(pop20, mhi, by = "GEOID")
GEOID
NAME
pop20
hisp_pop20
white_pop20
black_pop20
native_pop20
asian_pop20
haw_pi_pop20
other_pop20
bipoc_pop20
pct_hisp
pct_white_alone_not_hisp
pct_black_alone_not_hisp
pct_asian_alone_not_hisp
pct_indiginous_alone_not_hisp
pct_haw_pi_alone_not_hisp
pct_other_alone_not_hisp
pct_bipoc_20
mhi20
mhi_moe
28001
Adams County, Mississippi
29538
1012
10926
16709
56
165
7
35
18612
0.034
0.370
0.566
0.006
0.002
0.000
0.001
0.630
30633
1459
28003
Alcorn County, Mississippi
34740
1248
27738
4316
61
180
19
50
7002
0.036
0.798
0.124
0.005
0.002
0.001
0.001
0.202
40938
2944
Homework: create summary stats
######### Create summary stats
## using the percent function from the scales package to format nicely
# One-row summary across all counties. Each share is the summed group
# population divided by the summed total population; the income figure is
# the plain mean of the county medians.
income_race_stats <- summarise(
  miss_income_race,
  Counties = n(),
  `Average Median Household Income` = dollar(mean(mhi20)),
  `Percent Latinx` = percent(sum(hisp_pop20) / sum(pop20)),
  `Percent White Alone, Not Latinx` = percent(sum(white_pop20) / sum(pop20)),
  `Percent Black Alone, Not Latinx` = percent(sum(black_pop20) / sum(pop20)),
  `Percent Asian Alone, Not Latinx` = percent(sum(asian_pop20) / sum(pop20)),
  `Percent Other Alone, Not Latinx` = percent(sum(other_pop20) / sum(pop20)),
  `Percent BIPOC` = percent(sum(bipoc_pop20) / sum(pop20))
)
Counties
Average Median Household Income
Percent Latinx
Percent White Alone, Not Latinx
Percent Black Alone, Not Latinx
Percent Asian Alone, Not Latinx
Percent Other Alone, Not Latinx
Percent BIPOC
82
$40,818.78
4%
55%
36%
1%
0%
45%
Defining colors in R: Color names
R also recognizes hundreds of predefined color names like “blue” and “ivory”. You can find a full list of them here: ggplot colors
You can list them with the colors() function
type colors() in your Console and press return
Defining colors in R: hex codes
Hex colors are a 6-digit way to represent a color that is common in web development. You can pick colors and find their hex codes here:
gsub(): function to replace text, use it in a mutate
gsub(pattern, replacement, x)
Homework: bar chart of deviation from average
Homework: code, bar chart of deviation from average
scale_fill_gradient(): creates a gradient palette based on two colors
define low and high colors
# Horizontal bar chart: one bar per county showing how far its median
# household income sits from the average, with counties ordered by that
# difference and bars shaded by percent BIPOC.
# NOTE(review): the average quoted in the subtitle is hard-coded - confirm it
# matches the value used to compute mhi_diff_from_avg.
ms_income_diff <- miss_income_race_diff |>
  ggplot(
    aes(
      x = mhi_diff_from_avg,
      y = reorder(County, mhi_diff_from_avg),
      fill = pct_bipoc_20
    )
  ) +
  geom_col(width = 1) +
  labs(
    title = "Mississippi Counties, Median Household Income ",
    subtitle = "Difference from Average ($49,230)",
    x = "Median Household Income, Difference from Average",
    y = ""
  ) +
  scale_x_continuous(labels = dollar_format(accuracy = 1)) +
  # create color palette with hex code for high and low value
  scale_fill_gradient(
    name = "Percent BIPOC",
    low = "#ffffff",
    high = "#5e1b81",
    labels = percent_format(accuracy = 1)
  ) +
  theme_bw() +
  # change the text size for y-axis labels
  theme(axis.text.y = element_text(size = 5))

# display the plot
ms_income_diff
Write out your data and plots
Write data to processed folder. No need to write out the raw census data since you can reimport it with this script.
Because ACS data is an estimate, the numbers are not 100% certain. The Census provides the Margin of Error to show how accurate each estimate is.
The Margin of Error is reported at the 90% confidence level. For example, if:
the estimate for Median Household Income in Adams County, MS = $30,633
and the margin of error = $1,459
We can be 90% certain that the Median Household Income is $30,633 plus or minus $1,459
We’ll import Median Household Income data to explore Margin of Error and how to handle it
ACS - import mhi for all counties
Import median household income for all counties in the country. Create a chart to compare MHI and look at the margin of error.
# import mhi estimate for all counties
# No state is given, so every county is requested. Naming the variable "mhi"
# makes the wide output columns mhiE (estimate) and mhiM (margin of error).
raw_mhi_2022_all_states <- get_acs(
  year = 2022,
  geography = "county",
  variables = c(mhi = "B19013_001"),
  output = "wide"
)
GEOID
NAME
mhiE
mhiM
01001
Autauga County, Alabama
68315
4941
01003
Baldwin County, Alabama
71039
2374
01005
Barbour County, Alabama
39712
3289
01007
Bibb County, Alabama
50669
8260
01009
Blount County, Alabama
57440
3308
ACS example: process ACS mhi data for plot
separate(): divide a column into multiple columns
separate(col, into, sep = "")
## create data frame with county and state column
## calculate margin of error as proportion of value
mhi <- raw_mhi_2022_all_states |>
  # NAME looks like "Autauga County, Alabama". Split on comma + space so the
  # state values do not start with a blank (" Alabama").
  separate(NAME, into = c("county", "state"), sep = ", ") |>
  select(GEOID, county, state, mhiE, mhiM) |>
  # share of the estimate covered by its margin of error, to 4 decimals
  mutate(moe_ratio = round(mhiM / mhiE, 4))
GEOID
county
state
mhiE
mhiM
moe_ratio
01001
Autauga County
Alabama
68315
4941
0.0723
01003
Baldwin County
Alabama
71039
2374
0.0334
Count plot
First we’ll compare the range in MHI in each state with a count plot. A count plot is a scatterplot with categories as one of the variables. You can see the range of values for each category, and compare them.
Count plot, code
# Count plot of the county-level income estimates: one column of points per
# state, so the spread of county values can be compared across states.
ggplot(data = mhi, aes(x = state, y = mhiE)) +
  geom_count(alpha = 0.5, size = 2, color = "green4") +
  scale_y_continuous(labels = dollar_format()) +
  theme(
    axis.title = element_blank(),
    # rotate the state names so they do not overlap
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) +
  labs(
    title = "Estimated Median Household Income by County",
    # the data is imported with year = 2022, i.e. the 2018-22 5-year ACS
    caption = "Source: American Community Survey, 2018-22"
  )
Count plot - to explore margin of error
You should always look at the margin of error and assess whether the estimate is good enough for you to keep in your analysis. See the margin of error below, sorted by highest proportion. The margin of error is VERY high in some counties.
GEOID
county
state
mhiE
mhiM
moe_ratio
35011
De Baca County
New Mexico
34702
25897
0.7463
32009
Esmeralda County
Nevada
40694
30242
0.7432
48243
Jeff Davis County
Texas
38125
25205
0.6611
48109
Culberson County
Texas
35924
18455
0.5137
48269
King County
Texas
59375
29395
0.4951
20101
Lane County
Kansas
52222
25196
0.4825
48271
Kinney County
Texas
52386
23728
0.4529
48127
Dimmit County
Texas
27374
12374
0.4520
30069
Petroleum County
Montana
57981
25523
0.4402
30109
Wibaux County
Montana
58750
25620
0.4361
32011
Eureka County
Nevada
73929
32052
0.4336
13307
Webster County
Georgia
35000
14998
0.4285
16025
Camas County
Idaho
63750
26584
0.4170
48311
McMullen County
Texas
60313
24836
0.4118
02070
Dillingham Census Area
Alaska
69412
26682
0.3844
46071
Jackson County
South Dakota
26078
9885
0.3791
Count plot, code - to explore margin of error
# Count plot of the county-level margins of error: one column of points per
# state, to spot states whose counties have unusually large margins.
ggplot(data = mhi, aes(x = state, y = mhiM)) +
  geom_count(alpha = 0.5, size = 2, color = "green4") +
  scale_y_continuous(labels = dollar_format()) +
  theme(
    axis.title = element_blank(),
    # rotate the state names so they do not overlap
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) +
  labs(
    title = "Margin of Error, Median Household Income by County",
    # the data is imported with year = 2022, i.e. the 2018-22 5-year ACS
    caption = "Source: American Community Survey, 2018-22"
  )
Count plot - to explore margin of error
Count plot - to explore margin of error as proportion
Count plot EXCLUDING HIGH MARGIN OF ERROR
There is no fixed rule about margin of error. In general, you get to know your data, and determine what margin of error you can tolerate depending on the question you are asking.
In this case, I have a high tolerance since I am just getting a snapshot of the country with a chart. I think that if the margin of error is higher than 1/5th of the MHI, the estimate is not precise enough to include.
# Keep only counties whose margin of error is less than one fifth of the
# estimate. filter() also drops rows where moe_ratio is NA.
mhi_final <- filter(mhi, moe_ratio < 0.2)
In-class - create data and plot of educational attainment
create a new script, save it in part2/scripts as educational_attainment_by_state
Use the educational attainment data for every state from the American Community Survey to create a processed dataframe and plot of the percent of people with at least a Bachelors Degree in each state. See instructions on this and the next 3 slides:
In-class (part 1) - find EDUCATIONAL ATTAINMENT data
Use the load_variables() function to look at the variables available in the 5-year ACS.
Open the dataframe and search the variables for ‘educational attainment’
There will be a lot of them, find the variables where: CONCEPT = “EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER” without any other words
Look at the variable names - they are in the format Table_variable
ex. B15003_002:
table = B15003
variable = 002 (No schooling completed)
In-class (part 2) - find EDUCATIONAL ATTAINMENT data
You can filter the acs_vars dataframe by table to view only those variables
filter rows with “B15003” in the name
In-class (part 3) - import EDUCATIONAL ATTAINMENT data for all states and plot
The filtered b15003_vars dataframe is a more manageable way to look at the variables about educational attainment.
Use it to identify the variables you need to calculate the percent of people with at least a bachelors degree
Import those variables for every state using the get_acs() function
Process the data to create a tidy dataframe with the percent bachelors degree
Create a bar chart to display the percent of people with at least a bachelors degree for every state, ordered from lowest to highest
write out the processed data, and bar chart to the correct folders in your part2 folder
Homework 8a.
Clean up your script from the in-class assignment processing educational attainment data. Submit the script on canvas.
Homework 8b.
Download census data to answer a research question about access to home ownership in cities across the country inspired by this article about the decline in access to home ownership in San Diego from the New York Times.
When Ms. Coats moved into the Baxter Street house, a family needed right around the area’s median income to afford the $82 monthly mortgage payment — the definition of middle class. Today a typical Clairemont home costs $850,000, up 30 percent from 2019. A family would need to make about double San Diego’s median income to afford one.