library(tidyverse)library(tidycensus)library(scales)library(viridis)options(scipen =999)# load a list of all decennial census data from the Redistricting Filepl_vars_2020 <-load_variables(2020, "pl", cache = T) # load a list of all acs variablesacs_vars_2020 <-load_variables(2020, "acs5", cache = T)# Total Population - P1_001N# Percent Hispanic or Latino - P2_002N/P1_001N# Percent White alone, not Hispanic or Latino - P2_005N/P1_001N# Percent Black alone, not Hispanic or Latino - P2_006N/P1_001N# Percent Asian alone, not Hispanic or Latino - P2_008N/P1_001N# import population by race for each county in Mississippiraw_race_2020 =get_decennial(geography ="county", variables=c("P1_001N", "P2_002N", "P2_005N", "P2_006N", "P2_007N", "P2_008N", "P2_009N", "P2_010N"),state='MS', geometry =FALSE, year =2020,output ="wide")
Homework: tidy race/ethnicity data
######### Create Tidy Data ############pop20 <- raw_race_2020 |>rename(pop20 = P1_001N, # !!Total:, universe = RACEhisp_pop20 = P2_002N, # Hispanic or Latinoblack_pop20 = P2_006N, # Not Hispanic or Latino:!!Population of one race:!!Black or African American aloneasian_pop20 = P2_008N, # Not Hispanic or Latino:!!Population of one race:!!Asian alonewhite_pop20 = P2_005N, # Not Hispanic or Latino:!!Population of one race:!!White alonenative_pop20 = P2_007N, # Not Hispanic or Latino:!!Population of one race:!!American Indian and Alaska Native alonehaw_pi_pop20 = P2_009N, # Not Hispanic or Latino:!!Population of one race:!!Native Hawaiian and Other Pacific Islander aloneother_pop20 = P2_010N) |># Not Hispanic or Latino:!!Population of one race:!!Some Other Race alonemutate(bipoc_pop20 = pop20 - white_pop20,pct_hisp=round(hisp_pop20/pop20, 3), pct_white_alone_not_hisp =round(white_pop20/pop20, 3), pct_black_alone_not_hisp =round(black_pop20/pop20, 3), pct_asian_alone_not_hisp =round(asian_pop20/pop20, 3), pct_indiginous_alone_not_hisp =round(native_pop20/pop20, 3), pct_haw_pi_alone_not_hisp =round(haw_pi_pop20/pop20, 3), pct_other_alone_not_hisp =round(other_pop20/pop20, 3), pct_bipoc_20 =round(bipoc_pop20/pop20, 3))
Homework: import ACS data
get_acs() defaults to 5-year data
# import mhi estimate for Mississippi countiesraw_mhi_2020 <-get_acs(geography ="county", variables ="B19013_001", state ="MS",year =2020, output ="wide")## create data frame with estimate, margin of error, and GEOID for joiningmhi <- raw_mhi_2020 |>rename(mhi20 = B19013_001E, mhi_moe = B19013_001M) |>select(GEOID, mhi20, mhi_moe)# join datamiss_income_race <- pop20 |>left_join(mhi, by ="GEOID")
GEOID
NAME
pop20
hisp_pop20
white_pop20
black_pop20
native_pop20
asian_pop20
haw_pi_pop20
other_pop20
bipoc_pop20
pct_hisp
pct_white_alone_not_hisp
pct_black_alone_not_hisp
pct_asian_alone_not_hisp
pct_indiginous_alone_not_hisp
pct_haw_pi_alone_not_hisp
pct_other_alone_not_hisp
pct_bipoc_20
mhi20
mhi_moe
28001
Adams County, Mississippi
29538
1012
10926
16709
56
165
7
35
18612
0.034
0.370
0.566
0.006
0.002
0.000
0.001
0.630
30633
1459
28003
Alcorn County, Mississippi
34740
1248
27738
4316
61
180
19
50
7002
0.036
0.798
0.124
0.005
0.002
0.001
0.001
0.202
40938
2944
Homework: create summary stats
######### Create summary stats ## using the percent function from the scales package to format nicelyincome_race_stats <- miss_income_race |>summarise(Counties =n(),`Average Median Household Income`=dollar(mean(mhi20)),`Percent Latinx`=percent(sum(hisp_pop20)/sum(pop20)),`Percent White Alone, Not Latinx`=percent(sum(white_pop20)/sum(pop20)),`Percent Black Alone, Not Latinx`=percent(sum(black_pop20)/sum(pop20)),`Percent Asian Alone, Not Latinx`=percent(sum(asian_pop20)/sum(pop20)),`Percent Other Alone, Not Latinx`=percent(sum(other_pop20)/sum(pop20)),`Percent BIPOC`=percent(sum(bipoc_pop20)/sum(pop20)))
Counties
Average Median Household Income
Percent Latinx
Percent White Alone, Not Latinx
Percent Black Alone, Not Latinx
Percent Asian Alone, Not Latinx
Percent Other Alone, Not Latinx
Percent BIPOC
82
$40,818.78
4%
55%
36%
1%
0%
45%
Defining colors in R: Color names
R also recognizes hundreds of predefined color names like “blue” and “ivory”. You can find a full list of them here: ggplot colors
You can list them with the colors() function
type colors() in your Console and press return
Defining colors in R: hex codes
Hex colors are 6-digit way to represent a color that is common in web development. You can pick colors and find their hex codes here:
gsub(): function to replace text, use it in a mutate
gsub(pattern, replacement, x)
Homework: bar chart of deviation from average
Homework: code, bar chart of deviation from average
scale_fill_gradient(): creates a gradient palette based on two colors
define low and high colors
ms_income_diff <-ggplot(data = miss_income_race_diff, aes(x=mhi_diff_from_avg,y=reorder(County, mhi_diff_from_avg),fill = pct_bipoc_20)) +geom_col(width =1) +# create color palette with hex code for high and low valuescale_fill_gradient(low ="#ffffff", high ="#5e1b81",name ="Percent BIPOC",labels =percent_format(accuracy =1)) +scale_x_continuous(labels =dollar_format(accuracy =1)) +labs(x ="Median Household Income, Difference from Average", y ="",title="Mississippi Counties, Median Household Income ",subtitle="Difference from Average ($49,230)") +theme_bw() +theme(axis.text.y =element_text(size =5)) # change the text size for y-axis labelsms_income_diff
Write out your data and plots
Write data to processed folder. No need to write out the raw census data since you can reimport it with this script.
Because ACS data is an estimate, the numbers are not 100% certain. The Census provides the Margin Of Error to show how accurate.
The Margin of Error is the 90% confidence interval. For example if:
the estimate for Median Household Income in Adams County, MS = $30,633
and the margin of error = $1,459
We can be 90% certain the the Median Household Income is $30,633 plus or minus $1,459
We’ll import Median Household Income data to explore Margin of Error and how to handle it
ACS - import mhi for all counties
Import median household income for all counties in the country. Create a chart to compare MHI and look at the margin of error.
# import mhi estimate for all countiesraw_mhi_2020_all_states <-get_acs(geography ="county", variables =c(mhi ="B19013_001"), year =2021,output ="wide")
GEOID
NAME
mhiE
mhiM
01001
Autauga County, Alabama
62660
4834
01003
Baldwin County, Alabama
64346
2377
01005
Barbour County, Alabama
36422
3282
01007
Bibb County, Alabama
54277
7325
01009
Blount County, Alabama
52830
3197
ACS example: process ACS mhi data for plot
separate(): divide column into multiple multiple columns
separate(col, into, sep = ““)
## create data frame with county and state column## calculate margin of error as proportion of valuemhi <- raw_mhi_2020_all_states |>separate(NAME, into =c("county", "state"), sep =",") |>select(GEOID,county, state, mhiE, mhiM) |>mutate(moe_ratio =round(mhiM/mhiE, 4))
GEOID
county
state
mhiE
mhiM
moe_ratio
01001
Autauga County
Alabama
62660
4834
0.0771
01003
Baldwin County
Alabama
64346
2377
0.0369
Count plot
First we’ll compare the range in MHI in each state with a count plot. A count plot is a scatterplot with categories as one of the variables. You can see the range of values for each category, and compare them.
Count plot, code
ggplot(data = mhi, aes(x=state, y=mhiE)) +geom_count(alpha=0.5, size =2, color ="green4") +scale_y_continuous(labels =dollar_format()) +theme(axis.title=element_blank(),axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +labs(title ="Estimated Median Household Income by County",caption ="Source: American Community Survey, 2016-20")
Count plot - to explore margin of error
You should always look at the margin of error and assess whether the estimate is good enough for you to keep in your analysis. See the margin of error, sorted by highest proportion). The margin of error is VERY high in some counties.
GEOID
county
state
mhiE
mhiM
moe_ratio
28063
Jefferson County
Mississippi
29205
26465
0.9062
28055
Issaquena County
Mississippi
17109
10468
0.6118
48269
King County
Texas
42125
24798
0.5887
48413
Schleicher County
Texas
61094
34090
0.5580
13007
Baker County
Georgia
33417
16333
0.4888
13307
Webster County
Georgia
32083
15490
0.4828
35011
De Baca County
New Mexico
32750
15191
0.4638
37177
Tyrrell County
North Carolina
40938
18714
0.4571
30069
Petroleum County
Montana
54375
23010
0.4232
35033
Mora County
New Mexico
37549
15884
0.4230
48109
Culberson County
Texas
34239
14292
0.4174
20101
Lane County
Kansas
44219
17809
0.4027
20171
Scott County
Kansas
46583
18656
0.4005
48211
Hemphill County
Texas
54867
21516
0.3921
48385
Real County
Texas
44083
16813
0.3814
13125
Glascock County
Georgia
53125
19674
0.3703
Count plot, code - to explore margin of error
ggplot(data = mhi, aes(x=state, y=mhiM)) +geom_count(alpha=0.5, size =2, color ="green4") +scale_y_continuous(labels =dollar_format()) +theme(axis.title=element_blank(),axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +labs(title ="Margin of Error, Median Household Income by County",caption ="Source: American Community Survey, 2016-20")
Count plot - to explore margin of error
Count plot - to explore margin of error as proportion
Count plot EXCLUDING HIGH MARGIN OF ERROR
There is not a rule about margin or error. In general, you get to know your data, and determine what margin of error you can tolerate depending on the question you are asking.
In this case, I have a high tolerance since I am just getting a snapshot of the country with a chart. I think that if the margin of error is higher than 1/5th of the MHI, the estimate is not precise enough to include.
mhi_final <- mhi |>filter(moe_ratio < .2)
In-class - create data and plot of educational attainment
create a new script, save it in part2/scripts/data_processing as educational_attainment_by_state
Use the educational attainment data for every state from the American Community Survey to create a processed dataframe and plot of the percent of people with at least a Bachelors Degree in each state. See instructions on this and the next 3 slides:
In-class (part 1) - find EDUCATIONAL ATTAINMENT data
Use the load_variables() function to look at the variables in the available variables in the 5-year ACS.
Open the dataframe and search the variables for ‘educational attainment’
There will be a lot of them, find the variables where: CONCEPT = “EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER” without any other words
Look at the variable names - they are in the format Table_variable
ex. B15003_002:
table = B15003
variable = 002 (No schooling completed)
In-class (part 2) - find EDUCATIONAL ATTAINMENT data
You can filter the acs_vars dataframe by table to view only those variables
filter rows with “B15003” in the name
In-class 3 (part 3) - import EDUCATIONAL ATTAINMENT data for all states and plot
Filtered by b15003_vars is a more manageable way to look at the variables about educational attainment.
Use it to identify the variables you need to calculate the percent of people with at least a bachelors degree
Import those variables for every state using the get_acs() function
Process the data to create a tidy dataframe with the percent bachelors degree
Create a bar chart to display the percent of people with at least a bachelors degree for every state, ordered from lowest to highest
write out the processed data, and bar chart to the correct folders in your part2 folder
Homework 7a.
Clean up your script from the in-class assignment processing educational attainment data. Submit the script on canvas.
Homework 7b.
Download census data to answer a research question about access to home ownership in cities across the country inspired by this article about the decline in access to home ownership in San Diego from the New York Times.
When Ms. Coats moved into the Baxter Street house, a family needed right around the area’s median income to afford the $82 monthly mortgage payment — the definition of middle class. Today a typical Clairemont home costs $850,000, up 30 percent from 2019. A family would need to make about double San Diego’s median income to afford one.