OurCommunities_DataPrep.knit

This creates datasets for a shiny app: https://isquared.shinyapps.io/OurCommunities_Maryland/. This process will speed up the app performance by skipping API access each time when the app is loaded each time.

Data come from US Census Bureau data tables, based on decennial censuses or American Community Surveys.
*** Year = 2020
*** All ACS estimates are five-year data (i.e., 2016-2020)

See ACS tables here:
* https://api.census.gov/data/2020/acs/acs5/variables
* https://api.census.gov/data/2020/acs/acs1/variables

0.1 Access API data for 2020

0.1.1 Total pop (Census)

County-level

dta <- get_decennial(
            geography = "county",
            variables = "P1_001N", #Total pop
            year = 2020,
            state = "MD", 
            geometry = FALSE
        )

    colnames(dta)<-tolower(colnames(dta))
    
    md_co_totalpop <- dta %>%
        mutate(
            county = name, 
            county = gsub(", Maryland", "", county), 
            county = gsub(" County", "", county),
            totalpop = value
            )%>%
        select(-variable, -value)

Tract-level

    dta <- get_decennial(
        geography = "tract",
        variables = "P1_001N", #Total pop
        year = 2020,
        state = "MD", 
        geometry = FALSE
    )
    
    colnames(dta)<-tolower(colnames(dta))
    
    md_tr_totalpop <- dta %>%
        mutate(
            county = name, 
            county = gsub(", Maryland", "", county), 
            county = gsub(" County", "", county),
            county = sub("^.*?, ", "", county), 
            totalpop = value
            )%>%
        select(-variable, -value)

0.1.2 Pop by race (ACS)

    race_vars <- c(
        NH_White = "B03002_003", #NH White 
        NH_Black = "B03002_004",
        NH_AIAN = "B03002_005",
        NH_Asian = "B03002_006",
        NH_NHPI = "B03002_007",
        NH_Other = "B03002_008",
        NH_Multiple = "B03002_009",
        Hispanic = "B03002_012"
    )

County-level

    dta <- get_acs(
            state = "MD",
            geography = "county",
            variables = race_vars, #8 mutually exclusive groups
            summary_var = "B03002_001", #Total population 
            year = 2020
        )

    colnames(dta)<-tolower(colnames(dta))
    
    md_co_racelong <- dta %>%
        mutate(
            county=name, 
            county = gsub(", Maryland", "", county), 
            county = gsub(" County", "", county),
            size = estimate, 
            percent = round( 100 * (estimate / summary_est), 2) ) %>%
        select(geoid, county, variable, size, percent)%>%
        
        rename(race=variable)%>%
        
        mutate(race = ifelse(race=="NH_AIAN", "NH_AIAN/NHPI/Other", 
                         ifelse(race=="NH_NHPI", "NH_AIAN/NHPI/Other", 
                            ifelse(race=="NH_Other", "NH_AIAN/NHPI/Other", 
                                   race))))%>%
        
        group_by(geoid, county, race)%>%
        summarize(
            size = sum(size), 
            percent = sum(percent) )%>%
        
        mutate(group="race")

0.1.3 Median age (ACS)

County-level

    dta <- get_acs(
        state = "MD", 
        geography = "county", 
        variable = "B06002_001", #Median age
        year = 2020
    )
    
    colnames(dta)<-tolower(colnames(dta))
    
    md_co_medage <- dta %>%
        select(geoid, estimate, moe) %>%
        mutate(medageround=round(estimate)) %>%
        rename(
            medage = estimate,
            medage_moe = moe)

Tract-level

    dta <- get_acs(
        state = "MD", 
        geography = "tract", 
        variable = "B06002_001", #Median age
        year = 2020, 
        geometry = FALSE
    )
    
    colnames(dta)<-tolower(colnames(dta))
    
    md_tr_medage <- dta %>%
        select(geoid, estimate, moe) %>%
        mutate(medageround=round(estimate)) %>%
        rename(
            medage = estimate,
            medage_moe = moe)

0.1.4 Pop by age (ACS)

    maleage_vars <- c(
        age_0_4  = "B01001_003",
        age_5_9  = "B01001_004",
        age_10_14    = "B01001_005",
        age_15_17    = "B01001_006",
        age_18_19    = "B01001_007",
        age_20   = "B01001_008",
        age_21   = "B01001_009",
        age_22_24    = "B01001_010",
        age_25_29    = "B01001_011",
        age_30_34    = "B01001_012",
        age_35_39    = "B01001_013",
        age_40_44    = "B01001_014",
        age_45_49    = "B01001_015",
        age_50_54    = "B01001_016",
        age_55_59    = "B01001_017",
        age_60_61    = "B01001_018",
        age_62_64    = "B01001_019",
        age_65_66    = "B01001_020",
        age_67_69    = "B01001_021",
        age_70_74    = "B01001_022",
        age_75_79    = "B01001_023",
        age_80_84    = "B01001_024",
        age_85_and_over  = "B01001_025"
    )
    
    femaleage_vars <- c(
        age_0_4  = "B01001_027",
        age_5_9  = "B01001_028",
        age_10_14    = "B01001_029",
        age_15_17    = "B01001_030",
        age_18_19    = "B01001_031",
        age_20   = "B01001_032",
        age_21   = "B01001_033",
        age_22_24    = "B01001_034",
        age_25_29    = "B01001_035",
        age_30_34    = "B01001_036",
        age_35_39    = "B01001_037",
        age_40_44    = "B01001_038",
        age_45_49    = "B01001_039",
        age_50_54    = "B01001_040",
        age_55_59    = "B01001_041",
        age_60_61    = "B01001_042",
        age_62_64    = "B01001_043",
        age_65_66    = "B01001_044",
        age_67_69    = "B01001_045",
        age_70_74    = "B01001_046",
        age_75_79    = "B01001_047",
        age_80_84    = "B01001_048",
        age_85_and_over  = "B01001_049"
    )

County-level

    dtamale <- get_acs(
        state = "MD",
        geography = "county",
        variables = maleage_vars,
        summary_var = "B01001_002", #Total population 
        year = 2020
    )
    
    dtafemale <- get_acs(
        state = "MD",
        geography = "county",
        variables = femaleage_vars,
        summary_var = "B01001_026", #Total population 
        year = 2020
    )    
    
    colnames(dtamale)<-tolower(colnames(dtamale))
    colnames(dtafemale)<-tolower(colnames(dtafemale))
    
    dta<-left_join(dtamale, dtafemale, 
                   by=c("geoid", "variable"))
    
    md_co_agelong <- dta %>%
        rename(
            agegroup=variable,
            name = name.x)%>%  
        mutate(
            estimate = estimate.x + estimate.y,
            summary_est = summary_est.x + summary_est.y, 
            percent = round( 100 * (estimate / summary_est), 2)  )%>%
        select(geoid, name, agegroup, estimate, summary_est, percent)%>%
        
        mutate(
            #beginning age for the age interval
            age=as.numeric(sapply(strsplit(agegroup,"_"), `[`, 2)), 
            age=floor(age/5)*5
        )%>%
        
        group_by(geoid, name, age)%>%
        summarize(percent = sum(percent))%>%
        
        mutate(
            agegroup=paste0(age, "-", age+4),
            county = name, 
            county = gsub(", Maryland", "", county), 
            county = gsub(" County", "", county))

    md_co_pop25over <- md_co_agelong %>%
        filter(age>=25)%>%
        group_by(geoid)%>%
        summarize_at(vars(percent), funs(sum(.)))%>%
        rename(percent25over = percent)
    
    md_co_pop25over <-left_join(md_co_totalpop, md_co_pop25over, 
                                by=c("geoid"))%>%
        mutate(pop25over=round(totalpop*percent25over/100, 0))%>%
        select(geoid, pop25over)

Tract-level

    dtamale <- get_acs(
        state = "MD",
        geography = "tract",
        variables = maleage_vars,
        summary_var = "B01001_002", #Total population 
        year = 2020
    )
    
    dtafemale <- get_acs(
        state = "MD",
        geography = "tract",
        variables = femaleage_vars,
        summary_var = "B01001_026", #Total population 
        year = 2020
    )    
    
    colnames(dtamale)<-tolower(colnames(dtamale))
    colnames(dtafemale)<-tolower(colnames(dtafemale))
    
    dta<-left_join(dtamale, dtafemale, 
                   by=c("geoid", "variable"))
    
    md_tr_agelong <- dta %>%
        rename(
            agegroup=variable,
            name = name.x)%>%   
        mutate(
            estimate = estimate.x + estimate.y,
            summary_est = summary_est.x + summary_est.y, 
            percent = round( 100 * (estimate / summary_est), 2)  )%>%
        select(geoid, name, agegroup, estimate, summary_est, percent)%>%
        
        mutate(
            #beginning age for the age interval
            age=as.numeric(sapply(strsplit(agegroup,"_"), `[`, 2)), 
            age=floor(age/5)*5
        )%>%
        
        group_by(geoid, name, age)%>%
        summarize(percent = sum(percent))%>%
        
        mutate(
            agegroup=paste0(age, "-", age+4),
            county = name, 
            county = gsub(", Maryland", "", county), 
            county = gsub(" County", "", county), 
            county = sub("^.*?, ", "", county))

    md_tr_pop25over <- md_tr_agelong %>%
        filter(age>=25)%>%
        group_by(geoid)%>%
        summarize_at(vars(percent), funs(sum(.)))%>%
        rename(percent25over = percent)
    
    md_tr_pop25over <-left_join(md_tr_totalpop, md_tr_pop25over, 
                                by=c("geoid"))%>%
        mutate(pop25over=round(totalpop*percent25over/100, 0))%>%
        select(geoid, pop25over)

0.1.5 Pop by education and econ (ACS)

    ses_vars <- c(
        NH_Black = "B03002_004",
        NH_AIAN = "B03002_005"
    )

County-level

    dta <- get_acs(
        state = "MD", 
        geography = "county", 
        variables = "B19013_001", #Median household income
        year = 2020
    )
    
            colnames(dta)<-tolower(colnames(dta))
            
            md_co_hhmedincome <- dta %>%
                mutate(
                    hhmedincome = round(estimate/1000, 1) , #now converted to thousands
                    hhmedincomemoe = round(moe/1000, 1),
                    hhmedincomeround = round(estimate/1000, 0)) %>%
                select(geoid, starts_with("hhmedincome"))
    
    #EDUCATION 
    #https://api.census.gov/data/2019/acs/acs1/groups/B15003.html
    dta <- get_acs(
        state = "MD", 
        geography = "county", 
        variables = c("B15003_025E", "B15003_024E", "B15003_023E", "B15003_022E", 
                       "B15003_021E",  "B15003_020E",  "B15003_019E"), 
        year = 2020
    )
    
            colnames(dta)<-tolower(colnames(dta))
            
            md_co_edu <- dta %>%
                select(-moe, -name)%>%
                arrange(geoid, variable)%>%
                spread(key = "variable", 
                       value = "estimate", convert = TRUE)%>%
                
                rename(
                    edu_doc = "B15003_025", #Doctorate degree","EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND
                    edu_prof = "B15003_024", #Professional school degree","EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"],
                    edu_master = "B15003_023", #Master's degree","EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"],
                    edu_bachelor = "B15003_022", #Bachelor's degree",        
                    edu_somelessthan1 = "B15003_019",   #Some college, less than 1 year 
                    edu_somemorethan1 = "B15003_020",   #Some college, 1 or more years, no degree   
                    edu_someassoc = "B15003_021"    #Associate's degree

                )%>%
                mutate(
                    edu_profdoc  = edu_doc + edu_prof,  
                    edu_somecollege = edu_somelessthan1 + edu_somemorethan1 + edu_someassoc
                )
            
            md_co_edu<-left_join(md_co_pop25over, md_co_edu, by=c("geoid"))%>%    
            select(-ends_with(".x"), -ends_with(".y"))%>%
        
            mutate(
                pctedu_profdoc = round(100 * edu_profdoc / pop25over, 0),   
                pctedu_master = round(100 * edu_master  / pop25over, 0), 
                pctedu_bachelor = round(100 * edu_bachelor  / pop25over, 0),  
                pctedu_somecollege = round(100 * edu_somecollege  / pop25over, 0),  
                pctedu_lesscollege = 100 - (pctedu_profdoc + pctedu_master + pctedu_bachelor + pctedu_somecollege), 
                
                pctedu_mastermore = pctedu_master + pctedu_profdoc, 
                pctedu_bachelormore = pctedu_bachelor + pctedu_master + pctedu_profdoc 
            )

Tract-level

dta <- get_acs(
    state = "MD", 
    geography = "tract", 
    variables = "B19013_001", #Median household income
    year = 2020
)

    colnames(dta)<-tolower(colnames(dta))
    
    md_tr_hhmedincome <- dta %>%
        mutate(
            hhmedincome = round(estimate/1000, 1) , #now converted to thousands
            hhmedincomemoe = round(moe/1000, 1),
            hhmedincomeround = round(estimate/1000, 0)) %>%
        select(geoid, starts_with("hhmedincome")) 

dta <- get_acs(
    state = "MD", 
    geography = "tract", 
    variables = c("B15003_025E", "B15003_024E", "B15003_023E", "B15003_022E", 
                       "B15003_021E",  "B15003_020E",  "B15003_019E"), 
    year = 2020
)

        colnames(dta)<-tolower(colnames(dta))
        
        md_tr_edu <- dta %>%
            select(-moe, -name)%>%
            arrange(geoid, variable)%>%
            spread(key = "variable", 
                   value = "estimate", convert = TRUE)%>%
        
                rename(
                    edu_doc = "B15003_025", #Doctorate degree","EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND
                    edu_prof = "B15003_024", #Professional school degree","EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"],
                    edu_master = "B15003_023", #Master's degree","EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"],
                    edu_bachelor = "B15003_022", #Bachelor's degree",        
                    edu_somelessthan1 = "B15003_019",   #Some college, less than 1 year 
                    edu_somemorethan1 = "B15003_020",   #Some college, 1 or more years, no degree   
                    edu_someassoc = "B15003_021"    #Associate's degree

                )%>%
                mutate(
                    edu_profdoc  = edu_doc + edu_prof,  
                    edu_somecollege = edu_somelessthan1 + edu_somemorethan1 + edu_someassoc
                )
    
      md_tr_edu<-left_join(md_tr_pop25over, md_tr_edu, by=c("geoid"))%>%    
          select(-ends_with(".x"), -ends_with(".y"))%>%
          
          
          mutate(
              pctedu_profdoc = round(100 * edu_profdoc / pop25over, 0),   
              pctedu_master = round(100 * edu_master  / pop25over, 0), 
              pctedu_bachelor = round(100 * edu_bachelor  / pop25over, 0),  
              pctedu_somecollege = round(100 * edu_somecollege  / pop25over, 0),  
              pctedu_lesscollege = 100 - (pctedu_profdoc + pctedu_master + pctedu_bachelor + pctedu_somecollege), 
              
              pctedu_mastermore = pctedu_master + pctedu_profdoc, 
              pctedu_bachelormore = pctedu_bachelor + pctedu_master + pctedu_profdoc 
            )

[“B06001_001E”,“Estimate!!Total:”,“PLACE OF BIRTH BY AGE IN THE UNITED STATES”], [“B06007_001E”,“Estimate!!Total:”,“PLACE OF BIRTH BY LANGUAGE SPOKEN AT HOME AND ABILITY TO SPEAK ENGLISH IN THE UNITED STATES”],

0.2 Bring in previous census population data

#https://api.census.gov/data/2010/dec/sf1/variables.html
dta2010 <- get_decennial(
        geography = "county",
        variables = "P001001", #Total pop in 2010. Note different var name.   
        year = 2010,  
        state = "MD", 
        geometry = FALSE)%>%
    mutate(geoid = GEOID, totalpop2010 = value)%>%
    select(geoid, starts_with("totalpop"))

#https://api.census.gov/data/2000/dec/sf1/variables.html
dta2000 <- get_decennial(
        geography = "county",
        variables = "P001001", #Total pop in 2000. Note different var name.   
        year = 2000, 
        state = "MD", 
        geometry = FALSE)%>%
    mutate(geoid = GEOID, totalpop2000 = value)%>%
    select(geoid, starts_with("totalpop"))
    
md_co_totalpop <- left_join(md_co_totalpop, dta2010, by=c("geoid"))
md_co_totalpop <- left_join(md_co_totalpop, dta2000, by=c("geoid"))

0.2 Assess and merge

dfcolist <-list(
              md_co_totalpop, 
              #md_co_pop25over, 
              md_co_medage,
              md_co_hhmedincome, 
              md_co_edu
              ) 

for(df in dfcolist) {
    print(class(df))
    print(dim(df))
    print(colnames(df))
}

[1] “tbl_df” “tbl” “data.frame” [1] 24 6 [1] “geoid” “name” “county” “totalpop” “totalpop2010” [6] “totalpop2000” [1] “tbl_df” “tbl” “data.frame” [1] 24 4 [1] “geoid” “medage” “medage_moe” “medageround” [1] “tbl_df” “tbl” “data.frame” [1] 24 4 [1] “geoid” “hhmedincome” “hhmedincomemoe” “hhmedincomeround” [1] “tbl_df” “tbl” “data.frame” [1] 24 18 [1] “geoid” “pop25over” “edu_somelessthan1”
[4] “edu_somemorethan1” “edu_someassoc” “edu_bachelor”
[7] “edu_master” “edu_prof” “edu_doc”
[10] “edu_profdoc” “edu_somecollege” “pctedu_profdoc”
[13] “pctedu_master” “pctedu_bachelor” “pctedu_somecollege” [16] “pctedu_lesscollege” “pctedu_mastermore” “pctedu_bachelormore”

md_co <- left_join(md_co_totalpop, md_co_medage, by=c("geoid"))
md_co <- left_join(md_co, md_co_hhmedincome, by=c("geoid"))
md_co <- left_join(md_co, md_co_edu, by=c("geoid"))

md_co <- md_co %>% mutate(geoid = as.character(geoid))

dim(md_co)

[1] 24 29

colnames(md_co)

[1] “geoid” “name” “county”
[4] “totalpop” “totalpop2010” “totalpop2000”
[7] “medage” “medage_moe” “medageround”
[10] “hhmedincome” “hhmedincomemoe” “hhmedincomeround”
[13] “pop25over” “edu_somelessthan1” “edu_somemorethan1”
[16] “edu_someassoc” “edu_bachelor” “edu_master”
[19] “edu_prof” “edu_doc” “edu_profdoc”
[22] “edu_somecollege” “pctedu_profdoc” “pctedu_master”
[25] “pctedu_bachelor” “pctedu_somecollege” “pctedu_lesscollege” [28] “pctedu_mastermore” “pctedu_bachelormore”

dftrlist <-list(
              md_tr_totalpop,  
              #md_tr_pop25over, 
              md_tr_medage, 
              md_tr_hhmedincome,
              md_tr_edu
              )

for(df in dftrlist) {
    print(class(df))
    print(dim(df))
    print(colnames(df))
}

[1] “tbl_df” “tbl” “data.frame” [1] 1475 4 [1] “geoid” “name” “county” “totalpop” [1] “tbl_df” “tbl” “data.frame” [1] 1475 4 [1] “geoid” “medage” “medage_moe” “medageround” [1] “tbl_df” “tbl” “data.frame” [1] 1475 4 [1] “geoid” “hhmedincome” “hhmedincomemoe” “hhmedincomeround” [1] “tbl_df” “tbl” “data.frame” [1] 1475 18 [1] “geoid” “pop25over” “edu_somelessthan1”
[4] “edu_somemorethan1” “edu_someassoc” “edu_bachelor”
[7] “edu_master” “edu_prof” “edu_doc”
[10] “edu_profdoc” “edu_somecollege” “pctedu_profdoc”
[13] “pctedu_master” “pctedu_bachelor” “pctedu_somecollege” [16] “pctedu_lesscollege” “pctedu_mastermore” “pctedu_bachelormore”

md_tr <- left_join(md_tr_totalpop, md_tr_medage, by=c("geoid"))
md_tr <- left_join(md_tr, md_tr_hhmedincome, by=c("geoid"))
md_tr <- left_join(md_tr, md_tr_edu, by=c("geoid"))

md_tr <- md_tr %>% mutate(geoid = as.character(geoid))

dim(md_tr)

[1] 1475 27

colnames(md_tr)

[1] “geoid” “name” “county”
[4] “totalpop” “medage” “medage_moe”
[7] “medageround” “hhmedincome” “hhmedincomemoe”
[10] “hhmedincomeround” “pop25over” “edu_somelessthan1”
[13] “edu_somemorethan1” “edu_someassoc” “edu_bachelor”
[16] “edu_master” “edu_prof” “edu_doc”
[19] “edu_profdoc” “edu_somecollege” “pctedu_profdoc”
[22] “pctedu_master” “pctedu_bachelor” “pctedu_somecollege” [25] “pctedu_lesscollege” “pctedu_mastermore” “pctedu_bachelormore”

colnames(md_co_racelong)

[1] “geoid” “county” “race” “size” “percent” “group”

colnames(md_co_agelong)

[1] “geoid” “name” “age” “percent” “agegroup” “county”

colnames(md_tr_agelong)

[1] “geoid” “name” “age” “percent” “agegroup” “county”

0.3 Export to csv

write.csv (md_co, "Shiny_OurCommunities_MD/md_co.csv", na='.')
write.csv (md_co_racelong, "Shiny_OurCommunities_MD/md_co_racelong.csv", na='.')
write.csv (md_co_agelong, "Shiny_OurCommunities_MD/md_co_agelong.csv", na='.')

write.csv (md_tr, "Shiny_OurCommunities_MD/md_tr.csv", na='.')
write.csv (md_tr_agelong, "Shiny_OurCommunities_MD/md_tr_agelong.csv", na='.')