############ Colorado Census by Age Data #####################
# Upload packages --------------------------------------------------------------
pacman:: p_load(
rio,
here,
skimr,
janitor,
epikit,
tidyverse,
ggplot2,
gtsummary,
dplyr,
stringr
)
#Import data -------------------------------------------------------------------
census_raw <- import ("C:/users/lockwora/Downloads/Colorado County Estimates by Age Group .xlsx")
exists("census_raw")
## [1] TRUE
#Look at the data---------------------------------------------------------------
dim(census_raw) #Rows and columns
## [1] 453115 9
head(census_raw, 10) #first few rows
## id countyfips year age county malepopulation femalepopulation
## 1 1 0 1990 0 Colorado 26850 25695
## 2 2 0 1990 1 Colorado 25924 25081
## 3 3 0 1990 2 Colorado 25942 24659
## 4 4 0 1990 3 Colorado 26255 25106
## 5 5 0 1990 4 Colorado 26654 25556
## 6 6 0 1990 5 Colorado 26079 24852
## 7 7 0 1990 6 Colorado 26259 24777
## 8 8 0 1990 7 Colorado 26468 25066
## 9 9 0 1990 8 Colorado 24822 23667
## 10 10 0 1990 9 Colorado 26293 24816
## totalpopulation datatype
## 1 52545 Estimate
## 2 51005 Estimate
## 3 50601 Estimate
## 4 51361 Estimate
## 5 52210 Estimate
## 6 50931 Estimate
## 7 51036 Estimate
## 8 51534 Estimate
## 9 48489 Estimate
## 10 51109 Estimate
names(census_raw) #column names
## [1] "id" "countyfips" "year" "age"
## [5] "county" "malepopulation" "femalepopulation" "totalpopulation"
## [9] "datatype"
str(census_raw) #variable structure
## 'data.frame': 453115 obs. of 9 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ countyfips : num 0 0 0 0 0 0 0 0 0 0 ...
## $ year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ age : num 0 1 2 3 4 5 6 7 8 9 ...
## $ county : chr "Colorado" "Colorado" "Colorado" "Colorado" ...
## $ malepopulation : num 26850 25924 25942 26255 26654 ...
## $ femalepopulation: num 25695 25081 24659 25106 25556 ...
## $ totalpopulation : num 52545 51005 50601 51361 52210 ...
## $ datatype : chr "Estimate" "Estimate" "Estimate" "Estimate" ...
dplyr::glimpse(census_raw)
## Rows: 453,115
## Columns: 9
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ countyfips <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ year <dbl> 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990,…
## $ age <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
## $ county <chr> "Colorado", "Colorado", "Colorado", "Colorado", "Colo…
## $ malepopulation <dbl> 26850, 25924, 25942, 26255, 26654, 26079, 26259, 2646…
## $ femalepopulation <dbl> 25695, 25081, 24659, 25106, 25556, 24852, 24777, 2506…
## $ totalpopulation <dbl> 52545, 51005, 50601, 51361, 52210, 50931, 51036, 5153…
## $ datatype <chr> "Estimate", "Estimate", "Estimate", "Estimate", "Esti…
#Lots of Cleaning to do #####################################################
#Need to: filter by years > make age categories > counts in those age groups not by year,
#then determine how to compare across counties, make new regions?--
census_clean <- census_raw %>%
filter(year %in% 2020:2024) %>% #Only 2020-2024 data---
mutate(
county_std = str_squish(county), # standardize 'county' text and flag statewide
is_statewide = countyfips == 0 | str_to_upper(county_std) == "COLORADO",
age_cat = cut(# make age groups; right = FALSE gives [lower, upper)
# so [0,18) = 0-17, [18,25) = 18-24, etc.
age,
breaks = c(0, 18, 25, 35, 45, 55, 65, 75, 85, Inf),
labels = c("0-17","18-24","25-34","35-44","45-54","55-64","65-74","75-84","85+"),
right = FALSE,
include.lowest = TRUE
)
) %>%
mutate(
age_cat = factor(age_cat,
levels = c("0-17","18-24","25-34","35-44","45-54","55-64","65-74","75-84","85+"),
ordered = TRUE)
)
census_clean <- census_clean %>%
group_by(is_statewide, year, age_cat) %>%
mutate(age_cat_totals = sum(totalpopulation, na.rm = TRUE)) %>%
ungroup()
census_clean <- census_clean %>%
group_by(county_std, year, age_cat) %>%
mutate(age_cat_totals = sum(totalpopulation, na.rm = TRUE)) %>%
ungroup()
#Check your work and take a look see--------------------------------------------
dim(census_clean) #Rows and columns
## [1] 32825 13
head(census_clean, 10) #first few rows
## # A tibble: 10 × 13
## id countyfips year age county malepopulation femalepopulation
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 2831 0 2020 0 Colorado 32318 30890
## 2 2832 0 2020 1 Colorado 32060 30677
## 3 2833 0 2020 2 Colorado 32852 31454
## 4 2834 0 2020 3 Colorado 33597 32190
## 5 2835 0 2020 4 Colorado 34581 33156
## 6 2836 0 2020 5 Colorado 34621 33409
## 7 2837 0 2020 6 Colorado 34960 33568
## 8 2838 0 2020 7 Colorado 35158 33664
## 9 2839 0 2020 8 Colorado 35628 34108
## 10 2840 0 2020 9 Colorado 36824 34901
## # ℹ 6 more variables: totalpopulation <dbl>, datatype <chr>, county_std <chr>,
## # is_statewide <lgl>, age_cat <ord>, age_cat_totals <dbl>
names(census_clean) #column names
## [1] "id" "countyfips" "year" "age"
## [5] "county" "malepopulation" "femalepopulation" "totalpopulation"
## [9] "datatype" "county_std" "is_statewide" "age_cat"
## [13] "age_cat_totals"
str(census_clean) #variable structure
## tibble [32,825 × 13] (S3: tbl_df/tbl/data.frame)
## $ id : num [1:32825] 2831 2832 2833 2834 2835 ...
## $ countyfips : num [1:32825] 0 0 0 0 0 0 0 0 0 0 ...
## $ year : num [1:32825] 2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
## $ age : num [1:32825] 0 1 2 3 4 5 6 7 8 9 ...
## $ county : chr [1:32825] "Colorado" "Colorado" "Colorado" "Colorado" ...
## $ malepopulation : num [1:32825] 32318 32060 32852 33597 34581 ...
## $ femalepopulation: num [1:32825] 30890 30677 31454 32190 33156 ...
## $ totalpopulation : num [1:32825] 63208 62738 64307 65787 67737 ...
## $ datatype : chr [1:32825] "Estimate" "Estimate" "Estimate" "Estimate" ...
## $ county_std : chr [1:32825] "Colorado" "Colorado" "Colorado" "Colorado" ...
## $ is_statewide : logi [1:32825] TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ age_cat : Ord.factor w/ 9 levels "0-17"<"18-24"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ age_cat_totals : num [1:32825] 1270943 1270943 1270943 1270943 1270943 ...
glimpse(census_clean)
## Rows: 32,825
## Columns: 13
## $ id <dbl> 2831, 2832, 2833, 2834, 2835, 2836, 2837, 2838, 2839,…
## $ countyfips <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ year <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,…
## $ age <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
## $ county <chr> "Colorado", "Colorado", "Colorado", "Colorado", "Colo…
## $ malepopulation <dbl> 32318, 32060, 32852, 33597, 34581, 34621, 34960, 3515…
## $ femalepopulation <dbl> 30890, 30677, 31454, 32190, 33156, 33409, 33568, 3366…
## $ totalpopulation <dbl> 63208, 62738, 64307, 65787, 67737, 68030, 68528, 6882…
## $ datatype <chr> "Estimate", "Estimate", "Estimate", "Estimate", "Esti…
## $ county_std <chr> "Colorado", "Colorado", "Colorado", "Colorado", "Colo…
## $ is_statewide <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,…
## $ age_cat <ord> 0-17, 0-17, 0-17, 0-17, 0-17, 0-17, 0-17, 0-17, 0-17,…
## $ age_cat_totals <dbl> 1270943, 1270943, 1270943, 1270943, 1270943, 1270943,…
#Need to aggregate one row per county x age cat x year -------------------------
county_age_year <- census_clean %>%
filter(!is_statewide) %>% # drop statewide; remove this line if including
group_by(county_std, year, age_cat) %>%
summarize(
total_pop = sum(totalpopulation, na.rm = TRUE),
.groups = "drop"
)
ty_age_year <- county_age_year %>%
mutate(
# Standardize county ID
county_id5 = county_std |>
str_to_upper() |> # upper case
str_replace_all("[^A-Z ]", "") |># remove non-letters
str_squish() |>
str_replace_all(" ", "") |> # remove spaces
str_sub(1, 5), # first 5
# Final ID = COUNTY5-AGECAT-YEAR
id_cay = paste0(county_id5, "-", age_cat, "-", year)
)