Example of Data Wrangling in R

############ Colorado Census by Age Data #####################

# Upload packages --------------------------------------------------------------
pacman:: p_load(
  rio,
  here,
  skimr,
  janitor,
  epikit,
  tidyverse,
  ggplot2,
  gtsummary, 
  dplyr, 
  stringr
)

#Import data -------------------------------------------------------------------
census_raw <- import ("C:/users/lockwora/Downloads/Colorado County Estimates by Age Group .xlsx")
exists("census_raw")

## [1] TRUE

#Look at the data---------------------------------------------------------------
dim(census_raw)         #Rows and columns

## [1] 453115      9

head(census_raw, 10)    #first few rows

##    id countyfips year age   county malepopulation femalepopulation
## 1   1          0 1990   0 Colorado          26850            25695
## 2   2          0 1990   1 Colorado          25924            25081
## 3   3          0 1990   2 Colorado          25942            24659
## 4   4          0 1990   3 Colorado          26255            25106
## 5   5          0 1990   4 Colorado          26654            25556
## 6   6          0 1990   5 Colorado          26079            24852
## 7   7          0 1990   6 Colorado          26259            24777
## 8   8          0 1990   7 Colorado          26468            25066
## 9   9          0 1990   8 Colorado          24822            23667
## 10 10          0 1990   9 Colorado          26293            24816
##    totalpopulation datatype
## 1            52545 Estimate
## 2            51005 Estimate
## 3            50601 Estimate
## 4            51361 Estimate
## 5            52210 Estimate
## 6            50931 Estimate
## 7            51036 Estimate
## 8            51534 Estimate
## 9            48489 Estimate
## 10           51109 Estimate

names(census_raw)       #column names

## [1] "id"               "countyfips"       "year"             "age"             
## [5] "county"           "malepopulation"   "femalepopulation" "totalpopulation" 
## [9] "datatype"

str(census_raw)         #variable structure

## 'data.frame':    453115 obs. of  9 variables:
##  $ id              : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ countyfips      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ year            : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ age             : num  0 1 2 3 4 5 6 7 8 9 ...
##  $ county          : chr  "Colorado" "Colorado" "Colorado" "Colorado" ...
##  $ malepopulation  : num  26850 25924 25942 26255 26654 ...
##  $ femalepopulation: num  25695 25081 24659 25106 25556 ...
##  $ totalpopulation : num  52545 51005 50601 51361 52210 ...
##  $ datatype        : chr  "Estimate" "Estimate" "Estimate" "Estimate" ...

dplyr::glimpse(census_raw)

## Rows: 453,115
## Columns: 9
## $ id               <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ countyfips       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ year             <dbl> 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990,…
## $ age              <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
## $ county           <chr> "Colorado", "Colorado", "Colorado", "Colorado", "Colo…
## $ malepopulation   <dbl> 26850, 25924, 25942, 26255, 26654, 26079, 26259, 2646…
## $ femalepopulation <dbl> 25695, 25081, 24659, 25106, 25556, 24852, 24777, 2506…
## $ totalpopulation  <dbl> 52545, 51005, 50601, 51361, 52210, 50931, 51036, 5153…
## $ datatype         <chr> "Estimate", "Estimate", "Estimate", "Estimate", "Esti…

#Lots of Cleaning to do #####################################################
#Need to: filter by years > make age categories > counts in those age groups not by year, 
#then determine how to compare across counties, make new regions?--
census_clean <- census_raw %>%
  filter(year %in% 2020:2024) %>% #Only 2020-2024 data---
  mutate(
    county_std   = str_squish(county),  # standardize 'county' text and flag statewide
    is_statewide = countyfips == 0 | str_to_upper(county_std) == "COLORADO",
    
    age_cat = cut(# make age groups; right = FALSE gives [lower, upper)
    # so [0,18) = 0-17, [18,25) = 18-24, etc.
      age,
      breaks  = c(0, 18, 25, 35, 45, 55, 65, 75, 85, Inf),
      labels  = c("0-17","18-24","25-34","35-44","45-54","55-64","65-74","75-84","85+"),
      right   = FALSE,
      include.lowest = TRUE
    )
  ) %>%
  mutate(
    age_cat = factor(age_cat,
                     levels = c("0-17","18-24","25-34","35-44","45-54","55-64","65-74","75-84","85+"),
                     ordered = TRUE)
  )


census_clean <- census_clean %>%
  group_by(is_statewide, year, age_cat) %>%
  mutate(age_cat_totals = sum(totalpopulation, na.rm = TRUE)) %>%
  ungroup()


census_clean <- census_clean %>%
  group_by(county_std, year, age_cat) %>%
  mutate(age_cat_totals = sum(totalpopulation, na.rm = TRUE)) %>%
  ungroup()



#Check your work and take a look see--------------------------------------------
dim(census_clean)         #Rows and columns

## [1] 32825    13

head(census_clean, 10)    #first few rows

## # A tibble: 10 × 13
##       id countyfips  year   age county   malepopulation femalepopulation
##    <dbl>      <dbl> <dbl> <dbl> <chr>             <dbl>            <dbl>
##  1  2831          0  2020     0 Colorado          32318            30890
##  2  2832          0  2020     1 Colorado          32060            30677
##  3  2833          0  2020     2 Colorado          32852            31454
##  4  2834          0  2020     3 Colorado          33597            32190
##  5  2835          0  2020     4 Colorado          34581            33156
##  6  2836          0  2020     5 Colorado          34621            33409
##  7  2837          0  2020     6 Colorado          34960            33568
##  8  2838          0  2020     7 Colorado          35158            33664
##  9  2839          0  2020     8 Colorado          35628            34108
## 10  2840          0  2020     9 Colorado          36824            34901
## # ℹ 6 more variables: totalpopulation <dbl>, datatype <chr>, county_std <chr>,
## #   is_statewide <lgl>, age_cat <ord>, age_cat_totals <dbl>

names(census_clean)       #column names

##  [1] "id"               "countyfips"       "year"             "age"             
##  [5] "county"           "malepopulation"   "femalepopulation" "totalpopulation" 
##  [9] "datatype"         "county_std"       "is_statewide"     "age_cat"         
## [13] "age_cat_totals"

str(census_clean)         #variable structure

## tibble [32,825 × 13] (S3: tbl_df/tbl/data.frame)
##  $ id              : num [1:32825] 2831 2832 2833 2834 2835 ...
##  $ countyfips      : num [1:32825] 0 0 0 0 0 0 0 0 0 0 ...
##  $ year            : num [1:32825] 2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
##  $ age             : num [1:32825] 0 1 2 3 4 5 6 7 8 9 ...
##  $ county          : chr [1:32825] "Colorado" "Colorado" "Colorado" "Colorado" ...
##  $ malepopulation  : num [1:32825] 32318 32060 32852 33597 34581 ...
##  $ femalepopulation: num [1:32825] 30890 30677 31454 32190 33156 ...
##  $ totalpopulation : num [1:32825] 63208 62738 64307 65787 67737 ...
##  $ datatype        : chr [1:32825] "Estimate" "Estimate" "Estimate" "Estimate" ...
##  $ county_std      : chr [1:32825] "Colorado" "Colorado" "Colorado" "Colorado" ...
##  $ is_statewide    : logi [1:32825] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ age_cat         : Ord.factor w/ 9 levels "0-17"<"18-24"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ age_cat_totals  : num [1:32825] 1270943 1270943 1270943 1270943 1270943 ...

glimpse(census_clean)

## Rows: 32,825
## Columns: 13
## $ id               <dbl> 2831, 2832, 2833, 2834, 2835, 2836, 2837, 2838, 2839,…
## $ countyfips       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ year             <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,…
## $ age              <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
## $ county           <chr> "Colorado", "Colorado", "Colorado", "Colorado", "Colo…
## $ malepopulation   <dbl> 32318, 32060, 32852, 33597, 34581, 34621, 34960, 3515…
## $ femalepopulation <dbl> 30890, 30677, 31454, 32190, 33156, 33409, 33568, 3366…
## $ totalpopulation  <dbl> 63208, 62738, 64307, 65787, 67737, 68030, 68528, 6882…
## $ datatype         <chr> "Estimate", "Estimate", "Estimate", "Estimate", "Esti…
## $ county_std       <chr> "Colorado", "Colorado", "Colorado", "Colorado", "Colo…
## $ is_statewide     <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,…
## $ age_cat          <ord> 0-17, 0-17, 0-17, 0-17, 0-17, 0-17, 0-17, 0-17, 0-17,…
## $ age_cat_totals   <dbl> 1270943, 1270943, 1270943, 1270943, 1270943, 1270943,…

#Need to aggregate one row per county x age cat x year -------------------------
county_age_year <- census_clean %>%
  filter(!is_statewide) %>%  # drop statewide; remove this line if including
  group_by(county_std, year, age_cat) %>%
  summarize(
    total_pop = sum(totalpopulation, na.rm = TRUE),
    .groups = "drop"
  )

ty_age_year <- county_age_year %>%
  mutate(
    # Standardize county ID
    county_id5 = county_std |> 
      str_to_upper() |>                # upper case 
      str_replace_all("[^A-Z ]", "") |># remove non-letters 
      str_squish() |>
      str_replace_all(" ", "") |>      # remove spaces
      str_sub(1, 5),                   # first 5 
    
    # Final ID = COUNTY5-AGECAT-YEAR
    id_cay = paste0(county_id5, "-", age_cat, "-", year)
  )

Example of Data Wrangling in R_Rachel Lockwood

2026-05-14