Tidying US Census dataset

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(usmap)

## Warning: package 'usmap' was built under R version 4.4.3

library(dplyr)

Tidying dataset

census_data <- read.csv("https://raw.githubusercontent.com/farhodibr/CUNY-SPS-MSDS/refs/heads/main/DATA607/PROJECT2/US_Census_Data.csv")

census_data <- census_data |>
  select(-Fact.Note)
glimpse(census_data)

## Rows: 85
## Columns: 51
## $ Fact           <chr> "Population estimates, July 1, 2016,  (V2016)", "Popula…
## $ Alabama        <chr> "4,863,300", "4,780,131", "1.70%", "4,779,736", "6.00%"…
## $ Alaska         <chr> "741,894", "710,249", "4.50%", "710,231", "7.30%", "7.6…
## $ Arizona        <chr> "6,931,071", "6,392,301", "8.40%", "6,392,017", "6.30%"…
## $ Arkansas       <chr> "2,988,248", "2,916,025", "2.50%", "2,915,918", "6.40%"…
## $ California     <chr> "39,250,017", "37,254,522", "5.40%", "37,253,956", "6.3…
## $ Colorado       <chr> "5,540,545", "5,029,324", "10.20%", "5,029,196", "6.10%…
## $ Connecticut    <chr> "3,576,452", "3,574,114", "0.10%", "3,574,097", "5.20%"…
## $ Delaware       <chr> "952,065", "897,936", "6.00%", "897,934", "5.80%", "6.2…
## $ Florida        <chr> "20,612,439", "18,804,592", "9.60%", "18,801,310", "5.5…
## $ Georgia        <chr> "10,310,371", "9,688,680", "6.40%", "9,687,653", "6.40%…
## $ Hawaii         <chr> "1,428,557", "1,360,301", "5.00%", "1,360,301", "6.40%"…
## $ Idaho          <chr> "1,683,140", "1,567,650", "7.40%", "1,567,582", "6.80%"…
## $ Illinois       <chr> "12,801,539", "12,831,574", "-0.20%", "12,830,632", "6.…
## $ Indiana        <chr> "6,633,053", "6,484,136", "2.30%", "6,483,802", "6.40%"…
## $ Iowa           <chr> "3,134,693", "3,046,869", "2.90%", "3,046,355", "6.40%"…
## $ Kansas         <chr> "2,907,289", "2,853,129", "1.90%", "2,853,118", "6.70%"…
## $ Kentucky       <chr> "4,436,974", "4,339,344", "2.20%", "4,339,367", "6.20%"…
## $ Louisiana      <chr> "4,681,666", "4,533,479", "3.30%", "4,533,372", "6.60%"…
## $ Maine          <chr> "1,331,479", "1,328,364", "0.20%", "1,328,361", "4.90%"…
## $ Maryland       <chr> "6,016,447", "5,773,786", "4.20%", "5,773,552", "6.10%"…
## $ Massachusetts  <chr> "6,811,779", "6,547,813", "4.00%", "6,547,629", "5.30%"…
## $ Michigan       <chr> "9,928,300", "9,884,129", "0.40%", "9,883,640", "5.80%"…
## $ Minnesota      <chr> "5,519,952", "5,303,924", "4.10%", "5,303,925", "6.40%"…
## $ Mississippi    <chr> "2,988,726", "2,968,103", "0.70%", "2,967,297", "6.30%"…
## $ Missouri       <chr> "6,093,000", "5,988,928", "1.70%", "5,988,927", "6.10%"…
## $ Montana        <chr> "1,042,520", "989,414", "5.40%", "989,415", "6.00%", "6…
## $ Nebraska       <chr> "1,907,116", "1,826,334", "4.40%", "1,826,341", "7.00%"…
## $ Nevada         <chr> "2,940,058", "2,700,691", "8.90%", "2,700,551", "6.30%"…
## $ New.Hampshire  <chr> "1,334,795", "1,316,461", "1.40%", "1,316,470", "4.80%"…
## $ New.Jersey     <chr> "8,944,469", "8,791,953", "1.70%", "8,791,894", "5.80%"…
## $ New.Mexico     <chr> "2081015", "2059198", "0.011", "2059179", "0.062", "0.0…
## $ New.York       <chr> "19745289", "19378110", "0.019", "19378102", "0.059", "…
## $ North.Carolina <chr> "10146788", "9535688", "0.064", "9535483", "0.06", "0.0…
## $ North.Dakota   <chr> "757952", "672591", "0.127", "672591", "0.073", "0.066"…
## $ Ohio           <chr> "11614373", "11536727", "0.007", "11536504", "0.06", "0…
## $ Oklahoma       <chr> "3923561", "3751615", "0.046", "3751351", "0.068", "0.0…
## $ Oregon         <chr> "4093465", "3831072", "0.068", "3831074", "0.058", "0.0…
## $ Pennsylvania   <chr> "12784227", "12702857", "0.006", "12702379", "0.056", "…
## $ Rhode.Island   <chr> "1056426", "1052940", "0.003", "1052567", "0.052", "0.0…
## $ South.Carolina <chr> "4961119", "4625410", "0.073", "4625364", "0.059", "0.0…
## $ South.Dakota   <chr> "865454", "814195", "0.063", "814180", "0.071", "0.073"…
## $ Tennessee      <chr> "6651194", "6346298", "0.048", "6346105", "0.061", "0.0…
## $ Texas          <chr> "27,862,596", "25,146,100", "10.80%", "25,145,561", "7.…
## $ Utah           <chr> "3,051,217", "2,763,888", "10.40%", "2,763,885", "8.30%…
## $ Vermont        <chr> "624,594", "625,741", "-0.20%", "625,741", "4.90%", "5.…
## $ Virginia       <chr> "8,411,808", "8,001,041", "5.10%", "8,001,024", "6.10%"…
## $ Washington     <chr> "7,288,000", "6,724,545", "8.40%", "6,724,540", "6.20%"…
## $ West.Virginia  <chr> "1,831,102", "1,853,011", "-1.20%", "1,852,994", "5.50%…
## $ Wisconsin      <chr> "5,778,708", "5,687,289", "1.60%", "5,686,986", "5.80%"…
## $ Wyoming        <chr> "585,501", "563,767", "3.90%", "563,626", "6.50%", "7.1…

Why do I think this dataset is not tidy:

Multiple variables in one column:
“Fact” column contains several descriptions of the data, which needs to be separate to be tidy
States supposed to be rows (observations), not columns.
Each row represents multiple observations. In tidy dataset each row represents single observation, for example a specific state’s demographic data for a single year.

Here I create separate tidy data table for populations of each state in years 2010 and 2016

state_names <- colnames(census_data)[3:ncol(census_data)]
#state_names

population_data <- census_data |>
  filter(Fact %in% c("Population estimates, July 1, 2016,  (V2016)", 
                     "Population, Census, April 1, 2010"))

long_population_data <- population_data |>
  pivot_longer(
    cols = -Fact,
    names_to = "state",
    values_to = "population"
  ) |>
  mutate(year = case_when(
    grepl("2016", Fact) ~2016,
    grepl("2010", Fact) ~ 2010,
    TRUE ~ NA_integer_
  )) |>
  select(-Fact) |>
  pivot_wider(
    names_from = year,
    values_from = population
   ) |>
  # |>
  # mutate(
  #   `2010` = as.numeric(`2010`),
  #   `2016` = as.numeric(`2016`)
  # )
  select(state, "2010", "2016")

long_population_data$`2010` <- gsub(",", "", long_population_data$`2010`)
long_population_data$`2010` <-  as.numeric(long_population_data$`2010`)

long_population_data$`2016` <- gsub(",", "", long_population_data$`2016`)
long_population_data$`2016` <-  as.numeric(long_population_data$`2016`)
print(long_population_data)

## # A tibble: 50 × 3
##    state         `2010`   `2016`
##    <chr>          <dbl>    <dbl>
##  1 Alabama      4779736  4863300
##  2 Alaska        710231   741894
##  3 Arizona      6392017  6931071
##  4 Arkansas     2915918  2988248
##  5 California  37253956 39250017
##  6 Colorado     5029196  5540545
##  7 Connecticut  3574097  3576452
##  8 Delaware      897934   952065
##  9 Florida     18801310 20612439
## 10 Georgia      9687653 10310371
## # ℹ 40 more rows

This long_population_datadata table is tidy and ready for analysis.

In this code I created create_long_tablefunction which makes it easier to create different tidy data tables:

create_long_table <- function(column_name, rows) {
  result <- census_data |>
  slice(rows) |>
  pivot_longer(
    cols = !Fact,
    names_to = "state",
    values_to = column_name
  ) |>
   mutate(year = case_when(
    grepl("2016", Fact) ~2016,
    grepl("2010", Fact) ~ 2010,
    TRUE ~ NA_integer_
  )) |>
  select(-Fact) |>
  select(state, year, all_of(column_name)) |>
  mutate(
    !!column_name := round(as.numeric(gsub("%", "", !!sym(column_name))), 2)
    )
  
  return(result)
}

Here I create gender_table_longtidy data table which includes female population proportions for each state in years 2010 and 2016. I use create_long_tableto create this table. Also I did data transformation because some states had proportions in percents, and some decimal values as actual proportions.

gender_table_long <- create_long_table("female_prop", 11:12) 
gender_table_long <- gender_table_long |>
  mutate(
    female_prop = if_else(
                          female_prop < 1,
                          round(female_prop * 100, 2),
                          round(female_prop, 2)
  ))
print(gender_table_long)

## # A tibble: 100 × 3
##    state        year female_prop
##    <chr>       <dbl>       <dbl>
##  1 Alabama      2016        51.6
##  2 Alaska       2016        47.7
##  3 Arizona      2016        50.3
##  4 Arkansas     2016        50.9
##  5 California   2016        50.3
##  6 Colorado     2016        49.7
##  7 Connecticut  2016        51.2
##  8 Delaware     2016        51.6
##  9 Florida      2016        51.1
## 10 Georgia      2016        51.3
## # ℹ 90 more rows

This code creates new observations in gender_table_long for male population for each state which is better for analysis.

gender_table_long|>
  mutate(
    male_prop = 100 - female_prop
  ) |>
  pivot_longer(
    cols = contains("prop"),
    names_to = "gender",
    values_to = "value"
  ) |>
  pivot_wider(
    names_from = year,
    values_from = value,
    names_prefix = "X"
  ) |>
  select(state, gender, X2010, X2016) |>
  mutate(
    prop_change = X2016 - X2010
  )

## # A tibble: 100 × 5
##    state      gender      X2010 X2016 prop_change
##    <chr>      <chr>       <dbl> <dbl>       <dbl>
##  1 Alabama    female_prop  51.5  51.6       0.100
##  2 Alabama    male_prop    48.5  48.4      -0.100
##  3 Alaska     female_prop  48    47.7      -0.300
##  4 Alaska     male_prop    52    52.3       0.300
##  5 Arizona    female_prop  50.3  50.3       0    
##  6 Arizona    male_prop    49.7  49.7       0    
##  7 Arkansas   female_prop  50.9  50.9       0    
##  8 Arkansas   male_prop    49.1  49.1       0    
##  9 California female_prop  50.3  50.3       0    
## 10 California male_prop    49.7  49.7       0    
## # ℹ 90 more rows

head(gender_table_long)

## # A tibble: 6 × 3
##   state       year female_prop
##   <chr>      <dbl>       <dbl>
## 1 Alabama     2016        51.6
## 2 Alaska      2016        47.7
## 3 Arizona     2016        50.3
## 4 Arkansas    2016        50.9
## 5 California  2016        50.3
## 6 Colorado    2016        49.7

From here I created few data tables for different age ranges:

under_5_proportions_long <- create_long_table("prop_under_5", 5:6) |>
  group_by(year, state)
print(under_5_proportions_long)

## # A tibble: 100 × 3
## # Groups:   year, state [100]
##    state        year prop_under_5
##    <chr>       <dbl>        <dbl>
##  1 Alabama      2016          6  
##  2 Alaska       2016          7.3
##  3 Arizona      2016          6.3
##  4 Arkansas     2016          6.4
##  5 California   2016          6.3
##  6 Colorado     2016          6.1
##  7 Connecticut  2016          5.2
##  8 Delaware     2016          5.8
##  9 Florida      2016          5.5
## 10 Georgia      2016          6.4
## # ℹ 90 more rows

under_18_proportions_long <- create_long_table("prop_under_18", 7:8)
print(under_18_proportions_long)

## # A tibble: 100 × 3
##    state        year prop_under_18
##    <chr>       <dbl>         <dbl>
##  1 Alabama      2016          22.6
##  2 Alaska       2016          25.2
##  3 Arizona      2016          23.5
##  4 Arkansas     2016          23.6
##  5 California   2016          23.2
##  6 Colorado     2016          22.8
##  7 Connecticut  2016          21.1
##  8 Delaware     2016          21.5
##  9 Florida      2016          20.1
## 10 Georgia      2016          24.4
## # ℹ 90 more rows

over_65_proportions_long <- create_long_table("over_65", 9:10)
print(over_65_proportions_long)

## # A tibble: 100 × 3
##    state        year over_65
##    <chr>       <dbl>   <dbl>
##  1 Alabama      2016    16.1
##  2 Alaska       2016    10.4
##  3 Arizona      2016    16.9
##  4 Arkansas     2016    16.3
##  5 California   2016    13.6
##  6 Colorado     2016    13.4
##  7 Connecticut  2016    16.1
##  8 Delaware     2016    17.5
##  9 Florida      2016    19.9
## 10 Georgia      2016    13.1
## # ℹ 90 more rows

population_proportions <- under_5_proportions_long |>
  left_join(under_18_proportions_long, by = c("state", "year")) |>
  left_join(over_65_proportions_long, c("state", "year")) |>
  mutate(
    prop_18_to_65 = 100 - prop_under_18 - over_65
  )

This code creates data table for population proportions in 18-65 ages range for each state

population_18_65_long <- population_proportions |>
  select(state, year, prop_18_to_65)
print(population_18_65_long)

## # A tibble: 100 × 3
## # Groups:   year, state [100]
##    state        year prop_18_to_65
##    <chr>       <dbl>         <dbl>
##  1 Alabama      2016          61.3
##  2 Alaska       2016          64.4
##  3 Arizona      2016          59.6
##  4 Arkansas     2016          60.1
##  5 California   2016          63.2
##  6 Colorado     2016          63.8
##  7 Connecticut  2016          62.8
##  8 Delaware     2016          61  
##  9 Florida      2016          60  
## 10 Georgia      2016          62.5
## # ℹ 90 more rows

Analysis

This plot shows analysis for each state’s population in 2010.

library(usmap)
long_population_data$state <- gsub("\\.", " ", long_population_data$state) 

names(long_population_data)[names(long_population_data) == "2010"] <- "pop_2010"
names(long_population_data)[names(long_population_data) == "2016"] <- "pop_2016"

data_for_map <- long_population_data |>
  left_join(usmap::statepop, by = c("state" = "full"))

mismatches <- setdiff(long_population_data$state, usmap::statepop$full)
print(mismatches)

## character(0)

# long_population_data$state <- trimws(long_population_data$state)
# usmap::statepop$full <- trimws(usmap::statepop$full)
# head(data_for_map)
# plot_usmap(data_for_map, values = "pop_2010", color = "blue") +
#   scale_fill_continuous(name = "Population 2010", 
#                         low = "white",
#                         high = "blue") +
#   theme(legend.position = "right") +
#   labs(title = "US State Population in 2010")

plot_usmap(data = long_population_data, values = "pop_2010", color = "blue") +
  scale_fill_continuous(name = "Population 2010", 
                        low = "white", 
                        high = "blue",
                        labels = scales::comma) +
  theme(legend.position = "right") +
  labs(title = "US State Population in 2010")

As we see on the plot most populated states in 2010:

California
Texas
Florida
New York

This plot shows how population changed in states in period of 2010 - 2016

long_population_data <- long_population_data |>
  mutate(
    pop_change = pop_2016 - pop_2010
  )

plot_usmap(data=long_population_data, values = "pop_change", color = "gray") +
  scale_fill_gradient2(name = "Population change 2010 - 2016",
                        low = "red", mid = "blue", high = "green",
                        midpoint = 0,
                        labels = scales::comma
                        ) +
  theme(legend.position = "right") +
  labs(title = "Population Chanhe in US State Population 2010 - 2016")

From this plot we see states which had most increases in population:

Texas
California
Florida

Tidying US Census dataset

Farhod Ibragimov

2025-03-04

Tidying dataset

Analysis