Setup

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Get Data

ethnicity_region_0722 <- read_delim("Natality, 2007-2022.txt", delim = "\t", escape_double = FALSE, trim_ws = TRUE)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 15313 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (11): Notes, Census Region, Census Region Code, State, State Code, Age o...
## dbl  (3): Year, Year Code, Births
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Rename and Select

ethnicity_region_0722 = ethnicity_region_0722 %>% 
  rename(Region = `Census Region Code`,
         Ethnicity = `Mother's Hispanic Origin`,
         Age = `Age of Mother 9 Code`,
         Fpop = `Female Population`,
         Rate = `Fertility Rate` ) %>% 
  select(Ethnicity, Year, Region,State, Age, Fpop, Births, Rate)

glimpse(ethnicity_region_0722)

## Rows: 15,313
## Columns: 8
## $ Ethnicity <chr> "Hispanic or Latino", "Hispanic or Latino", "Hispanic or Lat…
## $ Year      <dbl> 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2016, 2020, …
## $ Region    <chr> "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1", "CENS…
## $ State     <chr> "Connecticut", "Connecticut", "Connecticut", "Connecticut", …
## $ Age       <chr> "15", "15", "15", "15", "15", "15", "15", "15", "15", "15", …
## $ Fpop      <chr> "Not Available", "Not Available", "Not Available", "Not Avai…
## $ Births    <dbl> 18, 11, 12, 14, 15, 14, 10, 11, 10, 12, 10, 15, 15, 1301, 13…
## $ Rate      <chr> "Not Available", "Not Available", "Not Available", "Not Avai…

Recode

ethnicity_region_0722 = ethnicity_region_0722 %>% 
  mutate(Region = ifelse(Region == "CENS-R1","NE",Region),
         Region = ifelse(Region == "CENS-R2","MW",Region),
         Region = ifelse(Region == "CENS-R3","SO",Region),
         Region = ifelse(Region == "CENS-R4","WE",Region),
         Fpop = as.numeric(Fpop),
         Rate = as.numeric(Rate)/1000) %>% 
drop_na()

## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Fpop = as.numeric(Fpop)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.

head(ethnicity_region_0722)

## # A tibble: 6 × 8
##   Ethnicity           Year Region State       Age    Fpop Births   Rate
##   <chr>              <dbl> <chr>  <chr>       <chr> <dbl>  <dbl>  <dbl>
## 1 Hispanic or Latino  2007 NE     Connecticut 15-19 19523   1301 0.0666
## 2 Hispanic or Latino  2008 NE     Connecticut 15-19 20391   1353 0.0663
## 3 Hispanic or Latino  2009 NE     Connecticut 15-19 21060   1266 0.0601
## 4 Hispanic or Latino  2010 NE     Connecticut 15-19 21286   1104 0.0519
## 5 Hispanic or Latino  2011 NE     Connecticut 15-19 21670   1019 0.0470
## 6 Hispanic or Latino  2012 NE     Connecticut 15-19 22126    943 0.0426

First Plot

Plot the yearly Rate for age group 25-29 in a grid by Ethnicity and Region.

Solution

ethnicity_region_0722 %>% 
  filter(Age == "25-29") %>% 
  ggplot(aes(x= Year, y = Rate)) +
  geom_point(size = .1) +
  geom_smooth(color = "red") +
  facet_grid(Ethnicity~Region) +
  ggtitle("TS Plot of Rate for 25-29 by Ethnicity and Region")

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Solution

ethnicity_region_0722 %>% 
  filter(Age == "25-29") %>% 
  ggplot(aes(x= Year, y = Rate)) +
  geom_point(size = .1) +
  geom_smooth() +
  facet_grid(Region~Ethnicity) +
  ggtitle("TS Plot of Rate for 25-29 by Ethnicity and Region")

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

TFR by State

Compute the TFR for each state and year. To compute the TFR multiply every age-specific rate by 5 and sum them. First you need to get the birth rates for each state by adding the births and female populations for the separate ethnicities.

TFR_State_Year = ethnicity_region_0722 %>% 
  group_by(State,Year,Age) %>% 
  summarize(Births = sum(Births),
            Fpop = sum(Fpop)) %>% 
  mutate(Rate = Births/Fpop) %>% 
  ungroup() %>% 
  group_by(State,Year) %>% 
  summarise(TFR = sum(Rate) * 5) %>% 
  ungroup()

## `summarise()` has grouped output by 'State', 'Year'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.

head(TFR_State_Year,16)

## # A tibble: 16 × 3
##    State    Year   TFR
##    <chr>   <dbl> <dbl>
##  1 Alabama  2007  2.04
##  2 Alabama  2008  2.02
##  3 Alabama  2009  1.94
##  4 Alabama  2010  1.87
##  5 Alabama  2011  1.83
##  6 Alabama  2012  1.80
##  7 Alabama  2013  1.79
##  8 Alabama  2014  1.82
##  9 Alabama  2015  1.83
## 10 Alabama  2016  1.82
## 11 Alabama  2017  1.82
## 12 Alabama  2018  1.78
## 13 Alabama  2019  1.81
## 14 Alabama  2020  1.78
## 15 Alaska   2007  2.33
## 16 Alaska   2008  2.40

Scatterplot TFR and Year

Produce the scatterplot using plotly to be interactive. Display the state name with hover.

g1 = TFR_State_Year %>% 
  ggplot(aes(x = Year,y = TFR,group = State)) + geom_point(size = .5)

ggplotly(g1)

National TFR by Race

Create a plot showing the TFR for the whole country by ethnicity.

Solution

g1 = ethnicity_region_0722 %>% 
  group_by(Year,Ethnicity,Age) %>% 
  summarize(Births = sum(Births),
            Fpop = sum(Fpop)) %>% 
  mutate(Rate = Births/Fpop)%>% 
  summarize(TFR = sum(Rate) * 5) %>% 
  ungroup() %>% 
  ggplot(aes(x = Year,y = TFR, color = Ethnicity)) +
  geom_point() +
  geom_smooth() +
  ggtitle("National TFR by Year and Ethnicity")

## `summarise()` has grouped output by 'Year', 'Ethnicity'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.

ggplotly(g1)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Ethnicity and Region

Create a plot showing the TFR by Ethnicity and Region. Use plotly.

Solution

g2 = ethnicity_region_0722 %>% 
  group_by(Year,Region,Ethnicity,Age) %>% 
  summarize(Births = sum(Births),
            Fpop = sum(Fpop)) %>% 
  mutate(Rate = Births/Fpop)%>% 
  summarize(TFR = sum(Rate) * 5) %>% 
  ungroup() %>% 
  ggplot(aes(x = Year,y = TFR, color = Ethnicity)) +
  geom_point() +
  facet_grid(Ethnicity~Region)

## `summarise()` has grouped output by 'Year', 'Region', 'Ethnicity'. You can
## override using the `.groups` argument.
## `summarise()` has grouped output by 'Year', 'Region'. You can override using
## the `.groups` argument.

  ggtitle("Regional TFR by Year and Ethnicity")

## $title
## [1] "Regional TFR by Year and Ethnicity"
## 
## attr(,"class")
## [1] "labels"

ggplotly(g2)

Fertility for AI

Setup

Get Data

Rename and Select

Recode

First Plot

Solution

Flip the Grid

Solution

TFR by State

Scatterplot TFR and Year

National TFR by Race

Solution

Ethnicity and Region

Solution