Fertility for AI

Harold Nelson

7/30/2024

Setup

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Get Data

ethnicity_region_0722 <- read_delim("Natality, 2007-2022.txt", delim = "\t", escape_double = FALSE, trim_ws = TRUE)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 15313 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (11): Notes, Census Region, Census Region Code, State, State Code, Age o...
## dbl  (3): Year, Year Code, Births
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Rename and Select

ethnicity_region_0722 = ethnicity_region_0722 %>% 
  rename(Region = `Census Region Code`,
         Ethnicity = `Mother's Hispanic Origin`,
         Age = `Age of Mother 9 Code`,
         Fpop = `Female Population`,
         Rate = `Fertility Rate` ) %>% 
  select(Ethnicity, Year, Region,State, Age, Fpop, Births, Rate)

glimpse(ethnicity_region_0722)
## Rows: 15,313
## Columns: 8
## $ Ethnicity <chr> "Hispanic or Latino", "Hispanic or Latino", "Hispanic or Lat…
## $ Year      <dbl> 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2016, 2020, …
## $ Region    <chr> "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1", "CENS…
## $ State     <chr> "Connecticut", "Connecticut", "Connecticut", "Connecticut", …
## $ Age       <chr> "15", "15", "15", "15", "15", "15", "15", "15", "15", "15", …
## $ Fpop      <chr> "Not Available", "Not Available", "Not Available", "Not Avai…
## $ Births    <dbl> 18, 11, 12, 14, 15, 14, 10, 11, 10, 12, 10, 15, 15, 1301, 13…
## $ Rate      <chr> "Not Available", "Not Available", "Not Available", "Not Avai…

Recode

ethnicity_region_0722 = ethnicity_region_0722 %>% 
  mutate(Region = ifelse(Region == "CENS-R1","NE",Region),
         Region = ifelse(Region == "CENS-R2","MW",Region),
         Region = ifelse(Region == "CENS-R3","SO",Region),
         Region = ifelse(Region == "CENS-R4","WE",Region),
         Fpop = as.numeric(Fpop),
         Rate = as.numeric(Rate)/1000) %>% 
drop_na()
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Fpop = as.numeric(Fpop)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
head(ethnicity_region_0722)
## # A tibble: 6 × 8
##   Ethnicity           Year Region State       Age    Fpop Births   Rate
##   <chr>              <dbl> <chr>  <chr>       <chr> <dbl>  <dbl>  <dbl>
## 1 Hispanic or Latino  2007 NE     Connecticut 15-19 19523   1301 0.0666
## 2 Hispanic or Latino  2008 NE     Connecticut 15-19 20391   1353 0.0663
## 3 Hispanic or Latino  2009 NE     Connecticut 15-19 21060   1266 0.0601
## 4 Hispanic or Latino  2010 NE     Connecticut 15-19 21286   1104 0.0519
## 5 Hispanic or Latino  2011 NE     Connecticut 15-19 21670   1019 0.0470
## 6 Hispanic or Latino  2012 NE     Connecticut 15-19 22126    943 0.0426

First Plot

Plot the yearly Rate for age group 25-29 in a grid by Ethnicity and Region.

Solution

ethnicity_region_0722 %>% 
  filter(Age == "25-29") %>% 
  ggplot(aes(x= Year, y = Rate)) +
  geom_point(size = .1) +
  geom_smooth(color = "red") +
  facet_grid(Ethnicity~Region) +
  ggtitle("TS Plot of Rate for 25-29 by Ethnicity and Region")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Flip the Grid

Solution

ethnicity_region_0722 %>% 
  filter(Age == "25-29") %>% 
  ggplot(aes(x= Year, y = Rate)) +
  geom_point(size = .1) +
  geom_smooth() +
  facet_grid(Region~Ethnicity) +
  ggtitle("TS Plot of Rate for 25-29 by Ethnicity and Region")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

TFR by State

Compute the TFR for each state and year. To compute the TFR multiply every age-specific rate by 5 and sum them. First you need to get the birth rates for each state by adding the births and female populations for the separate ethnicities.

TFR_State_Year = ethnicity_region_0722 %>% 
  group_by(State,Year,Age) %>% 
  summarize(Births = sum(Births),
            Fpop = sum(Fpop)) %>% 
  mutate(Rate = Births/Fpop) %>% 
  ungroup() %>% 
  group_by(State,Year) %>% 
  summarise(TFR = sum(Rate) * 5) %>% 
  ungroup()
## `summarise()` has grouped output by 'State', 'Year'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
head(TFR_State_Year,16)
## # A tibble: 16 × 3
##    State    Year   TFR
##    <chr>   <dbl> <dbl>
##  1 Alabama  2007  2.04
##  2 Alabama  2008  2.02
##  3 Alabama  2009  1.94
##  4 Alabama  2010  1.87
##  5 Alabama  2011  1.83
##  6 Alabama  2012  1.80
##  7 Alabama  2013  1.79
##  8 Alabama  2014  1.82
##  9 Alabama  2015  1.83
## 10 Alabama  2016  1.82
## 11 Alabama  2017  1.82
## 12 Alabama  2018  1.78
## 13 Alabama  2019  1.81
## 14 Alabama  2020  1.78
## 15 Alaska   2007  2.33
## 16 Alaska   2008  2.40

Scatterplot TFR and Year

Produce the scatterplot using plotly to be interactive. Display the state name with hover.

g1 = TFR_State_Year %>% 
  ggplot(aes(x = Year,y = TFR,group = State)) + geom_point(size = .5)

ggplotly(g1)

National TFR by Race

Create a plot showing the TFR for the whole country by ethnicity.

Solution

g1 = ethnicity_region_0722 %>% 
  group_by(Year,Ethnicity,Age) %>% 
  summarize(Births = sum(Births),
            Fpop = sum(Fpop)) %>% 
  mutate(Rate = Births/Fpop)%>% 
  summarize(TFR = sum(Rate) * 5) %>% 
  ungroup() %>% 
  ggplot(aes(x = Year,y = TFR, color = Ethnicity)) +
  geom_point() +
  geom_smooth() +
  ggtitle("National TFR by Year and Ethnicity")
## `summarise()` has grouped output by 'Year', 'Ethnicity'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
ggplotly(g1)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Ethnicity and Region

Create a plot showing the TFR by Ethnicity and Region. Use plotly.

Solution

g2 = ethnicity_region_0722 %>% 
  group_by(Year,Region,Ethnicity,Age) %>% 
  summarize(Births = sum(Births),
            Fpop = sum(Fpop)) %>% 
  mutate(Rate = Births/Fpop)%>% 
  summarize(TFR = sum(Rate) * 5) %>% 
  ungroup() %>% 
  ggplot(aes(x = Year,y = TFR, color = Ethnicity)) +
  geom_point() +
  facet_grid(Ethnicity~Region)
## `summarise()` has grouped output by 'Year', 'Region', 'Ethnicity'. You can
## override using the `.groups` argument.
## `summarise()` has grouped output by 'Year', 'Region'. You can override using
## the `.groups` argument.
  ggtitle("Regional TFR by Year and Ethnicity")
## $title
## [1] "Regional TFR by Year and Ethnicity"
## 
## attr(,"class")
## [1] "labels"
ggplotly(g2)