Fertility I

Harold Nelson

3/4/2019

Setup

library(tidyverse)
## ── Attaching packages ────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0       ✔ purrr   0.2.5  
## ✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.1       ✔ stringr 1.3.1  
## ✔ readr   1.1.1       ✔ forcats 0.3.0
## ── Conflicts ───────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(readr)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Get one year of data.

# Read the tab-delimited natality export into a tibble.
# NOTE(review): the run recorded below reports 47 parsing failures beginning
# at row 414, where lines hold 1 column instead of 10 -- presumably footer
# notes appended to the export; confirm before relying on the trailing rows.
Natality_2007_2017 <- read_delim(
  file = "~/Downloads/Natality, 2007-2017.txt",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
## Parsed with column specification:
## cols(
##   Notes = col_character(),
##   State = col_character(),
##   SCode = col_character(),
##   Year = col_integer(),
##   YearCode = col_integer(),
##   Age = col_character(),
##   AgeCode = col_character(),
##   Births = col_integer(),
##   Fpop = col_character(),
##   Rate = col_character()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 47 parsing failures.
## row # A tibble: 5 x 5 col     row col   expected   actual    file                                  expected   <int> <chr> <chr>      <chr>     <chr>                                 actual 1   414 <NA>  10 columns 1 columns '~/Downloads/Natality, 2007-2017.txt' file 2   415 <NA>  10 columns 1 columns '~/Downloads/Natality, 2007-2017.txt' row 3   416 <NA>  10 columns 1 columns '~/Downloads/Natality, 2007-2017.txt' col 4   417 <NA>  10 columns 1 columns '~/Downloads/Natality, 2007-2017.txt' expected 5   418 <NA>  10 columns 1 columns '~/Downloads/Natality, 2007-2017.txt'
## ... ................. ... ........................................................................ ........ ........................................................................ ...... ........................................................................ .... ........................................................................ ... ........................................................................ ... ........................................................................ ........ ........................................................................
## See problems(...) for more details.

Get 25-29

Extract the 25-29 birth rate for each state, and sort the data frame by that rate.

# State-level birth rate for the "25-29" age code, ordered from the lowest
# rate to the highest. Rate is parsed as character on read, so it is
# converted to numeric before sorting.
srate25_29 <- Natality_2007_2017 %>%
  filter(AgeCode == "25-29") %>%
  transmute(State, Rate25_29 = as.numeric(Rate)) %>%
  arrange(Rate25_29)

# Print the result.
srate25_29
## # A tibble: 51 x 2
##    State                Rate25_29
##    <chr>                    <dbl>
##  1 District of Columbia      44.9
##  2 Massachusetts             69.5
##  3 New York                  79.0
##  4 Rhode Island              79.7
##  5 California                82.6
##  6 Connecticut               83.8
##  7 Oregon                    85.8
##  8 Colorado                  85.8
##  9 New Jersey                89.5
## 10 Maryland                  91.7
## # … with 41 more rows

Get 20-24

# State-level birth rate for the "20-24" age code, ordered from the lowest
# rate to the highest. Rate is parsed as character on read, so it is
# converted to numeric before sorting.
srate20_24 <- Natality_2007_2017 %>%
  filter(AgeCode == "20-24") %>%
  transmute(State, Rate20_24 = as.numeric(Rate)) %>%
  arrange(Rate20_24)

# Print the result.
srate20_24
## # A tibble: 51 x 2
##    State                Rate20_24
##    <chr>                    <dbl>
##  1 Massachusetts             32.4
##  2 Connecticut               38.2
##  3 Vermont                   40.0
##  4 New Hampshire             40.8
##  5 Rhode Island              46.4
##  6 New Jersey                47.8
##  7 District of Columbia      49.6
##  8 New York                  52.4
##  9 Minnesota                 54.3
## 10 California                57.8
## # … with 41 more rows

Join

# Combine the two per-state rate tables into one, keeping every state that
# appears in either table. No join key is given, so dplyr joins on the
# shared column and reports which one it used.
both <- full_join(srate20_24, srate25_29)
## Joining, by = "State"
both
## # A tibble: 51 x 3
##    State                Rate20_24 Rate25_29
##    <chr>                    <dbl>     <dbl>
##  1 Massachusetts             32.4      69.5
##  2 Connecticut               38.2      83.8
##  3 Vermont                   40.0      93.4
##  4 New Hampshire             40.8      92.2
##  5 Rhode Island              46.4      79.7
##  6 New Jersey                47.8      89.5
##  7 District of Columbia      49.6      44.9
##  8 New York                  52.4      79.0
##  9 Minnesota                 54.3     114. 
## 10 California                57.8      82.6
## # … with 41 more rows

Scatterplot

# Static scatterplot: one point per row of `both`, with the 20-24 rate on
# the x axis and the 25-29 rate on the y axis.
p <- ggplot(both, aes(x = Rate20_24, y = Rate25_29)) +
  geom_point()

# Render the plot.
p

# Interactive version of the scatterplot, rendered through plotly.
# State is mapped to the group aesthetic -- presumably so the state name
# shows up in the hover tooltip; confirm against ggplotly's tooltip docs.
pI <- ggplot(both, aes(x = Rate20_24, y = Rate25_29, group = State)) +
  geom_point()

# Convert the ggplot object to a plotly widget and display it.
ggplotly(pI)

Where are we

We’ve downloaded one year of data from CDC WONDER and examined birth rates for two age groups.

We want more years of data. We want a comprehensive births metric.