Fertility Study

Get Libraries and Raw Data

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(stringr)
library(readr)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

Exercise

Import the two text files Nat0306 and Nat0307 from Moodle.

Using your text editor, fix the row of variable names. Use the following

“Notes” “State” “StateCode” “Age” “AgeCode” “Year” “YearCode” Births FPop Rate.

Now read the data using “Import Dataset” Select the tidyverse options. Name the dataframes Nat0306 and Nat0717.

Answer

Nat0306 <- read_delim("~/Dropbox/RProjects/Fertility Study/Nat0306.txt","\t", escape_double = FALSE)

## Parsed with column specification:
## cols(
##   Notes = col_character(),
##   State = col_character(),
##   StateCode = col_character(),
##   Age = col_character(),
##   AgeCode = col_character(),
##   Year = col_integer(),
##   YearCode = col_integer(),
##   Births = col_integer(),
##   FPop = col_character(),
##   Rate = col_character()
## )

## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)

## Warning: 66 parsing failures.
## row # A tibble: 5 x 5 col     row col   expected   actual    file                                    expected   <int> <chr> <chr>      <chr>     <chr>                                   actual 1  1637 <NA>  10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… file 2  1638 <NA>  10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… row 3  1639 <NA>  10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… col 4  1640 <NA>  10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… expected 5  1641 <NA>  10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.

Nat0717 <- read_delim("~/Dropbox/RProjects/Fertility Study/Nat0717.txt","\t", escape_double = FALSE)

## Parsed with column specification:
## cols(
##   Notes = col_character(),
##   State = col_character(),
##   StateCode = col_character(),
##   Age = col_character(),
##   AgeCode = col_character(),
##   Year = col_integer(),
##   YearCode = col_integer(),
##   Births = col_integer(),
##   FPop = col_character(),
##   Rate = col_character()
## )

## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)

## Warning: 73 parsing failures.
## row # A tibble: 5 x 5 col     row col   expected   actual    file                                    expected   <int> <chr> <chr>      <chr>     <chr>                                   actual 1  4539 <NA>  10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… file 2  4540 <NA>  10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… row 3  4541 <NA>  10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… col 4  4542 <NA>  10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… expected 5  4543 <NA>  10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.

Parsing Errors ??

Look at the data. What should we do?

Solution

Select the variables we want. Convert the character variable Rate to numeric and eliminate the rows where this variable is na. Then combine the two dataframes using rbind. Name the result both.

We could probably do the rbind first and save a few lines of typing. Pre-preocesing the two separate files eliminates the possibility of problems because of differences between these two files, which came from different queries.

Nat0306 %>% 
  select(State,Year,Age,Rate) %>% 
  mutate(Rate = as.numeric(Rate)) %>% 
  filter(!is.na(Rate)) -> Nat0306

## Warning in evalq(as.numeric(Rate), <environment>): NAs introduced by
## coercion

glimpse(Nat0306)

## Observations: 1,224
## Variables: 4
## $ State <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "...
## $ Year  <int> 2003, 2004, 2005, 2006, 2003, 2004, 2005, 2006, 2003, 20...
## $ Age   <chr> "15-19 years", "15-19 years", "15-19 years", "15-19 year...
## $ Rate  <dbl> 51.40, 51.02, 48.06, 51.83, 113.67, 113.48, 116.90, 121....

Nat0717 %>% 
  select(State,Year,Age,Rate) %>% 
  mutate(Rate = as.numeric(Rate)) %>% 
  filter(!is.na(Rate)) -> Nat0717

## Warning in evalq(as.numeric(Rate), <environment>): NAs introduced by
## coercion

glimpse(Nat0717)

## Observations: 3,366
## Variables: 4
## $ State <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "...
## $ Year  <int> 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 20...
## $ Age   <chr> "15-19 years", "15-19 years", "15-19 years", "15-19 year...
## $ Rate  <dbl> 52.07, 50.53, 48.30, 43.63, 40.53, 39.15, 34.32, 32.02, ...

both = rbind(Nat0306,Nat0717)
summary(both)

##     State                Year          Age                 Rate       
##  Length:4590        Min.   :2003   Length:4590        Min.   :  3.93  
##  Class :character   1st Qu.:2006   Class :character   1st Qu.: 29.38  
##  Mode  :character   Median :2010   Mode  :character   Median : 60.67  
##                     Mean   :2010                      Mean   : 65.21  
##                     3rd Qu.:2014                      3rd Qu.:102.11  
##                     Max.   :2017                      Max.   :184.05

Create the Total Fertility Rate (TFR)

Now create a new dataframe with one row for each combination of State and Year. Create the variable TFR.

Add all of the rates for each age group in the calendar year. Divide the total by 1000. Multiply by 5.

The TFR is the average number of births experienced by a woman during her child-bearing years.

A value of TFR below 2 mathematically leads to a declining population.

Answer

both %>% 
  group_by(State,Year) %>% 
  summarize(TFR = 5*sum(Rate)/1000) %>% 
  ungroup() -> SYTFR
glimpse(SYTFR)

## Observations: 765
## Variables: 3
## $ State <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "...
## $ Year  <int> 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 20...
## $ TFR   <dbl> 1.92205, 1.91695, 1.93785, 2.00555, 2.04290, 2.02050, 1....

Examine the Results

Let’s look at the results. First try side-by-side boxplots to look at the changes over time. Note that we have to cast Year as a factor to get our boxplots.

Solution

SYTFR %>% 
  ggplot(aes(x = factor(Year),y=TFR)) + geom_boxplot() + coord_flip()

There are some distinct groups of years. For analysis purposes, I would group them as follows:

2003 - 2005 - Early
2006 - 2008 - Peak
2009 - 2010 - Decline (Possibly separate)
2012 - 2017 - Late

Scatterplot.

Do a scatterplot of TFR against Year. Group the results by state for a smoother. Use ggplotly to make the plot interactive. Play with fig.width and fig.height in the chunk specification.

Solution

p = SYTFR %>% ggplot(aes(x= Year, y = TFR,group=State)) + geom_point() + geom_smooth()
ggplotly(p)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Exercise

Let’s create growProp the proportion of states with a TFR greater than the breakeven point for each year. Plot this variable against Year.

SYTFR %>% group_by(Year) %>% 
  summarize(growProp = mean(TFR > 2.0))  %>% 
  ggplot(aes(x=Year,y=growProp)) + geom_point()

Cleveland Dotplot

Let’s do a Cleveland dotplot to see how the states rank in 2017.

Note: Hack this from code written by Naomi Robbins. You can see the original at http://www.joyce-robbins.com/blog/2016/06/02/datavis-with-rdrawing-a-cleveland-dot-plot-with-ggplot2/

Use this to graph our TFR data for 2017 US States.

Answer

# create a theme for dot plots, which can be reused
theme_dotplot <- theme_bw(14) +
    theme(axis.text.y = element_text(size = rel(.75)),
        axis.ticks.y = element_blank(),
        axis.title.x = element_text(size = rel(.75)),
        panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_line(size = 0.5),
        panel.grid.minor.x = element_blank())
        
# move row names to a dataframe column        
df <-  SYTFR %>% filter(Year == 2017)

# create the plot
ggplot(df, aes(x = TFR, y = reorder(State, TFR))) +
    geom_point(color = "blue") +
  geom_vline(xintercept = 2.0, color = "red") +
    scale_x_continuous(limits = c(1,2.5),
        breaks = seq(1, 2.5, .25)) +
    theme_dotplot +
    xlab("\nTotal Fertility Rate Women aged 15-44") +
    ylab("US States\n") +
    ggtitle("Total Fertility Rate\nUnited States, 2017")

Now a choropleth

Let’s do a map for 2017. Look at Healy Section 7.1 in bookdown.org for instructions.

Answer

library(maps)

## 
## Attaching package: 'maps'

## The following object is masked from 'package:purrr':
## 
##     map

library(ggmap)

## 
## Attaching package: 'ggmap'

## The following object is masked from 'package:plotly':
## 
##     wind

us_states <- map_data("state")

df <-  SYTFR %>% filter(Year == 2017)
glimpse(df)

## Observations: 51
## Variables: 3
## $ State <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California"...
## $ Year  <int> 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 20...
## $ TFR   <dbl> 1.81800, 2.01495, 1.78630, 1.90200, 1.68450, 1.62545, 1....

glimpse(us_states)

## Observations: 15,537
## Variables: 6
## $ long      <dbl> -87.46201, -87.48493, -87.52503, -87.53076, -87.5708...
## $ lat       <dbl> 30.38968, 30.37249, 30.37249, 30.33239, 30.32665, 30...
## $ group     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ order     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ region    <chr> "alabama", "alabama", "alabama", "alabama", "alabama...
## $ subregion <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...

# How can we join these 2?

df$region <- tolower(df$State)
glimpse(df)

## Observations: 51
## Variables: 4
## $ State  <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California...
## $ Year   <int> 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2...
## $ TFR    <dbl> 1.81800, 2.01495, 1.78630, 1.90200, 1.68450, 1.62545, 1...
## $ region <chr> "alabama", "alaska", "arizona", "arkansas", "california...

df2 <- left_join(us_states,df)

## Joining, by = "region"

glimpse(df2)

## Observations: 15,537
## Variables: 9
## $ long      <dbl> -87.46201, -87.48493, -87.52503, -87.53076, -87.5708...
## $ lat       <dbl> 30.38968, 30.37249, 30.37249, 30.33239, 30.32665, 30...
## $ group     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ order     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ region    <chr> "alabama", "alabama", "alabama", "alabama", "alabama...
## $ subregion <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ State     <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama...
## $ Year      <int> 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017...
## $ TFR       <dbl> 1.818, 1.818, 1.818, 1.818, 1.818, 1.818, 1.818, 1.8...

# Now do the map

p0 <- ggplot(data = df2,
             mapping = aes(x = long, y = lat,
                           group = group, fill = TFR)) + geom_polygon()

p0