library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(stringr)
library(readr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Import the two text files Nat0306 and Nat0307 from Moodle.
Using your text editor, fix the row of variable names. Use the following
“Notes” “State” “StateCode” “Age” “AgeCode” “Year” “YearCode” Births FPop Rate.
Now read the data using “Import Dataset” Select the tidyverse options. Name the dataframes Nat0306 and Nat0717.
Nat0306 <- read_delim("~/Dropbox/RProjects/Fertility Study/Nat0306.txt","\t", escape_double = FALSE)
## Parsed with column specification:
## cols(
## Notes = col_character(),
## State = col_character(),
## StateCode = col_character(),
## Age = col_character(),
## AgeCode = col_character(),
## Year = col_integer(),
## YearCode = col_integer(),
## Births = col_integer(),
## FPop = col_character(),
## Rate = col_character()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 66 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 1637 <NA> 10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… file 2 1638 <NA> 10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… row 3 1639 <NA> 10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… col 4 1640 <NA> 10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… expected 5 1641 <NA> 10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
Nat0717 <- read_delim("~/Dropbox/RProjects/Fertility Study/Nat0717.txt","\t", escape_double = FALSE)
## Parsed with column specification:
## cols(
## Notes = col_character(),
## State = col_character(),
## StateCode = col_character(),
## Age = col_character(),
## AgeCode = col_character(),
## Year = col_integer(),
## YearCode = col_integer(),
## Births = col_integer(),
## FPop = col_character(),
## Rate = col_character()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 73 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 4539 <NA> 10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… file 2 4540 <NA> 10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… row 3 4541 <NA> 10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… col 4 4542 <NA> 10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N… expected 5 4543 <NA> 10 columns 1 columns '~/Dropbox/RProjects/Fertility Study/N…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
Look at the data. What should we do?
Select the variables we want. Convert the character variable Rate to numeric and eliminate the rows where this variable is na. Then combine the two dataframes using rbind. Name the result both.
We could probably do the rbind first and save a few lines of typing. Pre-preocesing the two separate files eliminates the possibility of problems because of differences between these two files, which came from different queries.
Nat0306 %>%
select(State,Year,Age,Rate) %>%
mutate(Rate = as.numeric(Rate)) %>%
filter(!is.na(Rate)) -> Nat0306
## Warning in evalq(as.numeric(Rate), <environment>): NAs introduced by
## coercion
glimpse(Nat0306)
## Observations: 1,224
## Variables: 4
## $ State <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "...
## $ Year <int> 2003, 2004, 2005, 2006, 2003, 2004, 2005, 2006, 2003, 20...
## $ Age <chr> "15-19 years", "15-19 years", "15-19 years", "15-19 year...
## $ Rate <dbl> 51.40, 51.02, 48.06, 51.83, 113.67, 113.48, 116.90, 121....
Nat0717 %>%
select(State,Year,Age,Rate) %>%
mutate(Rate = as.numeric(Rate)) %>%
filter(!is.na(Rate)) -> Nat0717
## Warning in evalq(as.numeric(Rate), <environment>): NAs introduced by
## coercion
glimpse(Nat0717)
## Observations: 3,366
## Variables: 4
## $ State <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "...
## $ Year <int> 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 20...
## $ Age <chr> "15-19 years", "15-19 years", "15-19 years", "15-19 year...
## $ Rate <dbl> 52.07, 50.53, 48.30, 43.63, 40.53, 39.15, 34.32, 32.02, ...
both = rbind(Nat0306,Nat0717)
summary(both)
## State Year Age Rate
## Length:4590 Min. :2003 Length:4590 Min. : 3.93
## Class :character 1st Qu.:2006 Class :character 1st Qu.: 29.38
## Mode :character Median :2010 Mode :character Median : 60.67
## Mean :2010 Mean : 65.21
## 3rd Qu.:2014 3rd Qu.:102.11
## Max. :2017 Max. :184.05
Now create a new dataframe with one row for each combination of State and Year. Create the variable TFR.
Add all of the rates for each age group in the calendar year. Divide the total by 1000. Multiply by 5.
The TFR is the average number of births experienced by a woman during her child-bearing years.
A value of TFR below 2 mathematically leads to a declining population.
both %>%
group_by(State,Year) %>%
summarize(TFR = 5*sum(Rate)/1000) %>%
ungroup() -> SYTFR
glimpse(SYTFR)
## Observations: 765
## Variables: 3
## $ State <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "...
## $ Year <int> 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 20...
## $ TFR <dbl> 1.92205, 1.91695, 1.93785, 2.00555, 2.04290, 2.02050, 1....
Let’s look at the results. First try side-by-side boxplots to look at the changes over time. Note that we have to cast Year as a factor to get our boxplots.
SYTFR %>%
ggplot(aes(x = factor(Year),y=TFR)) + geom_boxplot() + coord_flip()
There are some distinct groups of years. For analysis purposes, I would group them as follows:
Do a scatterplot of TFR against Year. Group the results by state for a smoother. Use ggplotly to make the plot interactive. Play with fig.width and fig.height in the chunk specification.
p = SYTFR %>% ggplot(aes(x= Year, y = TFR,group=State)) + geom_point() + geom_smooth()
ggplotly(p)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Let’s create growProp the proportion of states with a TFR greater than the breakeven point for each year. Plot this variable against Year.
SYTFR %>% group_by(Year) %>%
summarize(growProp = mean(TFR > 2.0)) %>%
ggplot(aes(x=Year,y=growProp)) + geom_point()
Let’s do a Cleveland dotplot to see how the states rank in 2017.
Note: Hack this from code written by Naomi Robbins. You can see the original at http://www.joyce-robbins.com/blog/2016/06/02/datavis-with-rdrawing-a-cleveland-dot-plot-with-ggplot2/
Use this to graph our TFR data for 2017 US States.
# create a theme for dot plots, which can be reused
theme_dotplot <- theme_bw(14) +
theme(axis.text.y = element_text(size = rel(.75)),
axis.ticks.y = element_blank(),
axis.title.x = element_text(size = rel(.75)),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(size = 0.5),
panel.grid.minor.x = element_blank())
# move row names to a dataframe column
df <- SYTFR %>% filter(Year == 2017)
# create the plot
ggplot(df, aes(x = TFR, y = reorder(State, TFR))) +
geom_point(color = "blue") +
geom_vline(xintercept = 2.0, color = "red") +
scale_x_continuous(limits = c(1,2.5),
breaks = seq(1, 2.5, .25)) +
theme_dotplot +
xlab("\nTotal Fertility Rate Women aged 15-44") +
ylab("US States\n") +
ggtitle("Total Fertility Rate\nUnited States, 2017")
Let’s do a map for 2017. Look at Healy Section 7.1 in bookdown.org for instructions.
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(ggmap)
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
##
## wind
us_states <- map_data("state")
df <- SYTFR %>% filter(Year == 2017)
glimpse(df)
## Observations: 51
## Variables: 3
## $ State <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California"...
## $ Year <int> 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 20...
## $ TFR <dbl> 1.81800, 2.01495, 1.78630, 1.90200, 1.68450, 1.62545, 1....
glimpse(us_states)
## Observations: 15,537
## Variables: 6
## $ long <dbl> -87.46201, -87.48493, -87.52503, -87.53076, -87.5708...
## $ lat <dbl> 30.38968, 30.37249, 30.37249, 30.33239, 30.32665, 30...
## $ group <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ order <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ region <chr> "alabama", "alabama", "alabama", "alabama", "alabama...
## $ subregion <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# How can we join these 2?
df$region <- tolower(df$State)
glimpse(df)
## Observations: 51
## Variables: 4
## $ State <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California...
## $ Year <int> 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2...
## $ TFR <dbl> 1.81800, 2.01495, 1.78630, 1.90200, 1.68450, 1.62545, 1...
## $ region <chr> "alabama", "alaska", "arizona", "arkansas", "california...
df2 <- left_join(us_states,df)
## Joining, by = "region"
glimpse(df2)
## Observations: 15,537
## Variables: 9
## $ long <dbl> -87.46201, -87.48493, -87.52503, -87.53076, -87.5708...
## $ lat <dbl> 30.38968, 30.37249, 30.37249, 30.33239, 30.32665, 30...
## $ group <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ order <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ region <chr> "alabama", "alabama", "alabama", "alabama", "alabama...
## $ subregion <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ State <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama...
## $ Year <int> 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017...
## $ TFR <dbl> 1.818, 1.818, 1.818, 1.818, 1.818, 1.818, 1.818, 1.8...
# Now do the map
p0 <- ggplot(data = df2,
mapping = aes(x = long, y = lat,
group = group, fill = TFR)) + geom_polygon()
p0