Setup

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Import data

# Natality records for 2003-2006, read from a tab-delimited text file.
# readr reports parsing issues for this file; inspect them with problems().
hispanic_region_2003_2006 <- read_delim(
  file = "C:/SaintMartin/CSC530-DataAnalysis/Assignments/Fertility-Assignment/Natality, 2003-2006.txt",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 427 Columns: 12
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (9): Notes, Census Region, Census Region Code, Mother's Hispanic Origin,...
## dbl (3): Year, Year Code, Births
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Check the column names and parsed types of the 2003-2006 import.
glimpse(hispanic_region_2003_2006)
## Rows: 427
## Columns: 12
## $ Notes                           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ `Census Region`                 <chr> "Census Region 1: Northeast", "Census ~
## $ `Census Region Code`            <chr> "CENS-R1", "CENS-R1", "CENS-R1", "CENS~
## $ `Mother's Hispanic Origin`      <chr> "Hispanic or Latino", "Hispanic or Lat~
## $ `Mother's Hispanic Origin Code` <chr> "2135-2", "2135-2", "2135-2", "2135-2"~
## $ `Age of Mother 9`               <chr> "Under 15 years", "Under 15 years", "U~
## $ `Age of Mother 9 Code`          <chr> "15", "15", "15", "15", "15-19", "15-1~
## $ Year                            <dbl> 2003, 2004, 2005, 2006, 2003, 2004, 20~
## $ `Year Code`                     <dbl> 2003, 2004, 2005, 2006, 2003, 2004, 20~
## $ Births                          <dbl> 257, 257, 263, 251, 14691, 15072, 1524~
## $ `Female Population`             <chr> "Not Available", "Not Available", "Not~
## $ `Fertility Rate`                <chr> "Not Available", "Not Available", "Not~
# Natality records for 2007-2020, read from a tab-delimited text file.
# readr reports parsing issues for this file; inspect them with problems().
hispanic_region_2007_2020 <- read_delim(
  file = "C:/SaintMartin/CSC530-DataAnalysis/Assignments/Fertility-Assignment/Natality, 2007-2020.txt",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 1455 Columns: 12
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (9): Notes, Census Region, Census Region Code, Mother's Hispanic Origin,...
## dbl (3): Year, Year Code, Births
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Check the column names and parsed types of the 2007-2020 import.
glimpse(hispanic_region_2007_2020)
## Rows: 1,455
## Columns: 12
## $ Notes                           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ `Census Region`                 <chr> "Census Region 1: Northeast", "Census ~
## $ `Census Region Code`            <chr> "CENS-R1", "CENS-R1", "CENS-R1", "CENS~
## $ `Mother's Hispanic Origin`      <chr> "Hispanic or Latino", "Hispanic or Lat~
## $ `Mother's Hispanic Origin Code` <chr> "2135-2", "2135-2", "2135-2", "2135-2"~
## $ `Age of Mother 9`               <chr> "Under 15 years", "Under 15 years", "U~
## $ `Age of Mother 9 Code`          <chr> "15", "15", "15", "15", "15", "15", "1~
## $ Year                            <dbl> 2007, 2008, 2009, 2010, 2011, 2012, 20~
## $ `Year Code`                     <dbl> 2007, 2008, 2009, 2010, 2011, 2012, 20~
## $ Births                          <dbl> 248, 224, 237, 218, 191, 186, 134, 137~
## $ `Female Population`             <chr> "Not Available", "Not Available", "Not~
## $ `Fertility Rate`                <chr> "Not Available", "Not Available", "Not~

Combine the two data frames above into one with rbind

# Stack the two imports (identical 12-column layout) into a single table
# covering 2003-2020, then confirm the combined row count and column types.
hispanic_region_2003_2020 <- rbind(
  hispanic_region_2003_2006,
  hispanic_region_2007_2020
)

glimpse(hispanic_region_2003_2020)
## Rows: 1,882
## Columns: 12
## $ Notes                           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ `Census Region`                 <chr> "Census Region 1: Northeast", "Census ~
## $ `Census Region Code`            <chr> "CENS-R1", "CENS-R1", "CENS-R1", "CENS~
## $ `Mother's Hispanic Origin`      <chr> "Hispanic or Latino", "Hispanic or Lat~
## $ `Mother's Hispanic Origin Code` <chr> "2135-2", "2135-2", "2135-2", "2135-2"~
## $ `Age of Mother 9`               <chr> "Under 15 years", "Under 15 years", "U~
## $ `Age of Mother 9 Code`          <chr> "15", "15", "15", "15", "15-19", "15-1~
## $ Year                            <dbl> 2003, 2004, 2005, 2006, 2003, 2004, 20~
## $ `Year Code`                     <dbl> 2003, 2004, 2005, 2006, 2003, 2004, 20~
## $ Births                          <dbl> 257, 257, 263, 251, 14691, 15072, 1524~
## $ `Female Population`             <chr> "Not Available", "Not Available", "Not~
## $ `Fertility Rate`                <chr> "Not Available", "Not Available", "Not~

Rename and select

# Keep only the seven columns used in the analysis, giving the verbose
# source headers short names in the same step (select() renames with
# `new = old`).
hispanic_region_2003_2020 <- hispanic_region_2003_2020 %>%
  select(
    Hispanic_origin = `Mother's Hispanic Origin`,
    Year,
    Region = `Census Region Code`,
    Age = `Age of Mother 9 Code`,
    Fpop = `Female Population`,
    Births,
    Rate = `Fertility Rate`
  )

# Confirm the reduced column set and names.
glimpse(hispanic_region_2003_2020)
## Rows: 1,882
## Columns: 7
## $ Hispanic_origin <chr> "Hispanic or Latino", "Hispanic or Latino", "Hispanic ~
## $ Year            <dbl> 2003, 2004, 2005, 2006, 2003, 2004, 2005, 2006, 2003, ~
## $ Region          <chr> "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1",~
## $ Age             <chr> "15", "15", "15", "15", "15-19", "15-19", "15-19", "15~
## $ Fpop            <chr> "Not Available", "Not Available", "Not Available", "No~
## $ Births          <dbl> 257, 257, 263, 251, 14691, 15072, 15249, 16136, 32675,~
## $ Rate            <chr> "Not Available", "Not Available", "Not Available", "No~

Recode and clean data

# Recode and clean the combined table:
#   * Region: census region codes -> two-letter abbreviations.
#   * Hispanic_origin: long labels -> "Hispanic" / "Non Hispanic".
#   * Fpop, Rate: character -> numeric. Non-numeric entries such as
#     "Not Available" become NA (as.numeric() warns about the coercion).
#   * Rate is divided by 1000 to express it per person.
#   * Rows with unknown origin, or with any NA left after conversion, are
#     removed.
hispanic_region_2003_2020 <- hispanic_region_2003_2020 %>%
  mutate(
    Region = case_when(
      Region == "CENS-R1" ~ "NE",
      Region == "CENS-R2" ~ "MW",
      Region == "CENS-R3" ~ "SO",
      Region == "CENS-R4" ~ "WE",
      TRUE ~ Region
    ),
    Hispanic_origin = case_when(
      Hispanic_origin == "Hispanic or Latino" ~ "Hispanic",
      Hispanic_origin == "Not Hispanic or Latino" ~ "Non Hispanic",
      TRUE ~ Hispanic_origin
    ),
    Fpop = as.numeric(Fpop),
    Rate = as.numeric(Rate) / 1000 # per person
  ) %>%
  # The label must be a single-line literal: a line break inside the quotes
  # becomes part of the string, so the comparison would never match and the
  # filter would silently keep every row.
  filter(Hispanic_origin != "Unknown or Not Stated") %>%
  drop_na()
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
# Verify the cleaned table: recoded labels, numeric Fpop/Rate, remaining rows.
glimpse(hispanic_region_2003_2020)
## Rows: 864
## Columns: 7
## $ Hispanic_origin <chr> "Hispanic", "Hispanic", "Hispanic", "Hispanic", "Hispa~
## $ Year            <dbl> 2003, 2004, 2005, 2006, 2003, 2004, 2005, 2006, 2003, ~
## $ Region          <chr> "NE", "NE", "NE", "NE", "NE", "NE", "NE", "NE", "NE", ~
## $ Age             <chr> "15-19", "15-19", "15-19", "15-19", "20-24", "20-24", ~
## $ Fpop            <dbl> 233887, 242902, 255348, 266240, 248961, 251248, 252368~
## $ Births          <dbl> 14691, 15072, 15249, 16136, 32675, 33102, 33549, 35314~
## $ Rate            <dbl> 0.06281, 0.06205, 0.05972, 0.06061, 0.13125, 0.13175, ~
# Preview the first rows of the cleaned table.
head(hispanic_region_2003_2020)
## # A tibble: 6 x 7
##   Hispanic_origin  Year Region Age     Fpop Births   Rate
##   <chr>           <dbl> <chr>  <chr>  <dbl>  <dbl>  <dbl>
## 1 Hispanic         2003 NE     15-19 233887  14691 0.0628
## 2 Hispanic         2004 NE     15-19 242902  15072 0.0620
## 3 Hispanic         2005 NE     15-19 255348  15249 0.0597
## 4 Hispanic         2006 NE     15-19 266240  16136 0.0606
## 5 Hispanic         2003 NE     20-24 248961  32675 0.131 
## 6 Hispanic         2004 NE     20-24 251248  33102 0.132

Plot the yearly Rate for age group 25-29 in a grid by Hispanic_origin and Region.

# Yearly fertility rate for the 25-29 age group, one panel per combination
# of Hispanic origin (rows) and region (columns).
ggplot(
  data = filter(hispanic_region_2003_2020, Age == "25-29"),
  mapping = aes(x = Year, y = Rate)
) +
  geom_point() +
  facet_grid(rows = vars(Hispanic_origin), cols = vars(Region)) +
  labs(title = "TS Plot of Rate for 25-29 by Hispanic-origin and Region")

Flip the grid

# Same data as the previous plot with the facet layout transposed:
# regions as rows, Hispanic origin as columns.
ggplot(
  data = filter(hispanic_region_2003_2020, Age == "25-29"),
  mapping = aes(x = Year, y = Rate)
) +
  geom_point() +
  facet_grid(rows = vars(Region), cols = vars(Hispanic_origin)) +
  labs(title = "TS Plot of Rate for 25-29 by Hispanic-origin and Region")

National TFR by Hispanic origin

Create a plot showing the Total Fertility Rate (TFR) for the whole country by Hispanic origin.

# National TFR by year and Hispanic origin.
#   1. Pool births and female population over regions within each
#      Year x Hispanic_origin x Age cell.
#   2. Recompute the age-specific rate from the pooled counts.
#   3. Sum the rates over age groups and multiply by 5, the width in years
#      of each age group (e.g. 15-19, 20-24).
# `.groups` is set explicitly: "drop_last" keeps Year and Hispanic_origin
# grouped so that step 3 sums over Age only; "drop" returns an ungrouped
# table, so no separate ungroup() is needed.
g1 <- hispanic_region_2003_2020 %>%
  group_by(Year, Hispanic_origin, Age) %>%
  summarize(
    Births = sum(Births),
    Fpop = sum(Fpop),
    .groups = "drop_last"
  ) %>%
  mutate(Rate = Births / Fpop) %>%
  summarize(TFR = sum(Rate) * 5, .groups = "drop") %>%
  ggplot(aes(x = Year, y = TFR, color = Hispanic_origin)) +
  ggtitle("National TFR by Year and Hispanic origin") +
  geom_point()
# Render the ggplot as an interactive plotly widget.
ggplotly(g1)

Hispanic origin and Region

Create a plot showing the TFR by Hispanic origin and region, rendered with plotly.

# Regional TFR by year, Hispanic origin and region.
#   1. Total births and female population within each
#      Year x Region x Hispanic_origin x Age cell.
#   2. Recompute the age-specific rate from those totals.
#   3. Sum the rates over age groups and multiply by 5, the width in years
#      of each age group (e.g. 15-19, 20-24).
# `.groups` is set explicitly: "drop_last" keeps Year, Region and
# Hispanic_origin grouped so that step 3 sums over Age only; "drop" returns
# an ungrouped table, so no separate ungroup() is needed.
g2 <- hispanic_region_2003_2020 %>%
  group_by(Year, Region, Hispanic_origin, Age) %>%
  summarize(
    Births = sum(Births),
    Fpop = sum(Fpop),
    .groups = "drop_last"
  ) %>%
  mutate(Rate = Births / Fpop) %>%
  summarize(TFR = sum(Rate) * 5, .groups = "drop") %>%
  ggplot(aes(x = Year, y = TFR, color = Hispanic_origin)) +
  geom_point() +
  facet_grid(Hispanic_origin ~ Region) +
  ggtitle("Regional TFR by Year and Hispanic origin and Region")
# Render the faceted ggplot as an interactive plotly widget.
ggplotly(g2)