library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Natality extract covering 2003-2006 (per the file name), read as
# tab-delimited text with surrounding whitespace trimmed from each field.
# NOTE(review): the absolute path is specific to one machine.
origin_region_0306 <- read_delim(
  file = "C:/Users/Tedy/Downloads/Natality, 2003-2006 (1).txt",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 3868 Columns: 14
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (11): Notes, Census Region, Census Region Code, State, State Code, Mothe...
## dbl  (3): Year, Year Code, Births
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Natality extract covering 2007-2020 (per the file name), read with the same
# settings as the 2003-2006 extract so the two tables can be stacked.
# NOTE(review): the absolute path is specific to one machine.
origin_region_0720 <- read_delim(
  file = "C:/Users/Tedy/Downloads/Natality, 2007-2020 (1).txt",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 990 Columns: 14
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (11): Notes, Census Region, Census Region Code, State, State Code, Mothe...
## dbl  (3): Year, Year Code, Births
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Stack the 2003-2006 and 2007-2020 extracts into one table.
origin_region_0320 <- rbind(
  origin_region_0306,
  origin_region_0720
)

# Inspect the column names and types of the combined table.

glimpse(origin_region_0320)
## Rows: 4,858
## Columns: 14
## $ Notes                           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ `Census Region`                 <chr> "Census Region 1: Northeast", "Census ~
## $ `Census Region Code`            <chr> "CENS-R1", "CENS-R1", "CENS-R1", "CENS~
## $ State                           <chr> "Connecticut", "Connecticut", "Connect~
## $ `State Code`                    <chr> "09", "09", "09", "09", "09", "09", "0~
## $ `Mother's Hispanic Origin`      <chr> "Hispanic or Latino", "Hispanic or Lat~
## $ `Mother's Hispanic Origin Code` <chr> "2135-2", "2135-2", "2135-2", "2135-2"~
## $ `Age of Mother 9`               <chr> "Under 15 years", "Under 15 years", "U~
## $ `Age of Mother 9 Code`          <chr> "15", "15", "15", "15", "15-19", "15-1~
## $ Year                            <dbl> 2003, 2004, 2005, 2006, 2003, 2004, 20~
## $ `Year Code`                     <dbl> 2003, 2004, 2005, 2006, 2003, 2004, 20~
## $ Births                          <dbl> 25, 23, 17, 24, 1195, 1247, 1208, 1293~
## $ `Female Population`             <chr> "Not Available", "Not Available", "Not~
## $ `Fertility Rate`                <chr> "Not Available", "Not Available", "Not~

# Rename and select: keep only the seven analysis columns, giving the
# long source names short aliases in the same step.

origin_region_0320 <- origin_region_0320 %>%
  select(
    Origin = `Mother's Hispanic Origin`,
    Year,
    Region = `Census Region Code`,
    Age = `Age of Mother 9 Code`,
    Fpop = `Female Population`,
    Births,
    Rate = `Fertility Rate`
  )
glimpse(origin_region_0320)
## Rows: 4,858
## Columns: 7
## $ Origin <chr> "Hispanic or Latino", "Hispanic or Latino", "Hispanic or Latino~
## $ Year   <dbl> 2003, 2004, 2005, 2006, 2003, 2004, 2005, 2006, 2003, 2004, 200~
## $ Region <chr> "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1~
## $ Age    <chr> "15", "15", "15", "15", "15-19", "15-19", "15-19", "15-19", "20~
## $ Fpop   <chr> "Not Available", "Not Available", "Not Available", "Not Availab~
## $ Births <dbl> 25, 23, 17, 24, 1195, 1247, 1208, 1293, 2454, 2314, 2413, 2534,~
## $ Rate   <chr> "Not Available", "Not Available", "Not Available", "Not Availab~

# Remove rows where the population or the rate is "Not Available".
# Rows where either column is NA are dropped as well, because the
# comparison does not evaluate to TRUE for them.

origin_region_0320 <- origin_region_0320 %>%
  filter(
    Fpop != "Not Available",
    Rate != "Not Available"
  )
view(origin_region_0320)

# Recode: shorten the region codes and origin labels, convert the population
# and rate columns from text to numbers, then drop unusable rows.
#
# Each column is recoded with a single case_when() instead of a chain of
# ifelse() re-assignments: values that match no rule fall through unchanged,
# and the result stays character even when the table is empty (ifelse()
# returns logical(0) in that case).

origin_region_0320 <- origin_region_0320 %>%
  mutate(
    Region = case_when(
      Region == "CENS-R1" ~ "NE",
      Region == "CENS-R2" ~ "MW",
      Region == "CENS-R3" ~ "SO",
      Region == "CENS-R4" ~ "WE",
      TRUE ~ Region
    ),
    # NOTE(review): Origin comes from the "Mother's Hispanic Origin" column,
    # so these race labels may never match -- confirm against the data.
    Origin = case_when(
      Origin == "American Indian or Alaska Native" ~ "AmInd",
      Origin == "Asian or Pacific Islander" ~ "API",
      Origin == "Black or African American" ~ "Black",
      TRUE ~ Origin
    ),
    # Non-numeric text becomes NA here and is removed by drop_na() below.
    Fpop = as.numeric(Fpop),
    # Presumably converts a rate per 1,000 women to a rate per woman -- confirm.
    Rate = as.numeric(Rate) / 1000
  ) %>%
  filter(Origin != "Not Reported") %>%
  drop_na()

# First plot: yearly rate for age group 25-29, drawn as a grid of panels
# with one row per Origin and one column per Region.

ggplot(
  data = filter(origin_region_0320, Age == "25-29"),
  mapping = aes(x = Year, y = Rate)
) +
  geom_point() +
  facet_grid(rows = vars(Origin), cols = vars(Region)) +
  ggtitle("TS Plot of Rate for 25-29 by Origin and Region")

# Flip the grid: the same 25-29 plot with the facet layout transposed, so
# Region runs down the rows and Origin across the columns.
#
# Year stays on the x-axis and Rate on the y-axis so each panel still reads
# as a time series; only the arrangement of the panels changes.

origin_region_0320 %>%
  filter(Age == "25-29") %>%
  ggplot(aes(x = Year, y = Rate)) +
  geom_point() +
  facet_grid(Region ~ Origin) +
  ggtitle("TS Plot of Rate for 25-29 by Origin and Region")

# National TFR by Origin: plot the TFR for the whole country by origin.
#
# Steps:
#   1. Sum births and female population over regions within each
#      Year / Origin / Age cell.
#   2. Recompute the age-specific rate from those national totals.
#   3. Sum the rates over age groups and multiply by 5 (presumably the width
#      of each age group in years -- confirm) to get the TFR per Year / Origin.

g1 <- origin_region_0320 %>%
  group_by(Year, Origin, Age) %>%
  summarize(
    Births = sum(Births),
    Fpop = sum(Fpop),
    # Keep the Year / Origin grouping for the TFR sum below.
    .groups = "drop_last"
  ) %>%
  mutate(Rate = Births / Fpop) %>%
  summarize(TFR = sum(Rate) * 5, .groups = "drop") %>%
  ggplot(aes(x = Year, y = TFR, color = Origin)) +
  geom_point() +
  # The title has to be chained with `+`; written as a separate statement it
  # only printed a labels object and never reached the plot.
  ggtitle("National TFR by Year and Race")

# Render the plot as an interactive plotly widget.
ggplotly(g1)

# Race and Region: plot the TFR by origin and region, rendered with plotly.
#
# Steps:
#   1. Sum births and female population within each
#      Year / Region / Origin / Age cell.
#   2. Recompute the age-specific rate from those totals.
#   3. Sum the rates over age groups and multiply by 5 (presumably the width
#      of each age group in years -- confirm) to get the TFR per
#      Year / Region / Origin.

g2 <- origin_region_0320 %>%
  group_by(Year, Region, Origin, Age) %>%
  summarize(
    Births = sum(Births),
    Fpop = sum(Fpop),
    # Keep the Year / Region / Origin grouping for the TFR sum below.
    .groups = "drop_last"
  ) %>%
  mutate(Rate = Births / Fpop) %>%
  summarize(TFR = sum(Rate) * 5, .groups = "drop") %>%
  ggplot(aes(x = Year, y = TFR, color = Origin)) +
  geom_point() +
  facet_grid(Origin ~ Region) +
  # The title has to be chained with `+`; written as a separate statement it
  # only printed a labels object and never reached the plot.
  ggtitle("Regional TFR by Year and Origin")

# Render the faceted plot as an interactive plotly widget.
ggplotly(g2)