library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the two tab-delimited natality exports (file names say 2003-2006 and
# 2007-2020). Recorded from the last run: each read raised a "parsing issues"
# warning (inspect with `problems()`); the first file parsed to 3868 rows and
# the second to 990 rows, both with 14 columns (11 chr, 3 dbl).
origin_region_0306 <- read_delim(
  "C:/Users/Tedy/Downloads/Natality, 2003-2006 (1).txt",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
origin_region_0720 <- read_delim(
  "C:/Users/Tedy/Downloads/Natality, 2007-2020 (1).txt",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)

# Stack the two periods row-wise into a single table.
origin_region_0320 <- rbind(origin_region_0306, origin_region_0720)

# Inspect the column names and types of the combined table.
glimpse(origin_region_0320)
## Rows: 4,858
## Columns: 14
## $ Notes <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ `Census Region` <chr> "Census Region 1: Northeast", "Census ~
## $ `Census Region Code` <chr> "CENS-R1", "CENS-R1", "CENS-R1", "CENS~
## $ State <chr> "Connecticut", "Connecticut", "Connect~
## $ `State Code` <chr> "09", "09", "09", "09", "09", "09", "0~
## $ `Mother's Hispanic Origin` <chr> "Hispanic or Latino", "Hispanic or Lat~
## $ `Mother's Hispanic Origin Code` <chr> "2135-2", "2135-2", "2135-2", "2135-2"~
## $ `Age of Mother 9` <chr> "Under 15 years", "Under 15 years", "U~
## $ `Age of Mother 9 Code` <chr> "15", "15", "15", "15", "15-19", "15-1~
## $ Year <dbl> 2003, 2004, 2005, 2006, 2003, 2004, 20~
## $ `Year Code` <dbl> 2003, 2004, 2005, 2006, 2003, 2004, 20~
## $ Births <dbl> 25, 23, 17, 24, 1195, 1247, 1208, 1293~
## $ `Female Population` <chr> "Not Available", "Not Available", "Not~
## $ `Fertility Rate` <chr> "Not Available", "Not Available", "Not~
# Rename and select in one step: keep only the seven analysis columns and give
# the long export headers short names (select() renames with `new = old`).
origin_region_0320 <- origin_region_0320 %>%
  select(
    Origin = `Mother's Hispanic Origin`,
    Year,
    Region = `Census Region Code`,
    Age = `Age of Mother 9 Code`,
    Fpop = `Female Population`,
    Births,
    Rate = `Fertility Rate`
  )

# Confirm the reduced schema.
glimpse(origin_region_0320)
## Rows: 4,858
## Columns: 7
## $ Origin <chr> "Hispanic or Latino", "Hispanic or Latino", "Hispanic or Latino~
## $ Year <dbl> 2003, 2004, 2005, 2006, 2003, 2004, 2005, 2006, 2003, 2004, 200~
## $ Region <chr> "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1~
## $ Age <chr> "15", "15", "15", "15", "15-19", "15-19", "15-19", "15-19", "20~
## $ Fpop <chr> "Not Available", "Not Available", "Not Available", "Not Availab~
## $ Births <dbl> 25, 23, 17, 24, 1195, 1247, 1208, 1293, 2454, 2314, 2413, 2534,~
## $ Rate <chr> "Not Available", "Not Available", "Not Available", "Not Availab~
# Drop every row whose female population or fertility rate holds the literal
# placeholder "Not Available"; only rows with both values present are kept.
origin_region_0320 <- origin_region_0320 %>%
  filter(Fpop != "Not Available", Rate != "Not Available")

# Open the filtered table in the data viewer for a manual check.
view(origin_region_0320)
# Recode: shorten the Region and Origin labels, convert the measures to
# numeric, then drop rows that cannot be used downstream.
origin_region_0320 <- origin_region_0320 %>%
  mutate(
    # One type-stable case_when() per column replaces the chain of ifelse()
    # reassignments; any value not listed falls through unchanged.
    Region = case_when(
      Region == "CENS-R1" ~ "NE",
      Region == "CENS-R2" ~ "MW",
      Region == "CENS-R3" ~ "SO",
      Region == "CENS-R4" ~ "WE",
      TRUE ~ Region
    ),
    Origin = case_when(
      Origin == "American Indian or Alaska Native" ~ "AmInd",
      Origin == "Asian or Pacific Islander" ~ "API",
      Origin == "Black or African American" ~ "Black",
      TRUE ~ Origin
    ),
    # Both columns were read as character; values that are not numbers
    # become NA here and are removed by drop_na() below.
    Fpop = as.numeric(Fpop),
    # NOTE(review): dividing by 1000 presumably turns a per-1,000-women rate
    # into a per-woman rate -- confirm against the export's documentation.
    Rate = as.numeric(Rate) / 1000
  ) %>%
  filter(Origin != "Not Reported") %>%
  # Remove rows with an NA in any remaining column.
  drop_na()
# First plot: yearly rate for the 25-29 age group, drawn as a grid of panels
# with one row per Origin and one column per Region.
ggplot(
  filter(origin_region_0320, Age == "25-29"),
  aes(x = Year, y = Rate)
) +
  geom_point() +
  facet_grid(rows = vars(Origin), cols = vars(Region)) +
  ggtitle("TS Plot of Rate for 25-29 by Origin and Region")
# Flipped version of the 25-29 plot: Rate on the x-axis and Year on the y-axis,
# with the same Origin (rows) x Region (columns) panel layout.
# NOTE(review): this swaps the axes, not the facet grid itself -- if the intent
# was to transpose the panels, the facets would need swapping instead; confirm.
ggplot(
  filter(origin_region_0320, Age == "25-29"),
  aes(x = Rate, y = Year)
) +
  geom_point() +
  facet_grid(rows = vars(Origin), cols = vars(Region)) +
  ggtitle("TS Plot of Rate for 25-29 by Origin and Region")
# National TFR by Origin: plot the total fertility rate (TFR) for the whole
# country, by year and origin.
g1 <- origin_region_0320 %>%
  group_by(Year, Origin, Age) %>%
  # Pool births and female population over all rows that share the same
  # Year, Origin and Age. "drop_last" keeps the Year/Origin grouping that the
  # TFR step below relies on (same grouping as before, without the message).
  summarize(
    Births = sum(Births),
    Fpop = sum(Fpop),
    .groups = "drop_last"
  ) %>%
  # Age-specific rate recomputed from the pooled counts.
  mutate(Rate = Births / Fpop) %>%
  # Sum the age-specific rates within each Year/Origin.
  # NOTE(review): the factor 5 presumably reflects five-year age bands -- confirm.
  summarize(TFR = sum(Rate) * 5, .groups = "drop") %>%
  ggplot(aes(x = Year, y = TFR, color = Origin)) +
  geom_point() +
  # The title must be joined with `+`; as a separate statement it was only
  # printed to the console and never became part of g1.
  ggtitle("National TFR by Year and Race")

# Interactive version of the plot.
ggplotly(g1)
# Race and Region: plot the total fertility rate (TFR) by year in a grid of
# panels, one row per Origin and one column per Region. Rendered with plotly.
g2 <- origin_region_0320 %>%
  group_by(Year, Region, Origin, Age) %>%
  # Pool births and female population over all rows that share the same
  # Year, Region, Origin and Age. "drop_last" keeps the Year/Region/Origin
  # grouping that the TFR step below relies on (same grouping as before,
  # without the message).
  summarize(
    Births = sum(Births),
    Fpop = sum(Fpop),
    .groups = "drop_last"
  ) %>%
  # Age-specific rate recomputed from the pooled counts.
  mutate(Rate = Births / Fpop) %>%
  # Sum the age-specific rates within each Year/Region/Origin.
  # NOTE(review): the factor 5 presumably reflects five-year age bands -- confirm.
  summarize(TFR = sum(Rate) * 5, .groups = "drop") %>%
  ggplot(aes(x = Year, y = TFR, color = Origin)) +
  geom_point() +
  facet_grid(Origin ~ Region) +
  # The title must be joined with `+`; as a separate statement it was only
  # printed to the console and never became part of g2.
  ggtitle("Regional TFR by Year and Origin")

# Interactive version of the plot.
ggplotly(g2)