Read in data

mdDf <- read.csv("https://raw.githubusercontent.com/xiaoxiaogao-DD/DATA607-Project1/master/national_marriage_divorce_rates_00-16.csv", stringsAsFactors = FALSE, skip = 2, header = TRUE, colClasses = "character")

Remove blank columns

# Confirm all values are blank for last columns
stopifnot(nrow(mdDf %>% filter_if(str_detect(colnames(.), "X"), all_vars(. != ""))) == 0)

# Remove these rows
mdDf <- mdDf[, !str_detect(colnames(mdDf), "X")]

Split apart separate dataframes

marrDf <- mdDf[1:27,]
divDf <- mdDf[29:nrow(mdDf),]

Adust column names

colnames(divDf) <- divDf[1, ]

Extract footnote data into dataframe

# Save out total row numbers to confirm logic to split off footnotes was reasonable
marrRows <- nrow(marrDf)
divRows <- nrow(divDf)

marrFtDf <- marrDf %>% filter(!str_detect(Year, "^20."))
divFtDf <- divDf %>% filter(!str_detect(Year, "^20."))

marrDf <- marrDf %>% filter(str_detect(Year, "^20."))
divDf <- divDf %>% filter(str_detect(Year, "^20."))

stopifnot(marrRows == nrow(marrFtDf) + nrow(marrDf))
stopifnot(divRows == nrow(divFtDf) + nrow(divDf))

Parse footnote dataframes

marrFtDf <- marrFtDf %>%
  rename(raw = Year) %>%
  select(raw) %>%
  filter(str_trim(raw) != "") %>%
  mutate(footId = as.numeric(str_extract(raw, "^[:digit:]+")),
         footnote = str_replace(raw, "^[:digit:]+/", "")) %>%
  filter(!is.na(footId)) %>%
  select(footId, footnote)

divFtDf <- divFtDf %>%
  rename(raw = Year) %>%
  select(raw) %>%
  filter(str_trim(raw) != "") %>%
  mutate(footId = as.numeric(str_extract(raw, "^[:digit:]+")),
         footnote = str_replace(raw, "^[:digit:]+ ", "")) %>%
  filter(!is.na(footId)) %>%
  select(footId, footnote)

Parse numerics in main dataframes

# Extract footnote information from Year column
marrDf$footId <- str_extract(marrDf$Year, "/.")
marrDf$Year <- str_replace(marrDf$Year, "/.", "")
marrDf$footId <- str_replace(marrDf$footId, "/", "")

divDf$footId <- str_extract(divDf$Year, "/.")
divDf$Year <- str_replace(divDf$Year, "/.", "")
divDf$footId <- str_replace(divDf$footId, "/", "")

# remove commas
marrDf <- marrDf %>% mutate_all(funs(str_replace_all(., ",", "")))
divDf <- divDf %>% mutate_all(funs(str_replace_all(., ",", "")))

# parse all columns as numerics
marrDf <- marrDf %>% mutate_all(as.numeric)
divDf <- divDf %>% mutate_all(as.numeric)

Merge in footnotes

marrDf <- left_join(marrDf, marrFtDf, by="footId")
divDf <- left_join(divDf, divFtDf, by="footId")
rm(mdDf, marrFtDf, divFtDf)

Merge marraige and divorce dataframes

marrDf <- marrDf %>%
  rename(marriages = Marriages,
         marrPopulation = Population,
         marrRatePer1000 = Rate.per.1.000.total.population,
         marrFootnote = footnote) %>%
  select(-footId)

divDf <- divDf %>%
  rename(divPopulation = Population,
         divRatePer1000 = `Rate per 1,000 total population`,
         divFootnote = footnote,
         divAnnul = `Divorces & annulments`) %>%
  select(-footId)

finalDf <- full_join(marrDf, divDf, by = "Year") %>%
  select(Year, marriages, divAnnul, marrPopulation, divPopulation, marrRatePer1000, divRatePer1000,
         marrFootnote, divFootnote)

rm(divDf, marrDf)

Suggested Analyses

ggplot(finalDf, aes(Year)) + 
  geom_line(aes(y = marrRatePer1000, colour = "marrRatePer1000")) + 
  geom_line(aes(y = divRatePer1000, colour = "divRatePer1000"))  +
  labs(title = "Marriage/Divorce Rates", x = "Year", y = "Rate Per 1000") +
  theme(plot.title = element_text(hjust = 0.5))

From the above, it’s not easy to discern subtle trends, but overall, it does look like a decrease in marriages is at least correlated with a decrease in divorce rates.