Part 1 - Introduction

One of the many wonderful aspects of the United States of America is its diversity of people and culture. Historically, outside of major port and immigration centers, cultures have tended to cluster geographically. This manifests not only in genetic differences, but also in differences in diet and activity. This raises a question: is there a relationship between geographic location, as measured by region, division, or state, and the death rate (per 100,000) due to disease, possibly changing over time? In this research project we focus on heart disease.

knitr::opts_chunk$set(echo = TRUE, collapse = TRUE, message = FALSE)

# --- SET RANDOM SEED FOR REPRODUCIBILITY --- 
random_seed <- 15
set.seed(random_seed)

#  --- LOAD LIBRARIES ---
library(recipes)
library(caret)
library(rpart.plot)
library(jsonlite)
library(MASS)
library(tidyverse)
library(ggthemes)
library(glmnet)
library(data.table)
library(rsample)
library(corrr)

# Will be used for US Maps
library(maps)
us_states <- map_data("state")

# Will be used to build our Deep Learning model
library(tensorflow)
library(keras)

# Set random seed for TensorFlow
if(Sys.info()['sysname'] == 'Windows') {
  # TensorFlow bug with setting random seed
  tensorflow::tf$random$set_seed(random_seed)
} else {
  use_session_with_seed(random_seed, 
                        disable_gpu = TRUE, 
                        disable_parallel_cpu = TRUE, 
                        quiet = FALSE)
}

#' Attempt to load cached data, falling back to remote URL loading
#'
#' To avoid re-downloading data from the internet every time, this
#' function will cache URL data the first time it is fetched; all future
#' calls will use the locally cached file rather than re-downloading
#' from the internet.
#' @param fn The local cache filename (character)
#' @param url The remote URL to load the data (character)
#' @param type The type of data (affects which loading function is used). Valid: csv, json (character)
#' @examples
#' myData <- load_cache_file('./data/myfile.csv', 'http://someurl.com/somefile.csv', 'csv')
#' @return The desired data as a data.frame
#' @export
load_cache_file <- function(fn, url, type) {
  # TODO - Add error checking of parameters (fn, url and type)

  if (!(file.exists(fn))) {
    # No local cache yet: download from the remote URL
    if (type == 'csv') {
      data <- fread(url)
    } else if (type == 'json') {
      data <- fromJSON(url, flatten = TRUE)
    }

    # Cache locally as CSV regardless of source type so later runs read the local copy
    write.csv(data, file = fn, row.names = FALSE)
  } else {
    # Cached copy exists: read it instead of re-downloading
    data <- fread(fn)
  }
  
  return(data)
}


#' Print a side-by-side Histogram and QQPlot of Residuals
#'
#' @param model A model
#' @examples
#' residPlot(myModel)
#' @return null
#' @export
residPlot <- function(model) {
  par(mfrow = c(1, 2))
  # Histogram of residuals with kernel density (blue) and fitted normal (red, dotted) overlays
  hist(model[["residuals"]], freq = FALSE, breaks = "fd",
       main = "Residual Histogram", xlab = "Residuals", col = "lightgreen")
  lines(density(model[["residuals"]], kernel = "ep"), col = "blue", lwd = 3)
  curve(dnorm(x, mean = mean(model[["residuals"]]), sd = sd(model[["residuals"]])),
        col = "red", lwd = 3, lty = "dotted", add = TRUE)
  # Normal Q-Q plot of the residuals
  qqnorm(model[["residuals"]], main = "Residual Q-Q plot")
  qqline(model[["residuals"]], col = "red", lwd = 3, lty = "dotted")
  par(mfrow = c(1, 1))
}


#' Calculate the R-Squared given observed and predicted data
#'
#' @param y     Observed values in an object that supports vector operations
#' @param y_hat Predicted values in an object that supports vector operations
#' @examples
#' r_squared(test_labels, predicted_labels)
#' @return Float, at most 1.0 (can be negative when predictions fit worse than the mean)
#' @export
r_squared <- function(y, y_hat) {
  return(1 - sum((y - y_hat)^2) / sum((y - mean(y))^2))
}
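
As a quick sanity check on toy vectors (not project data): perfect predictions should return 1 and predicting the mean should return 0.

# Toy check of r_squared() with illustrative vectors
r_squared(c(1, 2, 3), c(1, 2, 3))   # perfect fit -> 1
r_squared(c(1, 2, 3), rep(2, 3))    # predicting the mean -> 0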

Part 2 - Data

Load raw data


# --- Load LCD Data ---
# Note: we keep an original copy of the data in case we need to reference pre-modified data
LCD_fn <- './data/lcd.csv'
LCD_url <- 'https://data.cdc.gov/resource/bi63-dtpu.json?$limit=50000'
LCD_orig <- load_cache_file(LCD_fn, LCD_url, 'json')

LCD <- LCD_orig

# --- Load 2018 All data ---
CD1_fn <- './data/nst-est2018-alldata.csv'
CD1_url <- "https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/national/totals/nst-est2018-alldata.csv"
CD1_orig <- load_cache_file(CD1_fn, CD1_url, 'csv')

CD1 <- CD1_orig

# --- Load 2010 State Age/Sex Data ---
CD2_fn <- './data/st-est00int-agesex.csv'
CD2_url <- "https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/state/st-est00int-agesex.csv"
CD2_orig <- load_cache_file(CD2_fn, CD2_url, 'csv')

CD2 <- CD2_orig

Prepare Data Frames

# --- CLEAN LCD DATA ---
# fromJSON doesn't allow defining column classes on import
names(LCD) <- c("Year", "113 Cause Name", "Cause Name" ,"State" ,"Deaths" ,
                "Age-adjusted Death Rate")

LCD$Year <- as.integer(LCD$Year)
LCD$Deaths <- as.integer(LCD$Deaths)
LCD$`Age-adjusted Death Rate` <- as.double(LCD$`Age-adjusted Death Rate`)
setDT(LCD)

# --- EXTRACT CD1 DATA ---
C1Extract <- CD1[, c("NAME", paste0("POPESTIMATE", 2010:2017))]
setkey(C1Extract, "NAME")

# --- EXTRACT CD2 DATA ---
C2Extract <- CD2[SEX == 0L & AGE == 999L,
                 c("NAME",  paste0("POPESTIMATE", 2000:2009))]
setkey(C2Extract, "NAME")

# --- CREATE POPULATION DATAFRAME ---
Pop <- C2Extract[C1Extract, on = "NAME"]
setnames(Pop, names(Pop), c("State", 2000:2017))

Population <- melt(Pop, id.vars = "State", variable.name = "Year",
                   value.name = "Population", variable.factor = FALSE)
Population$Year <- as.integer(Population$Year)

# --- PREPARE HEART DISEASE DATAFRAME ---
DT <- Population[LCD[Year > 1999L], on = c("State", "Year")]
SD <- data.table(State = state.name,
                 Region = state.region,
                 Division = state.division)
DT <- SD[DT, on = "State"]
setkey(DT, Year, State)

DT[, `DeathRate` := Deaths / Population * 1e5]
DTHD <- DT[`Cause Name` == "Heart disease" &
              !(State %chin% c("United States",
                               "District of Columbia",
                               "Puerto Rico"))]

head(DTHD)

Source

The investigation will be based on publicly available data on the leading causes of death from the National Center for Health Statistics (NCHS). Population data will be based on statewide census estimates from the US Census Bureau for both 2010–2018 and 2000–2009.

ETL

The dataset from the CDC contains 10,868 observations covering 52 locations (the 50 states, the District of Columbia, and the United States as a whole) across 11 causes of death (the top 10 plus an all-causes total) over the years 1999–2017. In order to perform aggregations, the data will be joined with US statewide population estimates for those years. Focusing on heart disease leaves 50 location observations per year (the United States total is simply the sum of the rest, and the District of Columbia will be ignored) across the 18 years from 2000 to 2017.
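
These counts can be verified directly from the loaded table (a quick check against the LCD data.table prepared above):

# Sanity check of the raw NCHS extract: rows, locations, causes, and years
LCD[, .(Observations = .N,
        Locations    = uniqueN(State),
        Causes       = uniqueN(`Cause Name`),
        Years        = uniqueN(Year))]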

Because the age-adjusted death rate depends on unseen age-cohort populations, there is no accurate way to calculate division or regional aggregate rates without that information. Therefore the raw rate per 100,000 people, called DeathRate, will be used as the dependent variable.
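
For a concrete sense of the scale (illustrative figures only, not taken from the data):

# Hypothetical example: 20,000 heart-disease deaths in a state of 8,000,000 people
20000 / 8000000 * 1e5   # DeathRate = 250 deaths per 100,000 population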

The NCHS data will be downloaded via their API as a JSON file and converted to a data frame. The census data will be downloaded directly as CSV files and also converted to data frames. The population data, especially that for 2000–2010, requires tidying. Both data sets need to have the population estimate fields extracted from the larger collection of fields.
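
For a quick look at what the extracted wide-format estimates hold (using the C1Extract and C2Extract tables built above):

# Wide format: one row per location, one POPESTIMATE column per year
head(C1Extract, 3)
head(C2Extract, 3)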

The earlier data needs more work: it is additionally split by age group and sex, but includes totals for “all” as well. Knowing that both sexes combined is coded as 0 and all ages as 999, only rows with those codes are extracted, and the SEX and AGE columns themselves are then dropped.
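
Assuming that coding holds throughout the file, the filter should yield exactly one total-population row per location, which is easy to confirm:

# Number of "both sexes, all ages" rows should equal the number of distinct locations
CD2[SEX == 0L & AGE == 999L, .N]
uniqueN(CD2$NAME)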

Once the population fields are extracted, the two census tables are joined to each other using the “NAME” field (the geographic location) as the key. At this point the data is in “wide” format and needs to be converted to “long” format, especially as geographic divisions and regions need to be added. The data is therefore melted into long format, joined to the cause-of-death data, and then to a table of US regions and divisions. This becomes the master disease table. The heart-disease-specific rows are extracted into their own table for ease of analysis.
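
A spot check that the melt produced one row per State and Year (Alabama chosen arbitrarily):

# Long format: one Population value per State x Year combination
Population[State == "Alabama"][order(Year)][1:3]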

Part 3 - Exploratory data analysis

Summary Statistics

Summary statistics for the heart-disease data by region and division are below:

options(digits = 4L)

DTHD[, .(Mean = mean(DeathRate),
         SD = sd(DeathRate),
         Min = min(DeathRate),
         FirstQ = quantile(DeathRate, 0.25),
         Median = median(DeathRate),
         ThirdQ = quantile(DeathRate, 0.75),
         Max = max(DeathRate),
         IQR = IQR(DeathRate)), by = Region]

DTHD[, .(Mean = mean(DeathRate),
         SD = sd(DeathRate),
         Min = min(DeathRate),
         FirstQ = quantile(DeathRate, 0.25),
         Median = median(DeathRate),
         ThirdQ = quantile(DeathRate, 0.75),
         Max = max(DeathRate),
         IQR = IQR(DeathRate)), by = Division]

State View

Death rates are first displayed on a US map, shaded by state. A graphical view of the overall kernel-smoothed density and of the average rate over time is also shown below: both charts are faceted by state and colored by division, and in the trend chart the line type indicates the region.


# We need to merge/join state data to build ggplot maps. The death rate is
# averaged over 2000-2017 first; joining the full State x Year panel would
# duplicate every polygon vertex once per year and make the fill ambiguous.
us_states$region <- str_to_title(us_states$region)
DTHD_states <- left_join(us_states,
                         DTHD[, .(DeathRate = mean(DeathRate)), by = State],
                         by = c("region" = "State"))

# Display Deathrate on US Map
ggplot(data = DTHD_states,
       mapping = aes(x = long, y = lat, group = group, fill = DeathRate)) +
  geom_polygon(color = "gray30", size = 0.1) +
  coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
  scale_fill_gradient(low = "white", high = "#CB454A") +
  labs(title = "Death Rates") + 
  theme_map() + 
  labs(fill = "Death rate per 100,000 population",
       title = "Heart Related Deaths by State, 2000-2017") +
  theme(plot.title = element_text(size = 20, face = "bold", hjust = 0.5))
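
A minimal sketch of the faceted views described at the top of this subsection, assuming the DTHD table from Part 2; the aesthetics here are illustrative rather than the report's final formatting:

# Kernel-smoothed density of death rates, faceted by state and colored by division
ggplot(DTHD, aes(x = DeathRate, fill = Division)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~ State) +
  labs(x = "Death rate per 100,000 population", y = "Density")

# Average rate over time: color marks the division, line type marks the region
ggplot(DTHD, aes(x = Year, y = DeathRate, color = Division, linetype = Region)) +
  geom_line() +
  facet_wrap(~ State) +
  labs(y = "Death rate per 100,000 population")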