Activity Definition:

Acquire the National Health Expenditure Summary Data, Pollution Data, and perform basic regression analysis between the National Health Expenditure Vs GDP , and National Health Expenditure Vs Pollution.

For this analysis we would take a sample pollutants of PM2.5

Data Sources:
+ Pollution Data: http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata
+ National Health Expenditure Summary: http://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/NationalHealthExpendData/Downloads/NHEGDP13.zip

Data Acquisition:

## Loading required package: xlsxjars
# *********** Load the hazardoes air pollutants: *****************************************************

haps.url <- "https://ofmext.epa.gov/AQDMRS/ws/list?name=param&pc=CORE_HAPS&resource=rawData"

haps.raw.data <- html(haps.url) %>%
  html_node("body > p") %>%
  html_text()

#Load the hazardous air pollutants into a data frame.
haps.data <- read.table(text=haps.raw.data, sep="\t",  col.names = c("Parameter.Code", "Parameter.Name"))

#-------------------------------------------------------------------------------------------------------

# *********** Load the Annual pollution data for the years: 2009 to 2014 *******************************

GetAnnualPollutionData <- function(year) {
  
  #Prepare a full url by appending the year
  full_url <- paste("http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata/annual_all_", year, ".zip", sep = '')
  
  
  #Download the file & get the data into a data frame.
  temp <- tempfile()
  download.file(full_url, temp)
  
  #Read the data into a data frame.
  data <- read.table(file = unz(temp, paste("annual_all_", year, ".csv", sep = '')), header = TRUE, sep = ",")
  
  #unlink the file handle
  unlink(temp)
  
  #putting a pause in between page reads, so this call is not treated as a denial of service attack.
  Sys.sleep(1)
  
  #return the dataframe
  return (data)
}

#Call the function 'GetAnnualPollutionData' to load the annual polllution data between 2009 to 2014
annual.pollution.data <- ldply(2009:2014, GetAnnualPollutionData)
dim(annual.pollution.data)
## [1] 463863     54
#-------------------------------------------------------------------------------------------------------

# Join haps.data with annual.pollution.data on Parameter.Code
#This will give us only the hazardous polluants data ONLY across the country.

data.merge <- merge(x = annual.pollution.data, y = haps.data, by = "Parameter.Code")
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Proxy data - Just take Pollutants of type PM2.5 - for each year

data.tidy <- data.merge %>%
  select(Year,State.Name, City.Name ,Parameter.Code,Parameter.Name.x,Latitude,Longitude, Event.Type,Arithmetic.Mean ) %>%
  filter(Event.Type  == "No Events") %>%
  arrange(desc(Year), State.Name)


# Summarise data - group by city to get the ANNUAL mean pollution in that city
datatidy.pollutant <- data.tidy %>%
  select(Year,Parameter.Name.x, Arithmetic.Mean ) %>%
  filter(grepl(" PM2.5", Parameter.Name.x)) %>%
  group_by(Year) %>%
  summarise_each(funs(mean(Arithmetic.Mean)))

datatidy.pollutant$Parameter.Name.x <- "PM2.5"

kable(datatidy.pollutant)
Year Parameter.Name.x Arithmetic.Mean
2009 PM2.5 0.0013119
2010 PM2.5 0.0014743
2011 PM2.5 0.0013911
2012 PM2.5 0.0013015
2013 PM2.5 0.0013932
2014 PM2.5 0.0015825

Now Load the National Health Expenditure Summary Data

# ********** Load the National Health Expenditure Summary Data *****************************************

# National health expenditure summary data
url.nhe <- "http://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/NationalHealthExpendData/Downloads/NHEGDP13.zip"

#Download the file & get the data into a data frame.
td <- tempdir()
tf <- tempfile(tmpdir=td, fileext=".zip") 
download.file(url.nhe, tf)


#Read the data into a data frame.
nhe.data <- read.xlsx(file = unzip(tf, "NHE13_Summary_Final.xls"), header = FALSE, startRow=2, rowIndex = c(2,3,9,10,11), colIndex = c(1:55), sheetIndex = 1 )

#unlink the file handle
unlink(tf)


#Make rows as columns and columns as rows
nhe.data.final <- t(nhe.data)

#Nullify row names - not needed.
rownames(nhe.data.final) <- NULL

#Make the row1 as the column names
colnames(nhe.data.final) <- nhe.data.final[1,]

#We do not need row1 now.
nhe.data.final = nhe.data.final[-1,]

#change the column 1 name to "Year"
colnames(nhe.data.final)[1] <- "Year"

#convert the year column to numeric
nhe.data.final[, 1] <- as.numeric(nhe.data.final[,1])

#View the final data
kable(head(nhe.data.final))
Year National Health Expenditures (Amount in Billions) U.S. Population1 (Millions) Gross Domestic Product2 (Amount in Billions) National Health Expenditures (Per Capita Amount)
1960 27.4 186.0 543.3 147.0
1961 29.2 189.0 563.3 154.0
1962 31.9 192.0 605.1 166.0
1963 34.7 195.0 638.6 178.0
1964 38.5 197.0 685.8 195.0
1965 42.0 200.0 743.7 210.0

Analysis

#Convert matrix to data frame:
nhe.data.df <-  as.data.frame(nhe.data.final)

#Provide colnames
colnames(nhe.data.df) <- c('Year', 'NHE_In_Billions', 'Population_In_Millions', 'GDP_In_Billions', 'NHE_PerCapitaAmt')

#Refactor the fields
nhe.data.df$Year <- as.numeric(as.character(nhe.data.df$Year))
nhe.data.df$NHE_In_Billions <- as.numeric(as.character(nhe.data.df$NHE_In_Billions))
nhe.data.df$Population_In_Millions <- as.numeric(as.character(nhe.data.df$Population_In_Millions))
nhe.data.df$GDP_In_Billions <- as.numeric(as.character(nhe.data.df$GDP_In_Billions))
nhe.data.df$NHE_PerCapitaAmt <- as.numeric(as.character(nhe.data.df$NHE_PerCapitaAmt))

#plot NHE
ggplot(nhe.data.df, aes(x = Year, y = NHE_In_Billions)) + geom_point() + geom_smooth(method="lm") + 
  labs(x = "Year", y = "NHE in Billions", title = "National Health Expenditure (1960-2014) ") + theme(plot.title = element_text(size=20, face="bold", vjust=2))

#Just filter the 2000 to 2014 data for NHE and GDP
nhe.gdp.data <- select(nhe.data.df, Year, NHE_In_Billions, GDP_In_Billions) %>% 
                    filter(Year >= 2000)

#Reshapre into Long format
nhe.gdp.data.long <- melt(nhe.gdp.data, id.vars = c("Year"), variable.name = "Category", value.name = "Amount")


ggplot(nhe.gdp.data.long, aes(x = Year, y = Amount, color=Category, fill=Category)) + geom_point() + geom_smooth(method="lm") + 
  labs(title = "National Health Expenditure Vs GDP") + theme(plot.title = element_text(size=20, face="bold", vjust=2))

p1 <- ggplot(nhe.gdp.data.long, aes(x = Year, y = Amount, fill=Category)) + geom_bar(stat="identity", position="dodge", colour="black") + 
  geom_line() + geom_point() + geom_smooth(method="lm") + labs( y = "Amount in Billions", title = "National Health Expenditure Vs GDP") +  theme(plot.title = element_text(size=20, face="bold", vjust=2))


#Add a new calculated columen to the data frame:
nhe.gdp.data <- mutate(nhe.gdp.data, nhe.percentage = (NHE_In_Billions / GDP_In_Billions) * 100)

p2 <- ggplot(nhe.gdp.data, aes(x = Year, y = nhe.percentage)) + geom_point() + geom_line() + geom_smooth(method="lm") + labs( y = "NHE Percentage In GDP", title = "National Health Expenditure as % of  GDP") + theme(plot.title = element_text(size=20, face="bold", vjust=2))

#install.packages("gridExtra")
library(grid)
library(gridExtra)
library(ggplot2)
grid.arrange(p1, p2, nrow=2)

NHE Vs Pollutant Data

ggplot(datatidy.pollutant, aes(x = Year, y = Arithmetic.Mean)) + geom_point() + geom_smooth(method="lm") +
  labs(x = "Year", y = "Pollution Measure", title = "Pollutant -  PM2.5 Trend") + theme(plot.title = element_text(size=20, face="bold", vjust=2))

Just take proxy data of NHE since the year 2008, and compare it with the Pollutant [PM2.5] data

nhe.data.reg <- select(nhe.data.df, Year, NHE_In_Billions, GDP_In_Billions) %>% 
  filter(Year >= 2008)

NHE <- nhe.data.reg$NHE_In_Billions
POLLUTANT <- datatidy.pollutant$Arithmetic.Mean
frame <- as.data.frame(cbind(NHE,POLLUTANT))

ggplot(frame, aes(x = POLLUTANT, y = NHE)) + geom_point() + geom_smooth(method="lm") + 
  labs(x = "Pollutant Measure", y = "NHE in Billions", title = "Pollutant (PM2.5) Vs NHE") + geom_point() + geom_line() + theme(plot.title = element_text(size=20, face="bold", vjust=2))

fit <- lm(NHE ~ POLLUTANT, data = frame)
summary(fit)
## 
## Call:
## lm(formula = NHE ~ POLLUTANT, data = frame)
## 
## Residuals:
##       1       2       3       4       5       6 
## -155.78 -216.26  -40.02  145.19  171.22   95.65 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)     1340       1086   1.234    0.285
## POLLUTANT     937226     768828   1.219    0.290
## 
## Residual standard error: 181.8 on 4 degrees of freedom
## Multiple R-squared:  0.2709, Adjusted R-squared:  0.0886 
## F-statistic: 1.486 on 1 and 4 DF,  p-value: 0.2898
cor(frame)
##                 NHE POLLUTANT
## NHE       1.0000000 0.5204579
## POLLUTANT 0.5204579 1.0000000

Multiple Linear Regression

nhe.data.full.reg <- select(nhe.data.df, Year, NHE_In_Billions, GDP_In_Billions,  NHE_PerCapitaAmt, Population_In_Millions) %>%   filter(Year >= 2008)
nhe.data.full.reg$Pollution <- POLLUTANT
nhe.data.full.reg <- nhe.data.full.reg[, -1]

cor(nhe.data.full.reg)
##                        NHE_In_Billions GDP_In_Billions NHE_PerCapitaAmt
## NHE_In_Billions              1.0000000       0.9567180        0.9999562
## GDP_In_Billions              0.9567180       1.0000000        0.9586231
## NHE_PerCapitaAmt             0.9999562       0.9586231        1.0000000
## Population_In_Millions       0.9946288       0.9216993        0.9938427
## Pollution                    0.5204579       0.4657069        0.5172072
##                        Population_In_Millions Pollution
## NHE_In_Billions                     0.9946288 0.5204579
## GDP_In_Billions                     0.9216993 0.4657069
## NHE_PerCapitaAmt                    0.9938427 0.5172072
## Population_In_Millions              1.0000000 0.5254859
## Pollution                           0.5254859 1.0000000
fit <- lm(NHE_In_Billions ~  Population_In_Millions + GDP_In_Billions + NHE_PerCapitaAmt + Pollution, data = nhe.data.full.reg)

scatterplotMatrix(nhe.data.full.reg)

Which Variables are most important in predicting the outcome ?

#Relative Weights Method - Approximates the average increase in R-square obtained by adding a predictor variable #across all possible submodels_

#relweights method from 'R in Action':adapted from an SPSS program generously provided by Dr. Johnson. See Johnson (2000, Multivariate Behavioral Research, 35, 1-19) for an explanation of how the relative weights are derived.
relweights <- function(fit,...){
R <- cor(fit$model)
nvar <- ncol(R)
rxx <- R[2:nvar, 2:nvar]
rxy <- R[2:nvar, 1]
svd <- eigen(rxx)
evec <- svd$vectors
ev <- svd$values
delta <- diag(sqrt(ev))
lambda <- evec %*% delta %*% t(evec)
lambdasq <- lambda ^ 2
beta <- solve(lambda) %*% rxy
rsquare <- colSums(beta ^ 2)
rawwgt <- lambdasq %*% beta ^ 2
import <- (rawwgt / rsquare) * 100
import <- as.data.frame(import)
row.names(import) <- names(fit$model[2:nvar])
names(import) <- "Weights"
import <- import[order(import),1, drop=FALSE]
dotchart(import$Weights, labels=row.names(import),
xlab="% of R-Square", pch=19,
main="Relative Importance of Predictor Variables",
sub=paste("Total R-Square=", round(rsquare, digits=3)),
...)
return(import)
}

relweights(fit, col= "blue")

##                          Weights
## Pollution               7.407716
## GDP_In_Billions        29.235650
## NHE_PerCapitaAmt       31.329570
## Population_In_Millions 32.027065

Conclusion

Based on the trend line from this sample data, there appears to be a strong positive linear relationship between the NHE Vs the Population, Percapita,GDP variables. However from this limited sample pollutant data, there appears to be a weak positive linear relationship between the pollutant and NHE variables. Also, this relationship may be coincidental.Need more analyses with combination of HAPS (pollutants) to gain insights.