Acquire the National Health Expenditure Summary Data, Pollution Data, and perform basic regression analysis between the National Health Expenditure Vs GDP , and National Health Expenditure Vs Pollution.
For this analysis we would take a sample pollutants of PM2.5
Data Sources:
+ Pollution Data: http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata
+ National Health Expenditure Summary: http://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/NationalHealthExpendData/Downloads/NHEGDP13.zip
## Loading required package: xlsxjars
# *********** Load the hazardoes air pollutants: *****************************************************
haps.url <- "https://ofmext.epa.gov/AQDMRS/ws/list?name=param&pc=CORE_HAPS&resource=rawData"
haps.raw.data <- html(haps.url) %>%
html_node("body > p") %>%
html_text()
#Load the hazardous air pollutants into a data frame.
haps.data <- read.table(text=haps.raw.data, sep="\t", col.names = c("Parameter.Code", "Parameter.Name"))
#-------------------------------------------------------------------------------------------------------
# *********** Load the Annual pollution data for the years: 2009 to 2014 *******************************
GetAnnualPollutionData <- function(year) {
#Prepare a full url by appending the year
full_url <- paste("http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata/annual_all_", year, ".zip", sep = '')
#Download the file & get the data into a data frame.
temp <- tempfile()
download.file(full_url, temp)
#Read the data into a data frame.
data <- read.table(file = unz(temp, paste("annual_all_", year, ".csv", sep = '')), header = TRUE, sep = ",")
#unlink the file handle
unlink(temp)
#putting a pause in between page reads, so this call is not treated as a denial of service attack.
Sys.sleep(1)
#return the dataframe
return (data)
}
#Call the function 'GetAnnualPollutionData' to load the annual polllution data between 2009 to 2014
annual.pollution.data <- ldply(2009:2014, GetAnnualPollutionData)
dim(annual.pollution.data)
## [1] 463863 54
#-------------------------------------------------------------------------------------------------------
# Join haps.data with annual.pollution.data on Parameter.Code
#This will give us only the hazardous polluants data ONLY across the country.
data.merge <- merge(x = annual.pollution.data, y = haps.data, by = "Parameter.Code")
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Proxy data - Just take Pollutants of type PM2.5 - for each year
data.tidy <- data.merge %>%
select(Year,State.Name, City.Name ,Parameter.Code,Parameter.Name.x,Latitude,Longitude, Event.Type,Arithmetic.Mean ) %>%
filter(Event.Type == "No Events") %>%
arrange(desc(Year), State.Name)
# Summarise data - group by city to get the ANNUAL mean pollution in that city
datatidy.pollutant <- data.tidy %>%
select(Year,Parameter.Name.x, Arithmetic.Mean ) %>%
filter(grepl(" PM2.5", Parameter.Name.x)) %>%
group_by(Year) %>%
summarise_each(funs(mean(Arithmetic.Mean)))
datatidy.pollutant$Parameter.Name.x <- "PM2.5"
kable(datatidy.pollutant)
| Year | Parameter.Name.x | Arithmetic.Mean |
|---|---|---|
| 2009 | PM2.5 | 0.0013119 |
| 2010 | PM2.5 | 0.0014743 |
| 2011 | PM2.5 | 0.0013911 |
| 2012 | PM2.5 | 0.0013015 |
| 2013 | PM2.5 | 0.0013932 |
| 2014 | PM2.5 | 0.0015825 |
Now Load the National Health Expenditure Summary Data
# ********** Load the National Health Expenditure Summary Data *****************************************
# National health expenditure summary data
url.nhe <- "http://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/NationalHealthExpendData/Downloads/NHEGDP13.zip"
#Download the file & get the data into a data frame.
td <- tempdir()
tf <- tempfile(tmpdir=td, fileext=".zip")
download.file(url.nhe, tf)
#Read the data into a data frame.
nhe.data <- read.xlsx(file = unzip(tf, "NHE13_Summary_Final.xls"), header = FALSE, startRow=2, rowIndex = c(2,3,9,10,11), colIndex = c(1:55), sheetIndex = 1 )
#unlink the file handle
unlink(tf)
#Make rows as columns and columns as rows
nhe.data.final <- t(nhe.data)
#Nullify row names - not needed.
rownames(nhe.data.final) <- NULL
#Make the row1 as the column names
colnames(nhe.data.final) <- nhe.data.final[1,]
#We do not need row1 now.
nhe.data.final = nhe.data.final[-1,]
#change the column 1 name to "Year"
colnames(nhe.data.final)[1] <- "Year"
#convert the year column to numeric
nhe.data.final[, 1] <- as.numeric(nhe.data.final[,1])
#View the final data
kable(head(nhe.data.final))
| Year | National Health Expenditures (Amount in Billions) | U.S. Population1 (Millions) | Gross Domestic Product2 (Amount in Billions) | National Health Expenditures (Per Capita Amount) |
|---|---|---|---|---|
| 1960 | 27.4 | 186.0 | 543.3 | 147.0 |
| 1961 | 29.2 | 189.0 | 563.3 | 154.0 |
| 1962 | 31.9 | 192.0 | 605.1 | 166.0 |
| 1963 | 34.7 | 195.0 | 638.6 | 178.0 |
| 1964 | 38.5 | 197.0 | 685.8 | 195.0 |
| 1965 | 42.0 | 200.0 | 743.7 | 210.0 |
#Convert matrix to data frame:
nhe.data.df <- as.data.frame(nhe.data.final)
#Provide colnames
colnames(nhe.data.df) <- c('Year', 'NHE_In_Billions', 'Population_In_Millions', 'GDP_In_Billions', 'NHE_PerCapitaAmt')
#Refactor the fields
nhe.data.df$Year <- as.numeric(as.character(nhe.data.df$Year))
nhe.data.df$NHE_In_Billions <- as.numeric(as.character(nhe.data.df$NHE_In_Billions))
nhe.data.df$Population_In_Millions <- as.numeric(as.character(nhe.data.df$Population_In_Millions))
nhe.data.df$GDP_In_Billions <- as.numeric(as.character(nhe.data.df$GDP_In_Billions))
nhe.data.df$NHE_PerCapitaAmt <- as.numeric(as.character(nhe.data.df$NHE_PerCapitaAmt))
#plot NHE
ggplot(nhe.data.df, aes(x = Year, y = NHE_In_Billions)) + geom_point() + geom_smooth(method="lm") +
labs(x = "Year", y = "NHE in Billions", title = "National Health Expenditure (1960-2014) ") + theme(plot.title = element_text(size=20, face="bold", vjust=2))
#Just filter the 2000 to 2014 data for NHE and GDP
nhe.gdp.data <- select(nhe.data.df, Year, NHE_In_Billions, GDP_In_Billions) %>%
filter(Year >= 2000)
#Reshapre into Long format
nhe.gdp.data.long <- melt(nhe.gdp.data, id.vars = c("Year"), variable.name = "Category", value.name = "Amount")
ggplot(nhe.gdp.data.long, aes(x = Year, y = Amount, color=Category, fill=Category)) + geom_point() + geom_smooth(method="lm") +
labs(title = "National Health Expenditure Vs GDP") + theme(plot.title = element_text(size=20, face="bold", vjust=2))
p1 <- ggplot(nhe.gdp.data.long, aes(x = Year, y = Amount, fill=Category)) + geom_bar(stat="identity", position="dodge", colour="black") +
geom_line() + geom_point() + geom_smooth(method="lm") + labs( y = "Amount in Billions", title = "National Health Expenditure Vs GDP") + theme(plot.title = element_text(size=20, face="bold", vjust=2))
#Add a new calculated columen to the data frame:
nhe.gdp.data <- mutate(nhe.gdp.data, nhe.percentage = (NHE_In_Billions / GDP_In_Billions) * 100)
p2 <- ggplot(nhe.gdp.data, aes(x = Year, y = nhe.percentage)) + geom_point() + geom_line() + geom_smooth(method="lm") + labs( y = "NHE Percentage In GDP", title = "National Health Expenditure as % of GDP") + theme(plot.title = element_text(size=20, face="bold", vjust=2))
#install.packages("gridExtra")
library(grid)
library(gridExtra)
library(ggplot2)
grid.arrange(p1, p2, nrow=2)
NHE Vs Pollutant Data
ggplot(datatidy.pollutant, aes(x = Year, y = Arithmetic.Mean)) + geom_point() + geom_smooth(method="lm") +
labs(x = "Year", y = "Pollution Measure", title = "Pollutant - PM2.5 Trend") + theme(plot.title = element_text(size=20, face="bold", vjust=2))
Just take proxy data of NHE since the year 2008, and compare it with the Pollutant [PM2.5] data
nhe.data.reg <- select(nhe.data.df, Year, NHE_In_Billions, GDP_In_Billions) %>%
filter(Year >= 2008)
NHE <- nhe.data.reg$NHE_In_Billions
POLLUTANT <- datatidy.pollutant$Arithmetic.Mean
frame <- as.data.frame(cbind(NHE,POLLUTANT))
ggplot(frame, aes(x = POLLUTANT, y = NHE)) + geom_point() + geom_smooth(method="lm") +
labs(x = "Pollutant Measure", y = "NHE in Billions", title = "Pollutant (PM2.5) Vs NHE") + geom_point() + geom_line() + theme(plot.title = element_text(size=20, face="bold", vjust=2))
fit <- lm(NHE ~ POLLUTANT, data = frame)
summary(fit)
##
## Call:
## lm(formula = NHE ~ POLLUTANT, data = frame)
##
## Residuals:
## 1 2 3 4 5 6
## -155.78 -216.26 -40.02 145.19 171.22 95.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1340 1086 1.234 0.285
## POLLUTANT 937226 768828 1.219 0.290
##
## Residual standard error: 181.8 on 4 degrees of freedom
## Multiple R-squared: 0.2709, Adjusted R-squared: 0.0886
## F-statistic: 1.486 on 1 and 4 DF, p-value: 0.2898
cor(frame)
## NHE POLLUTANT
## NHE 1.0000000 0.5204579
## POLLUTANT 0.5204579 1.0000000
Multiple Linear Regression
nhe.data.full.reg <- select(nhe.data.df, Year, NHE_In_Billions, GDP_In_Billions, NHE_PerCapitaAmt, Population_In_Millions) %>% filter(Year >= 2008)
nhe.data.full.reg$Pollution <- POLLUTANT
nhe.data.full.reg <- nhe.data.full.reg[, -1]
cor(nhe.data.full.reg)
## NHE_In_Billions GDP_In_Billions NHE_PerCapitaAmt
## NHE_In_Billions 1.0000000 0.9567180 0.9999562
## GDP_In_Billions 0.9567180 1.0000000 0.9586231
## NHE_PerCapitaAmt 0.9999562 0.9586231 1.0000000
## Population_In_Millions 0.9946288 0.9216993 0.9938427
## Pollution 0.5204579 0.4657069 0.5172072
## Population_In_Millions Pollution
## NHE_In_Billions 0.9946288 0.5204579
## GDP_In_Billions 0.9216993 0.4657069
## NHE_PerCapitaAmt 0.9938427 0.5172072
## Population_In_Millions 1.0000000 0.5254859
## Pollution 0.5254859 1.0000000
fit <- lm(NHE_In_Billions ~ Population_In_Millions + GDP_In_Billions + NHE_PerCapitaAmt + Pollution, data = nhe.data.full.reg)
scatterplotMatrix(nhe.data.full.reg)
#Relative Weights Method - Approximates the average increase in R-square obtained by adding a predictor variable #across all possible submodels_
#relweights method from 'R in Action':adapted from an SPSS program generously provided by Dr. Johnson. See Johnson (2000, Multivariate Behavioral Research, 35, 1-19) for an explanation of how the relative weights are derived.
relweights <- function(fit,...){
R <- cor(fit$model)
nvar <- ncol(R)
rxx <- R[2:nvar, 2:nvar]
rxy <- R[2:nvar, 1]
svd <- eigen(rxx)
evec <- svd$vectors
ev <- svd$values
delta <- diag(sqrt(ev))
lambda <- evec %*% delta %*% t(evec)
lambdasq <- lambda ^ 2
beta <- solve(lambda) %*% rxy
rsquare <- colSums(beta ^ 2)
rawwgt <- lambdasq %*% beta ^ 2
import <- (rawwgt / rsquare) * 100
import <- as.data.frame(import)
row.names(import) <- names(fit$model[2:nvar])
names(import) <- "Weights"
import <- import[order(import),1, drop=FALSE]
dotchart(import$Weights, labels=row.names(import),
xlab="% of R-Square", pch=19,
main="Relative Importance of Predictor Variables",
sub=paste("Total R-Square=", round(rsquare, digits=3)),
...)
return(import)
}
relweights(fit, col= "blue")
## Weights
## Pollution 7.407716
## GDP_In_Billions 29.235650
## NHE_PerCapitaAmt 31.329570
## Population_In_Millions 32.027065
Based on the trend line from this sample data, there appears to be a strong positive linear relationship between the NHE Vs the Population, Percapita,GDP variables. However from this limited sample pollutant data, there appears to be a weak positive linear relationship between the pollutant and NHE variables. Also, this relationship may be coincidental.Need more analyses with combination of HAPS (pollutants) to gain insights.