Acquire the Pollutants data, prepare a proxy set of data , load into Neo4J , and analyse.
Data Sources:
+ Pollution Data: http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata
+ HAPS Data: https://ofmext.epa.gov/AQDMRS/ws/list?name=param&pc=CORE_HAPS&resource=rawData
## Loading required package: xlsxjars
# *********** Load the hazardoes air pollutants: *****************************************************
haps.url <- "https://ofmext.epa.gov/AQDMRS/ws/list?name=param&pc=CORE_HAPS&resource=rawData"
haps.raw.data <- html(haps.url) %>%
html_node("body > p") %>%
html_text()
#Load the hazardous air pollutants into a data frame.
haps.data <- read.table(text=haps.raw.data, sep="\t", col.names = c("Parameter.Code", "Parameter.Name"))
dim(haps.data)
## [1] 42 2
#-------------------------------------------------------------------------------------------------------
# *********** Load the Annual pollution data for the years: 2009 to 2014 *******************************
GetAnnualPollutionData <- function(year) {
#Prepare a full url by appending the year
full_url <- paste("http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata/annual_all_", year, ".zip", sep = '')
#Download the file & get the data into a data frame.
temp <- tempfile()
download.file(full_url, temp)
#Read the data into a data frame.
data <- read.table(file = unz(temp, paste("annual_all_", year, ".csv", sep = '')), header = TRUE, sep = ",")
#unlink the file handle
unlink(temp)
#putting a pause in between page reads, so this call is not treated as a denial of service attack.
Sys.sleep(1)
#return the dataframe
return (data)
}
#Call the function 'GetAnnualPollutionData' to load the annual polllution data between 2009 to 2014
annual.pollution.data <- ldply(2009:2014, GetAnnualPollutionData)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Join haps.data with annual.pollution.data on Parameter.Code
#This will give us only the hazardous polluants data ONLY across the country.
data.merge <- merge(x = annual.pollution.data, y = haps.data, by = "Parameter.Code")
data.tidy <- data.merge %>%
select(Year,State.Name, City.Name ,Parameter.Code,Parameter.Name.x, Event.Type,Arithmetic.Mean ) %>%
filter(Event.Type == "No Events") %>%
arrange(desc(Year), State.Name)
# Summarise data - group by city to get the ANNUAL mean pollution in that city
datatidy.pollutant <- data.tidy %>%
select(Year,State.Name, City.Name ,Parameter.Code,Parameter.Name.x, Arithmetic.Mean ) %>%
group_by(Year,State.Name, City.Name ,Parameter.Code,Parameter.Name.x) %>%
summarise_each(funs(mean(Arithmetic.Mean)))
dim(datatidy.pollutant)
## [1] 28623 6
kable(head(datatidy.pollutant))
| Year | State.Name | City.Name | Parameter.Code | Parameter.Name.x | Arithmetic.Mean |
|---|---|---|---|---|---|
| 2009 | Alabama | Birmingham | 43218 | 1,3-Butadiene | 0.2664815 |
| 2009 | Alabama | Birmingham | 43505 | Acrolein - Unverified | 1.7958028 |
| 2009 | Alabama | Birmingham | 43802 | Dichloromethane | 0.1184178 |
| 2009 | Alabama | Birmingham | 43803 | Chloroform | 0.0240792 |
| 2009 | Alabama | Birmingham | 43804 | Carbon tetrachloride | 0.1103095 |
| 2009 | Alabama | Birmingham | 43815 | Ethylene dichloride | 0.0008500 |
Load pollution data for ‘Texas’, for polluants [Chloroform, Benzene,Lead PM2.5 LC,Arsenic PM2.5 LC]
#Construct a Location data frame
#Node:
# Location -- city name, lang, lat, state
# Pollutant -- code,name
#Relationship: [ city name, state <--> code]
# observation -- year, measurement (arithmetic mean)
#Gather unique cities
city.df <- annual.pollution.data %>%
filter(City.Name != ''& State.Name == 'Texas') %>%
group_by(State.Name, City.Name) %>%
summarise() %>%
select(State.Name, City.Name)
#Gather unique pollutants
pollutant.df <- datatidy.pollutant %>%
filter(City.Name != '' & grepl('Chloroform|Benzene|Lead PM2.5 LC|Arsenic PM2.5 LC', Parameter.Name.x) & State.Name == 'Texas') %>%
group_by(Parameter.Code, Parameter.Name.x) %>%
summarise() %>%
select(Parameter.Code, Parameter.Name.x)
#Gather the relationship [observation] between the city and pollutant - year, measurement (arithmetic mean)
observation.df <- datatidy.pollutant %>%
filter(City.Name != '' & grepl('Chloroform|Benzene|Lead PM2.5 LC|Arsenic PM2.5 LC', Parameter.Name.x) & State.Name == 'Texas') %>%
select(Year, State.Name, City.Name, Parameter.Code, Parameter.Name.x, Arithmetic.Mean)
View the Sample Data
kable(head(city.df))
| State.Name | City.Name |
|---|---|
| Texas | Amarillo |
| Texas | Arlington |
| Texas | Austin |
| Texas | Baytown |
| Texas | Beaumont |
| Texas | Brownsville |
kable(head(pollutant.df))
| Parameter.Code | Parameter.Name.x |
|---|---|
| 43803 | Chloroform |
| 45201 | Benzene |
| 88103 | Arsenic PM2.5 LC |
| 88128 | Lead PM2.5 LC |
kable(head(observation.df))
| Year | State.Name | City.Name | Parameter.Code | Parameter.Name.x | Arithmetic.Mean |
|---|---|---|---|---|---|
| 2009 | Texas | Austin | 43803 | Chloroform | 0.019298 |
| 2009 | Texas | Austin | 45201 | Benzene | 1.988421 |
| 2009 | Texas | Baytown | 43803 | Chloroform | 0.035167 |
| 2009 | Texas | Baytown | 45201 | Benzene | 3.564000 |
| 2009 | Texas | Beaumont | 43803 | Chloroform | 0.030000 |
| 2009 | Texas | Beaumont | 45201 | Benzene | 2.311126 |
Preare csv files from the above data frames
write.table(city.df, "city-data.csv", quote=FALSE, row.names=FALSE, col.names=c('state','city'), sep=",")
write.table(pollutant.df, "pollutant-data.csv", quote=FALSE, row.names=FALSE, col.names=c('code','name'), sep=",")
write.table(observation.df, "observation-data.csv", quote=FALSE, row.names=FALSE, col.names=c('year','state', 'city', 'code', 'name', 'measurement'), sep=",")