Activity Definition:

Acquire the Pollutants data, prepare a proxy set of data , load into Neo4J , and analyse.

Data Sources:
+ Pollution Data: http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata
+ HAPS Data: https://ofmext.epa.gov/AQDMRS/ws/list?name=param&pc=CORE_HAPS&resource=rawData

Data Acquisition:

## Loading required package: xlsxjars
# *********** Load the hazardoes air pollutants: *****************************************************

haps.url <- "https://ofmext.epa.gov/AQDMRS/ws/list?name=param&pc=CORE_HAPS&resource=rawData"

haps.raw.data <- html(haps.url) %>%
  html_node("body > p") %>%
  html_text()

#Load the hazardous air pollutants into a data frame.
haps.data <- read.table(text=haps.raw.data, sep="\t",  col.names = c("Parameter.Code", "Parameter.Name"))

dim(haps.data)
## [1] 42  2
#-------------------------------------------------------------------------------------------------------

# *********** Load the Annual pollution data for the years: 2009 to 2014 *******************************

GetAnnualPollutionData <- function(year) {
  
  #Prepare a full url by appending the year
  full_url <- paste("http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata/annual_all_", year, ".zip", sep = '')
  
  
  #Download the file & get the data into a data frame.
  temp <- tempfile()
  download.file(full_url, temp)
  
  #Read the data into a data frame.
  data <- read.table(file = unz(temp, paste("annual_all_", year, ".csv", sep = '')), header = TRUE, sep = ",")
  
  #unlink the file handle
  unlink(temp)
  
  #putting a pause in between page reads, so this call is not treated as a denial of service attack.
  Sys.sleep(1)
  
  #return the dataframe
  return (data)
}

#Call the function 'GetAnnualPollutionData' to load the annual polllution data between 2009 to 2014
annual.pollution.data <- ldply(2009:2014, GetAnnualPollutionData)

Tidying and Transforming

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Join haps.data with annual.pollution.data on Parameter.Code
#This will give us only the hazardous polluants data ONLY across the country.
data.merge <- merge(x = annual.pollution.data, y = haps.data, by = "Parameter.Code")

data.tidy <- data.merge %>%
                 select(Year,State.Name, City.Name ,Parameter.Code,Parameter.Name.x, Event.Type,Arithmetic.Mean ) %>%
                     filter(Event.Type  == "No Events") %>%
                          arrange(desc(Year), State.Name)

# Summarise data - group by city to get the ANNUAL mean pollution in that city
datatidy.pollutant <- data.tidy %>%
            select(Year,State.Name, City.Name ,Parameter.Code,Parameter.Name.x, Arithmetic.Mean ) %>%
                group_by(Year,State.Name, City.Name ,Parameter.Code,Parameter.Name.x) %>%
                      summarise_each(funs(mean(Arithmetic.Mean)))

dim(datatidy.pollutant)
## [1] 28623     6
kable(head(datatidy.pollutant))
Year State.Name City.Name Parameter.Code Parameter.Name.x Arithmetic.Mean
2009 Alabama Birmingham 43218 1,3-Butadiene 0.2664815
2009 Alabama Birmingham 43505 Acrolein - Unverified 1.7958028
2009 Alabama Birmingham 43802 Dichloromethane 0.1184178
2009 Alabama Birmingham 43803 Chloroform 0.0240792
2009 Alabama Birmingham 43804 Carbon tetrachloride 0.1103095
2009 Alabama Birmingham 43815 Ethylene dichloride 0.0008500

NEO4J - DATA PREPARTION —- Generate CSV files for loading the proxy data into Neo4j

Load pollution data for ‘Texas’, for polluants [Chloroform, Benzene,Lead PM2.5 LC,Arsenic PM2.5 LC]

#Construct a Location data frame
#Node:
#  Location  -- city name, lang, lat, state
#  Pollutant  -- code,name
#Relationship: [ city name, state <--> code]
#  observation -- year, measurement (arithmetic mean)  


#Gather unique cities
city.df <- annual.pollution.data %>%
  filter(City.Name != ''& State.Name == 'Texas')  %>%
    group_by(State.Name, City.Name)  %>%
      summarise() %>%
        select(State.Name, City.Name)


#Gather unique pollutants
pollutant.df <- datatidy.pollutant %>%
  filter(City.Name != '' & grepl('Chloroform|Benzene|Lead PM2.5 LC|Arsenic PM2.5 LC', Parameter.Name.x) & State.Name == 'Texas')  %>%
    group_by(Parameter.Code, Parameter.Name.x)  %>%
      summarise() %>%
         select(Parameter.Code, Parameter.Name.x)

#Gather the relationship [observation] between the city and pollutant - year, measurement (arithmetic mean)
observation.df <- datatidy.pollutant %>%
  filter(City.Name != '' & grepl('Chloroform|Benzene|Lead PM2.5 LC|Arsenic PM2.5 LC', Parameter.Name.x) & State.Name == 'Texas')  %>%
       select(Year, State.Name, City.Name, Parameter.Code, Parameter.Name.x, Arithmetic.Mean)

View the Sample Data

kable(head(city.df))
State.Name City.Name
Texas Amarillo
Texas Arlington
Texas Austin
Texas Baytown
Texas Beaumont
Texas Brownsville
kable(head(pollutant.df))
Parameter.Code Parameter.Name.x
43803 Chloroform
45201 Benzene
88103 Arsenic PM2.5 LC
88128 Lead PM2.5 LC
kable(head(observation.df))
Year State.Name City.Name Parameter.Code Parameter.Name.x Arithmetic.Mean
2009 Texas Austin 43803 Chloroform 0.019298
2009 Texas Austin 45201 Benzene 1.988421
2009 Texas Baytown 43803 Chloroform 0.035167
2009 Texas Baytown 45201 Benzene 3.564000
2009 Texas Beaumont 43803 Chloroform 0.030000
2009 Texas Beaumont 45201 Benzene 2.311126

Preare csv files from the above data frames

write.table(city.df, "city-data.csv", quote=FALSE, row.names=FALSE, col.names=c('state','city'), sep=",")
write.table(pollutant.df, "pollutant-data.csv", quote=FALSE, row.names=FALSE, col.names=c('code','name'), sep=",")
write.table(observation.df, "observation-data.csv", quote=FALSE, row.names=FALSE, col.names=c('year','state', 'city', 'code', 'name', 'measurement'), sep=",")

NEO4J - Data Analysis

Click here for Neo4J Analysis