Activity Definition:

Acquire the Pollutants data, prepare a proxy set of data , load into Neo4J , and analyse.

Data Sources:
+ Pollution Data: http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata
+ HAPS Data: https://ofmext.epa.gov/AQDMRS/ws/list?name=param&pc=CORE_HAPS&resource=rawData

Data Acquisition:

## Loading required package: xlsxjars

# *********** Load the hazardoes air pollutants: *****************************************************

haps.url <- "https://ofmext.epa.gov/AQDMRS/ws/list?name=param&pc=CORE_HAPS&resource=rawData"

haps.raw.data <- html(haps.url) %>%
  html_node("body > p") %>%
  html_text()

#Load the hazardous air pollutants into a data frame.
haps.data <- read.table(text=haps.raw.data, sep="\t",  col.names = c("Parameter.Code", "Parameter.Name"))

dim(haps.data)

## [1] 42  2

#-------------------------------------------------------------------------------------------------------

# *********** Load the Annual pollution data for the years: 2009 to 2014 *******************************

GetAnnualPollutionData <- function(year) {
  
  #Prepare a full url by appending the year
  full_url <- paste("http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata/annual_all_", year, ".zip", sep = '')
  
  
  #Download the file & get the data into a data frame.
  temp <- tempfile()
  download.file(full_url, temp)
  
  #Read the data into a data frame.
  data <- read.table(file = unz(temp, paste("annual_all_", year, ".csv", sep = '')), header = TRUE, sep = ",")
  
  #unlink the file handle
  unlink(temp)
  
  #putting a pause in between page reads, so this call is not treated as a denial of service attack.
  Sys.sleep(1)
  
  #return the dataframe
  return (data)
}

#Call the function 'GetAnnualPollutionData' to load the annual polllution data between 2009 to 2014
annual.pollution.data <- ldply(2009:2014, GetAnnualPollutionData)

Tidying and Transforming

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#Join haps.data with annual.pollution.data on Parameter.Code
#This will give us only the hazardous polluants data ONLY across the country.
data.merge <- merge(x = annual.pollution.data, y = haps.data, by = "Parameter.Code")

data.tidy <- data.merge %>%
                 select(Year,State.Name, City.Name ,Parameter.Code,Parameter.Name.x, Event.Type,Arithmetic.Mean ) %>%
                     filter(Event.Type  == "No Events") %>%
                          arrange(desc(Year), State.Name)

# Summarise data - group by city to get the ANNUAL mean pollution in that city
datatidy.pollutant <- data.tidy %>%
            select(Year,State.Name, City.Name ,Parameter.Code,Parameter.Name.x, Arithmetic.Mean ) %>%
                group_by(Year,State.Name, City.Name ,Parameter.Code,Parameter.Name.x) %>%
                      summarise_each(funs(mean(Arithmetic.Mean)))

dim(datatidy.pollutant)

## [1] 28623     6

kable(head(datatidy.pollutant))

Year	State.Name	City.Name	Parameter.Code	Parameter.Name.x	Arithmetic.Mean
2009	Alabama	Birmingham	43218	1,3-Butadiene	0.2664815
2009	Alabama	Birmingham	43505	Acrolein - Unverified	1.7958028
2009	Alabama	Birmingham	43802	Dichloromethane	0.1184178
2009	Alabama	Birmingham	43803	Chloroform	0.0240792
2009	Alabama	Birmingham	43804	Carbon tetrachloride	0.1103095
2009	Alabama	Birmingham	43815	Ethylene dichloride	0.0008500

NEO4J - DATA PREPARTION —- Generate CSV files for loading the proxy data into Neo4j

Load pollution data for ‘Texas’, for polluants [Chloroform, Benzene,Lead PM2.5 LC,Arsenic PM2.5 LC]

#Construct a Location data frame
#Node:
#  Location  -- city name, lang, lat, state
#  Pollutant  -- code,name
#Relationship: [ city name, state <--> code]
#  observation -- year, measurement (arithmetic mean)  


#Gather unique cities
city.df <- annual.pollution.data %>%
  filter(City.Name != ''& State.Name == 'Texas')  %>%
    group_by(State.Name, City.Name)  %>%
      summarise() %>%
        select(State.Name, City.Name)


#Gather unique pollutants
pollutant.df <- datatidy.pollutant %>%
  filter(City.Name != '' & grepl('Chloroform|Benzene|Lead PM2.5 LC|Arsenic PM2.5 LC', Parameter.Name.x) & State.Name == 'Texas')  %>%
    group_by(Parameter.Code, Parameter.Name.x)  %>%
      summarise() %>%
         select(Parameter.Code, Parameter.Name.x)

#Gather the relationship [observation] between the city and pollutant - year, measurement (arithmetic mean)
observation.df <- datatidy.pollutant %>%
  filter(City.Name != '' & grepl('Chloroform|Benzene|Lead PM2.5 LC|Arsenic PM2.5 LC', Parameter.Name.x) & State.Name == 'Texas')  %>%
       select(Year, State.Name, City.Name, Parameter.Code, Parameter.Name.x, Arithmetic.Mean)

View the Sample Data

kable(head(city.df))

State.Name	City.Name
Texas	Amarillo
Texas	Arlington
Texas	Austin
Texas	Baytown
Texas	Beaumont
Texas	Brownsville

kable(head(pollutant.df))

Parameter.Code	Parameter.Name.x
43803	Chloroform
45201	Benzene
88103	Arsenic PM2.5 LC
88128	Lead PM2.5 LC

kable(head(observation.df))

Year	State.Name	City.Name	Parameter.Code	Parameter.Name.x	Arithmetic.Mean
2009	Texas	Austin	43803	Chloroform	0.019298
2009	Texas	Austin	45201	Benzene	1.988421
2009	Texas	Baytown	43803	Chloroform	0.035167
2009	Texas	Baytown	45201	Benzene	3.564000
2009	Texas	Beaumont	43803	Chloroform	0.030000
2009	Texas	Beaumont	45201	Benzene	2.311126

Preare csv files from the above data frames

write.table(city.df, "city-data.csv", quote=FALSE, row.names=FALSE, col.names=c('state','city'), sep=",")
write.table(pollutant.df, "pollutant-data.csv", quote=FALSE, row.names=FALSE, col.names=c('code','name'), sep=",")
write.table(observation.df, "observation-data.csv", quote=FALSE, row.names=FALSE, col.names=c('year','state', 'city', 'code', 'name', 'measurement'), sep=",")

NEO4J - Data Analysis

Click here for Neo4J Analysis

Prepare & Load Proxy Pollution Data into Neo4J

Suman, Sreejaya, Jamey

Monday, May 25, 2015

Activity Definition:

Data Acquisition:

Tidying and Transforming

NEO4J - DATA PREPARTION —- Generate CSV files for loading the proxy data into Neo4j

NEO4J - Data Analysis