fileUrl2012 =
"https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_2012.zip"
fileUrl1999 =
"https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_1999.zip"
if(!file.exists('./data_1999.csv')&&!file.exists('./data_2012.csv')){
invisible(download.file(fileUrl1999,"./data_1999.zip"))
invisible(download.file(fileUrl2012,"./data_2012.zip"))
unzip("data_2012.zip", exdir = getwd())
unzip("data_1999.zip", exdir = getwd())
}
unlink('./data_1999.zip')
unlink('./data_2012.zip')
tryCatch(
invisible(
file.rename(
"annual_conc_by_monitor_1999.csv","data_1999.csv"
)
),warning = function(w){
if(file.exists('./data_1999.csv')){
print("The file \"data_1999.csv\" already exists!")
}else{
print("Some error occured while trying to rename the file
\"annual_conc_by_monitor_1999.csv\" to \"data_1999.csv\"!")
}
}
)
tryCatch(
invisible(
file.rename(
"annual_conc_by_monitor_2012.csv","data_2012.csv"
)
),warning = function(w){
if(file.exists('./data_1999.csv')){
print("The file \"data_2012.csv\" already exists!")
}else{
print("Some error occured while trying to rename the file
\"annual_conc_by_monitor_2012.csv\" to \"data_2012.csv\"!")
}
}
)
data_1999 = read.csv("data_1999.csv")
data_2012 = read.csv("data_2012.csv")
Identifying the parameter code for PM2.5 data
invisible(library(dplyr))
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
par_code = grep("PM2.5",data_1999$Parameter.Name)[1]
par_code = data_1999$Parameter.Code[par_code]
Filtering dataset with only the required parameter list - PM2.5
data_1999 = filter(data_1999, Parameter.Code == par_code)
data_2012 = filter(data_2012, Parameter.Code == par_code)
Size of the dataset
sprintf("1999 data, Rows = %d, Columns = %d",dim(data_1999)[1],dim(data_1999)[2])
## [1] "1999 data, Rows = 4801, Columns = 55"
sprintf("2012 data, Rows = %d, Columns = %d",dim(data_2012)[1],dim(data_2012)[2])
## [1] "2012 data, Rows = 5547, Columns = 55"
Subsetting just the particulate matter measure column from each dataset
pm_1999 = data_1999$X1st.Max.Value
pm_2012 = data_2012$X1st.Max.Value
Structure of variable
summary(pm_1999)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.20 29.00 38.80 41.63 48.00 157.10
summary(pm_2012)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.80 21.70 27.60 38.59 36.50 801.00
0% of the data is missing
mean(is.na(pm_1999))
## [1] 0
mean(is.na(pm_2012))
## [1] 0
boxplot
boxplot(log10(pm_1999),log10(pm_2012))
We observe that eventhough the average values have gone down, the spread of the data has increased to the right extreme, i.e the data has become more right skewed in 2012, from being left skewed in 1999.
Extracting the dates in which the measurements were taken
date_1999 = as.Date(data_1999$X1st.Max.DateTime)
date_2012 = as.Date(data_2012$X1st.Max.DateTime)
str(date_1999)
## Date[1:4801], format: "1999-02-14" "1999-02-14" "1999-02-14" "1999-02-14" "1999-08-28" ...
str(date_2012)
## Date[1:5547], format: "2012-07-29" "2012-07-29" "2012-07-29" "2012-07-29" "2012-06-29" ...
par(mfrow = c(1,2))
hist(date_1999,"month")
hist(date_2012,"month")
To make analysis more simpler we pick a common monitor from 1999 and 2012, and compare the pm2.5 levels between the two instead of the levels of the entire country. This will also allow us to control for possible changes in the monitoring locations between 1999 and 2012. As new monitors are added overtime and this can lead to inaccuracy in the analysis by taking into consideration the entire country.
Now we subset the data to look at the various monitors present in New York City
m_NY_1999 = unique(subset(data_1999, State.Code==36, c(County.Code, Site.Num)))
m_NY_2012 = unique(subset(data_2012, State.Code==36, c(County.Code, Site.Num)))
## Creating a new variable that pastes the values of County.Code and Site.Num
## inorder to find the intersecting County-Site combinations in both time periods
## within newyork city
m_NY_1999 = paste(m_NY_1999[,1],m_NY_1999[,2],sep = ".")
m_NY_2012 = paste(m_NY_2012[,1],m_NY_2012[,2],sep = ".")
## Finding the intersecting county-site combinations
common_m = intersect(m_NY_1999,m_NY_2012)
common_m
## [1] "1.5" "1.12" "5.80" "5.110" "13.11" "29.5" "31.3"
## [8] "63.2008" "67.1015" "85.55" "101.3"
Thus we’re able to find the common monitor we can use for out analysis Now we need to select the monitor that has the most number of observations, since more data equals better analysis and visualizations
Subsetting the data from NY to only have records of observations that are from the common monitors
## Creating a new variable county.site, to subset
m_NY_1999 = data_1999
m_NY_2012 = data_2012
m_NY_1999$County.Site = paste(m_NY_1999$County.Code,m_NY_1999$Site.Num,sep = ".")
m_NY_2012$County.Site = paste(m_NY_2012$County.Code,m_NY_2012$Site.Num,sep = ".")
m_NY_1999 = subset(m_NY_1999, County.Site %in% common_m)
m_NY_2012 = subset(m_NY_2012, County.Site %in% common_m)
Looking at the number of observations for each County.Site variable
sapply(split(m_NY_1999, m_NY_1999$County.Site),nrow)
## 1.12 1.5 101.3 13.11 29.5 31.3 5.110 5.80 63.2008 67.1015
## 4 8 12 8 4 4 8 4 8 8
## 85.55
## 4
sapply(split(m_NY_2012, m_NY_2012$County.Site),nrow)
## 1.12 1.5 101.3 13.11 29.5 31.3 5.110 5.80 63.2008 67.1015
## 4 8 8 4 4 4 4 4 4 4
## 85.55
## 4
We could pick the county.site combination 1.5 i.e county 1, and site 5 within that county but there is so few observations in each combination of county-site, therefore we shall consider every common County.Site for our analysis.
## Originally we were supposed to do the following
## m_1999 = subset(data_1999, State.Code == 36 & County.Code == 1 & Site.Num == 5)
## m_2012 = subset(data_1999, State.Code == 36 & County.Code == 1 & Site.Num == 5)
## But due to lack of observations we proceed with selecting every common
## monitors in New York City
m_1999 = m_NY_1999
m_2012 = m_NY_2012
Creating a timeseries analysis for the data
pm_1999 = m_1999$X1st.Max.Value
pm_2012 = m_2012$X1st.Max.Value
date_1999 = as.Date(m_1999$X1st.Max.DateTime)
date_2012 = as.Date(m_2012$X1st.Max.DateTime)
par(mfrow = c(1,2), mar = c(5,4,1,1))
rng = range(pm_1999,pm_2012, na.rm = T)
plot(date_1999,pm_1999, pch = 19, cex = 1.2, ylim = rng)
abline(h = median(pm_1999))
plot(date_2012,pm_2012, pch = 19, cex = 1.2, ylim = rng)
abline(h = median(pm_2012))
Creating state-wise plot to analyze and compare the PM2.5 levels of the states between the year 1999 and 2012 We calculate the average PM value by each state using the tapply() function
mean_1999 = with(data_1999, tapply(X1st.Max.Value,State.Code,mean))
mean_2012 = with(data_2012, tapply(X1st.Max.Value,State.Code,mean))
dc_1999 = data.frame(state = names(mean_1999), mean = mean_1999)
dc_2012 = data.frame(state = names(mean_2012), mean = mean_2012)
Viewing the data
1999 means
head(dc_1999)
## state mean
## 01 01 48.57778
## 02 02 37.42727
## 04 04 32.29545
## 05 05 32.91364
## 06 06 63.65870
## 08 08 27.52105
2012 means
head(dc_2012)
## state mean
## 1 1 23.07083
## 2 2 56.21364
## 4 4 55.00199
## 5 5 23.75556
## 6 6 46.56210
## 8 8 28.73895
Merging the data of 1999 and 2012
mergedData = merge(dc_1999,dc_2012, by = "state")
names(mergedData) = c("State","Mean in 1999","Mean in 2012")
head(mergedData)
## State Mean in 1999 Mean in 2012
## 1 10 41.43636 26.66364
## 2 11 56.07778 41.86452
## 3 12 38.73929 25.71765
## 4 13 66.86923 28.98909
## 5 15 39.30909 54.66515
## 6 16 53.35385 119.74516
dim(mergedData)
## [1] 44 3
with(mergedData, plot(rep(1999,44),mergedData[,2], xlim = c(1998,2013),ylim = c(10,120)))
with(mergedData, points(rep(2012,44),mergedData[,3]))
segments(rep(1999,44),mergedData[,2],rep(2012,44),mergedData[,3])
We observe that most of the states have had their mean PM2.5 value go down over the years when comparing the means between 1999 and 2012.
unlink("data_1999.csv")
unlink("data_2012.csv")