Opening the libraries


require(data.table)
require(lubridate)
require(dplyr)
require(ggplot2)


1) Import and Preprocess Data


Part (A)


✢ Import the datasets
old_data<-fread("https://bit.ly/3c4AHbL",colClasses = c("character","character","character"
                ,"character","character","numeric","numeric","numeric","numeric","numeric","numeric","numeric"))
new_data<-fread("https://bit.ly/3nZicL2",colClasses = c("character","character","character"
                ,"character","character","numeric","numeric","numeric","numeric","numeric","numeric","numeric"))


✢ The first 5 variables is set to “Character” and the rest is “Numeric”
str(old_data)
## Classes 'data.table' and 'data.frame':   117421 obs. of  12 variables:
##  $ X..RD          : chr  "RD" "RD" "RD" "RD" ...
##  $ Action.Code    : chr  "I" "I" "I" "I" ...
##  $ State.Code     : chr  "01" "01" "01" "01" ...
##  $ County.Code    : chr  "027" "027" "027" "027" ...
##  $ Site.ID        : chr  "0001" "0001" "0001" "0001" ...
##  $ Parameter      : num  88101 88101 88101 88101 88101 ...
##  $ POC            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Sample.Duration: num  7 7 7 7 7 7 7 7 7 7 ...
##  $ Unit           : num  105 105 105 105 105 105 105 105 105 105 ...
##  $ Method         : num  120 120 120 120 120 120 120 120 120 120 ...
##  $ Date           : num  2e+07 2e+07 2e+07 2e+07 2e+07 ...
##  $ Sample.Value   : num  NA NA NA 8.84 14.92 ...
##  - attr(*, ".internal.selfref")=<externalptr>
str(new_data)
## Classes 'data.table' and 'data.frame':   1304287 obs. of  12 variables:
##  $ X..RD          : chr  "RD" "RD" "RD" "RD" ...
##  $ Action.Code    : chr  "I" "I" "I" "I" ...
##  $ State.Code     : chr  "01" "01" "01" "01" ...
##  $ County.Code    : chr  "003" "003" "003" "003" ...
##  $ Site.ID        : chr  "0010" "0010" "0010" "0010" ...
##  $ Parameter      : num  88101 88101 88101 88101 88101 ...
##  $ POC            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Sample.Duration: num  7 7 7 7 7 7 7 7 7 7 ...
##  $ Unit           : num  105 105 105 105 105 105 105 105 105 105 ...
##  $ Method         : num  118 118 118 118 118 118 118 118 118 118 ...
##  $ Date           : num  20120101 20120104 20120107 20120110 20120113 ...
##  $ Sample.Value   : num  6.7 9 6.5 7 5.8 8 7.9 8 6 9.6 ...
##  - attr(*, ".internal.selfref")=<externalptr>


Part (B)


✢ Printing out the dimensions
dim(old_data)
## [1] 117421     12


✢ Printing out the first 3 rows
head(old_data,3)
##    X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## 1:    RD           I         01         027    0001     88101   1
## 2:    RD           I         01         027    0001     88101   1
## 3:    RD           I         01         027    0001     88101   1
##    Sample.Duration Unit Method     Date Sample.Value
## 1:               7  105    120 19990103           NA
## 2:               7  105    120 19990106           NA
## 3:               7  105    120 19990109           NA


Part (C)


✢ Using the 1999 data, print the summary statistics of the variable with summary()
summary(old_data$Sample.Value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    7.20   11.50   13.74   17.90  157.10   13217


✢ Compute the number of NAs using table() and is.na(), then divide the numbers by the total number of observations in the data to calculate the proportions
NA_Value<-table(is.na(old_data$Sample.Value))
NA_Value
## 
##  FALSE   TRUE 
## 104204  13217
total_number<-as.numeric(NA_Value[2])/nrow(old_data)
cat("Total Number of Observations :",total_number)
## Total Number of Observations : 0.1125608


✢ The percentage of the PM2.5 observations that are missing (round up to 3 decimal places)
paste("Percentage of PM2.5 : ",round(100*total_number, 3), "%")
## [1] "Percentage of PM2.5 :  11.256 %"


Part (D)


✢ Bind the 1999 data and 2012 data and assign the aggregated data to an object called ‘pm’
pm<-rbind(old_data,new_data)


✢ Subset the years from the Date variable and convert it into a factor variable called ‘year’
date<-year(ymd(pm$Date))
pm$year<-factor(date)
is.factor(pm$year)
## [1] TRUE
head(pm)
##    X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## 1:    RD           I         01         027    0001     88101   1
## 2:    RD           I         01         027    0001     88101   1
## 3:    RD           I         01         027    0001     88101   1
## 4:    RD           I         01         027    0001     88101   1
## 5:    RD           I         01         027    0001     88101   1
## 6:    RD           I         01         027    0001     88101   1
##    Sample.Duration Unit Method     Date Sample.Value year
## 1:               7  105    120 19990103           NA 1999
## 2:               7  105    120 19990106           NA 1999
## 3:               7  105    120 19990109           NA 1999
## 4:               7  105    120 19990112        8.841 1999
## 5:               7  105    120 19990115       14.920 1999
## 6:               7  105    120 19990118        3.878 1999


Part (E)


✢ Rename the Sample.Value variable to PM
names(pm)[names(pm) == "Sample.Value"] <- "PM"
head(pm)
##    X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## 1:    RD           I         01         027    0001     88101   1
## 2:    RD           I         01         027    0001     88101   1
## 3:    RD           I         01         027    0001     88101   1
## 4:    RD           I         01         027    0001     88101   1
## 5:    RD           I         01         027    0001     88101   1
## 6:    RD           I         01         027    0001     88101   1
##    Sample.Duration Unit Method     Date     PM year
## 1:               7  105    120 19990103     NA 1999
## 2:               7  105    120 19990106     NA 1999
## 3:               7  105    120 19990109     NA 1999
## 4:               7  105    120 19990112  8.841 1999
## 5:               7  105    120 19990115 14.920 1999
## 6:               7  105    120 19990118  3.878 1999


2) Data Exploration with Visualization using ggplot2


Part (A)


✢ Set the seed at 2021 and draw 1,000 randomly selected samples from the data
set.seed(2021)
random_data<-sample_n(pm,1000)
random_data
##       X..RD Action.Code State.Code County.Code Site.ID Parameter POC
##    1:    RD           I         11         001    0043     88101   4
##    2:    RD           I         30         075    0001     88101   3
##    3:    RD           I         10         003    2004     88101   3
##    4:    RD           I         46         103    0020     88101   3
##    5:    RD           I         30         083    0001     88101   3
##   ---                                                               
##  996:    RD           I         42         071    0007     88101   1
##  997:    RD           I         48         141    0044     88101   6
##  998:    RD           I         37         107    0004     88101   1
##  999:    RD           I         20         107    0002     88101   1
## 1000:    RD           I         06         073    1008     88101   3
##       Sample.Duration Unit Method     Date    PM year
##    1:               1  105    170 20120212  8.00 2012
##    2:               1  105    170 20120305  8.60 2012
##    3:               1  105    184 20120323  8.88 2012
##    4:               1  105    170 20120312  2.70 2012
##    5:               1  105    170 20120526  3.20 2012
##   ---                                                
##  996:               7  105    118 20120313 13.50 2012
##  997:               1  105    170 20120621 18.00 2012
##  998:               7  105    118 20120512    NA 2012
##  999:               7  105    118 19990530 16.20 1999
## 1000:               1  105    170 20120629  6.00 2012


Part (B)


##### ✢ Take the log of the PM values (with base 2)

random_data[,12] <- round(log(random_data[,12], 2),2)
## Warning in lapply(X = x, FUN = .Generic, ...): NaNs produced
random_data
##       X..RD Action.Code State.Code County.Code Site.ID Parameter POC
##    1:    RD           I         11         001    0043     88101   4
##    2:    RD           I         30         075    0001     88101   3
##    3:    RD           I         10         003    2004     88101   3
##    4:    RD           I         46         103    0020     88101   3
##    5:    RD           I         30         083    0001     88101   3
##   ---                                                               
##  996:    RD           I         42         071    0007     88101   1
##  997:    RD           I         48         141    0044     88101   6
##  998:    RD           I         37         107    0004     88101   1
##  999:    RD           I         20         107    0002     88101   1
## 1000:    RD           I         06         073    1008     88101   3
##       Sample.Duration Unit Method     Date   PM year
##    1:               1  105    170 20120212 3.00 2012
##    2:               1  105    170 20120305 3.10 2012
##    3:               1  105    184 20120323 3.15 2012
##    4:               1  105    170 20120312 1.43 2012
##    5:               1  105    170 20120526 1.68 2012
##   ---                                               
##  996:               7  105    118 20120313 3.75 2012
##  997:               1  105    170 20120621 4.17 2012
##  998:               7  105    118 20120512   NA 2012
##  999:               7  105    118 19990530 4.02 1999
## 1000:               1  105    170 20120629 2.58 2012


✢ Label the title, x-axis & y-axis, and use the base white theme to replicate the graphics
ggplot(random_data, aes(x= year,y = PM)) +
  labs(title = "Boxplot of PM values in 1999 and 2012",x="Year",y="Log2 PM2.5")+
  geom_boxplot(aes(color = year))+
  theme_bw()


Part (C)


✢ Describe what you observe in terms of the average and variance (in general terms—where it is centered & how much it is spread out) of the observations in 1999 and 2012?


1) In terms of average, the box plot of 1999 shows that it has the average of around 3-4 while as in 2012 around +- 3. So the average of year 2012 is lower than year 1999.

2) However the boxplot shows that 1999’s variance is lower than the 2012’s since the measurement shows that the data in 2012 is much further spreading from the average.


Part (D)


✢ Subset the data to include only the observations from New York and only include the County.Code and the Site.ID
NYC<-filter(pm,pm$State.Code=="36")
NYC_select<-select(NYC,State.Code,County.Code,Site.ID)
NYC_select<-unique(NYC_select)
NYC_select
##     State.Code County.Code Site.ID
##  1:         36         001    0005
##  2:         36         001    0012
##  3:         36         005    0073
##  4:         36         005    0080
##  5:         36         005    0083
##  6:         36         005    0110
##  7:         36         013    0011
##  8:         36         027    1004
##  9:         36         029    0002
## 10:         36         029    0005
## 11:         36         029    1007
## 12:         36         031    0003
## 13:         36         047    0011
## 14:         36         047    0076
## 15:         36         055    6001
## 16:         36         059    0005
## 17:         36         059    0008
## 18:         36         059    0011
## 19:         36         061    0010
## 20:         36         061    0056
## 21:         36         061    0062
## 22:         36         063    2008
## 23:         36         065    2001
## 24:         36         067    0019
## 25:         36         067    1015
## 26:         36         081    0094
## 27:         36         081    0097
## 28:         36         085    0055
## 29:         36         085    0067
## 30:         36         089    3001
## 31:         36         093    0003
## 32:         36         101    0003
## 33:         36         103    0001
## 34:         36         005    0133
## 35:         36         047    0122
## 36:         36         055    1007
## 37:         36         061    0079
## 38:         36         061    0134
## 39:         36         071    0002
## 40:         36         081    0124
## 41:         36         103    0002
##     State.Code County.Code Site.ID


Part (E)


✢ Create a new variable called Site.Code that combines the county code and the site ID into a single string by using paste() with “.” as the separator
NYC$Site.Code <- paste(NYC$County.Code,NYC$Site.ID,sep='.')
NYC
##       X..RD Action.Code State.Code County.Code Site.ID Parameter POC
##    1:    RD           I         36         001    0005     88101   1
##    2:    RD           I         36         001    0005     88101   1
##    3:    RD           I         36         001    0005     88101   1
##    4:    RD           I         36         001    0005     88101   1
##    5:    RD           I         36         001    0005     88101   1
##   ---                                                               
## 3264:    RD           I         36         103    0002     88101   1
## 3265:    RD           I         36         103    0002     88101   1
## 3266:    RD           I         36         103    0002     88101   1
## 3267:    RD           I         36         103    0002     88101   1
## 3268:    RD           I         36         103    0002     88101   1
##       Sample.Duration Unit Method     Date    PM year Site.Code
##    1:               7  105    118 19990702    NA 1999  001.0005
##    2:               7  105    118 19990705    NA 1999  001.0005
##    3:               7  105    118 19990708    NA 1999  001.0005
##    4:               7  105    118 19990711    NA 1999  001.0005
##    5:               7  105    118 19990714 11.80 1999  001.0005
##   ---                                                          
## 3264:               7  105    118 20120319  9.58 2012  103.0002
## 3265:               7  105    118 20120322 10.33 2012  103.0002
## 3266:               7  105    118 20120325  5.41 2012  103.0002
## 3267:               7  105    118 20120328  9.62 2012  103.0002
## 3268:               7  105    118 20120331  6.25 2012  103.0002


Part (F)


✢ Find the intersection of the sites (i.e., monitors) in between 1999 and 2012
New_NYC<-split(NYC,NYC$year)
x<-New_NYC$`1999`
y<-New_NYC$`2012`
combine<-intersect(x$Site.Code,y$Site.Code)
print(combine)
##  [1] "001.0005" "001.0012" "005.0080" "013.0011" "029.0005" "031.0003"
##  [7] "063.2008" "067.1015" "085.0055" "101.0003"


Part (G)


✢ Write a block of code to identify the monitor in the original data (i.e., pm) that had the most data
pm<-mutate(pm,Site.Code = paste(pm$County.Code,pm$Site.ID,sep='.'))
counting<-filter(pm,Site.Code %in% combine)
grouping = counting %>% group_by(Site.Code) %>% summarize(total_count = n())
arrange(grouping,total_count)
## # A tibble: 10 × 2
##    Site.Code total_count
##    <chr>           <int>
##  1 085.0055           38
##  2 001.0012           92
##  3 005.0080           92
##  4 029.0005           94
##  5 063.2008          152
##  6 067.1015          153
##  7 001.0005          186
##  8 031.0003          198
##  9 013.0011          213
## 10 101.0003          527


Part (H)


✢ Subset the data (i.e., pm) that contains observations from the monitor we just identified (State.Code = 36 & County.Code = 101 & Site.ID = 0003) and assign the subset data to an obj. called ‘pmsub’
pmsub<-subset(pm,pm$State.Code == "36" & pm$County.Code == "101" & pm$Site.ID == "0003")
pmsub
##      X..RD Action.Code State.Code County.Code Site.ID Parameter POC
##   1:    RD           I         36         101    0003     88101   1
##   2:    RD           I         36         101    0003     88101   1
##   3:    RD           I         36         101    0003     88101   1
##   4:    RD           I         36         101    0003     88101   1
##   5:    RD           I         36         101    0003     88101   1
##  ---                                                               
## 179:    RD           I         36         101    0003     88101   1
## 180:    RD           I         36         101    0003     88101   1
## 181:    RD           I         36         101    0003     88101   1
## 182:    RD           I         36         101    0003     88101   1
## 183:    RD           I         36         101    0003     88101   1
##      Sample.Duration Unit Method     Date    PM year Site.Code
##   1:               7  105    118 19990802    NA 1999  101.0003
##   2:               7  105    118 19990803  3.90 1999  101.0003
##   3:               7  105    118 19990804 11.80 1999  101.0003
##   4:               7  105    118 19990805 12.00 1999  101.0003
##   5:               7  105    118 19990806  8.50 1999  101.0003
##  ---                                                          
## 179:               7  105    118 20120319    NA 2012  101.0003
## 180:               7  105    118 20120322  6.58 2012  101.0003
## 181:               7  105    118 20120325  4.58 2012  101.0003
## 182:               7  105    118 20120328  5.95 2012  101.0003
## 183:               7  105    118 20120331  4.66 2012  101.0003


Part (I)


✢ Convert the Date variable into a date obj. and then create a variable called ‘yday’ containing info. on day of the year using yday().
temp<-ymd(pmsub$Date)
pmsub$yday<-yday(temp)
pmsub
##      X..RD Action.Code State.Code County.Code Site.ID Parameter POC
##   1:    RD           I         36         101    0003     88101   1
##   2:    RD           I         36         101    0003     88101   1
##   3:    RD           I         36         101    0003     88101   1
##   4:    RD           I         36         101    0003     88101   1
##   5:    RD           I         36         101    0003     88101   1
##  ---                                                               
## 179:    RD           I         36         101    0003     88101   1
## 180:    RD           I         36         101    0003     88101   1
## 181:    RD           I         36         101    0003     88101   1
## 182:    RD           I         36         101    0003     88101   1
## 183:    RD           I         36         101    0003     88101   1
##      Sample.Duration Unit Method     Date    PM year Site.Code yday
##   1:               7  105    118 19990802    NA 1999  101.0003  214
##   2:               7  105    118 19990803  3.90 1999  101.0003  215
##   3:               7  105    118 19990804 11.80 1999  101.0003  216
##   4:               7  105    118 19990805 12.00 1999  101.0003  217
##   5:               7  105    118 19990806  8.50 1999  101.0003  218
##  ---                                                               
## 179:               7  105    118 20120319    NA 2012  101.0003   79
## 180:               7  105    118 20120322  6.58 2012  101.0003   82
## 181:               7  105    118 20120325  4.58 2012  101.0003   85
## 182:               7  105    118 20120328  5.95 2012  101.0003   88
## 183:               7  105    118 20120331  4.66 2012  101.0003   91


Part (J)


✢ Draw a scatter plot by mapping the year-day variable on the x-axis, PM2.5 level on the y-axis separately for 1999 and 2012. Make sure to label the x-axis, separate the plots using the facet function and use the base white theme to replicate the graphics shown below.
ggplot(pmsub, aes(x=yday, y=PM))+ 
  geom_point()+
  facet_grid(. ~ year)+
  theme_bw()+
  labs(x="Day of the Year",y="PM")


Part (K)


✢ Interesting pattern observed is that the variation (spread) in the PM values in 2012 is much smaller (vs. larger in aggregate) than it was in 1999. The plot shows that not only are the average levels of PM lower in 2012, but that there are fewer large spikes from day to day in 2012.