require(data.table)
require(lubridate)
require(dplyr)
require(ggplot2)
old_data<-fread("https://bit.ly/3c4AHbL",colClasses = c("character","character","character"
,"character","character","numeric","numeric","numeric","numeric","numeric","numeric","numeric"))
new_data<-fread("https://bit.ly/3nZicL2",colClasses = c("character","character","character"
,"character","character","numeric","numeric","numeric","numeric","numeric","numeric","numeric"))
str(old_data)
## Classes 'data.table' and 'data.frame': 117421 obs. of 12 variables:
## $ X..RD : chr "RD" "RD" "RD" "RD" ...
## $ Action.Code : chr "I" "I" "I" "I" ...
## $ State.Code : chr "01" "01" "01" "01" ...
## $ County.Code : chr "027" "027" "027" "027" ...
## $ Site.ID : chr "0001" "0001" "0001" "0001" ...
## $ Parameter : num 88101 88101 88101 88101 88101 ...
## $ POC : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Sample.Duration: num 7 7 7 7 7 7 7 7 7 7 ...
## $ Unit : num 105 105 105 105 105 105 105 105 105 105 ...
## $ Method : num 120 120 120 120 120 120 120 120 120 120 ...
## $ Date : num 2e+07 2e+07 2e+07 2e+07 2e+07 ...
## $ Sample.Value : num NA NA NA 8.84 14.92 ...
## - attr(*, ".internal.selfref")=<externalptr>
str(new_data)
## Classes 'data.table' and 'data.frame': 1304287 obs. of 12 variables:
## $ X..RD : chr "RD" "RD" "RD" "RD" ...
## $ Action.Code : chr "I" "I" "I" "I" ...
## $ State.Code : chr "01" "01" "01" "01" ...
## $ County.Code : chr "003" "003" "003" "003" ...
## $ Site.ID : chr "0010" "0010" "0010" "0010" ...
## $ Parameter : num 88101 88101 88101 88101 88101 ...
## $ POC : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Sample.Duration: num 7 7 7 7 7 7 7 7 7 7 ...
## $ Unit : num 105 105 105 105 105 105 105 105 105 105 ...
## $ Method : num 118 118 118 118 118 118 118 118 118 118 ...
## $ Date : num 20120101 20120104 20120107 20120110 20120113 ...
## $ Sample.Value : num 6.7 9 6.5 7 5.8 8 7.9 8 6 9.6 ...
## - attr(*, ".internal.selfref")=<externalptr>
dim(old_data)
## [1] 117421 12
head(old_data,3)
## X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## 1: RD I 01 027 0001 88101 1
## 2: RD I 01 027 0001 88101 1
## 3: RD I 01 027 0001 88101 1
## Sample.Duration Unit Method Date Sample.Value
## 1: 7 105 120 19990103 NA
## 2: 7 105 120 19990106 NA
## 3: 7 105 120 19990109 NA
summary(old_data$Sample.Value)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 7.20 11.50 13.74 17.90 157.10 13217
NA_Value<-table(is.na(old_data$Sample.Value))
NA_Value
##
## FALSE TRUE
## 104204 13217
total_number<-as.numeric(NA_Value[2])/nrow(old_data)
cat("Total Number of Observations :",total_number)
## Total Number of Observations : 0.1125608
paste("Percentage of PM2.5 : ",round(100*total_number, 3), "%")
## [1] "Percentage of PM2.5 : 11.256 %"
pm<-rbind(old_data,new_data)
date<-year(ymd(pm$Date))
pm$year<-factor(date)
is.factor(pm$year)
## [1] TRUE
head(pm)
## X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## 1: RD I 01 027 0001 88101 1
## 2: RD I 01 027 0001 88101 1
## 3: RD I 01 027 0001 88101 1
## 4: RD I 01 027 0001 88101 1
## 5: RD I 01 027 0001 88101 1
## 6: RD I 01 027 0001 88101 1
## Sample.Duration Unit Method Date Sample.Value year
## 1: 7 105 120 19990103 NA 1999
## 2: 7 105 120 19990106 NA 1999
## 3: 7 105 120 19990109 NA 1999
## 4: 7 105 120 19990112 8.841 1999
## 5: 7 105 120 19990115 14.920 1999
## 6: 7 105 120 19990118 3.878 1999
names(pm)[names(pm) == "Sample.Value"] <- "PM"
head(pm)
## X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## 1: RD I 01 027 0001 88101 1
## 2: RD I 01 027 0001 88101 1
## 3: RD I 01 027 0001 88101 1
## 4: RD I 01 027 0001 88101 1
## 5: RD I 01 027 0001 88101 1
## 6: RD I 01 027 0001 88101 1
## Sample.Duration Unit Method Date PM year
## 1: 7 105 120 19990103 NA 1999
## 2: 7 105 120 19990106 NA 1999
## 3: 7 105 120 19990109 NA 1999
## 4: 7 105 120 19990112 8.841 1999
## 5: 7 105 120 19990115 14.920 1999
## 6: 7 105 120 19990118 3.878 1999
set.seed(2021)
random_data<-sample_n(pm,1000)
random_data
## X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## 1: RD I 11 001 0043 88101 4
## 2: RD I 30 075 0001 88101 3
## 3: RD I 10 003 2004 88101 3
## 4: RD I 46 103 0020 88101 3
## 5: RD I 30 083 0001 88101 3
## ---
## 996: RD I 42 071 0007 88101 1
## 997: RD I 48 141 0044 88101 6
## 998: RD I 37 107 0004 88101 1
## 999: RD I 20 107 0002 88101 1
## 1000: RD I 06 073 1008 88101 3
## Sample.Duration Unit Method Date PM year
## 1: 1 105 170 20120212 8.00 2012
## 2: 1 105 170 20120305 8.60 2012
## 3: 1 105 184 20120323 8.88 2012
## 4: 1 105 170 20120312 2.70 2012
## 5: 1 105 170 20120526 3.20 2012
## ---
## 996: 7 105 118 20120313 13.50 2012
## 997: 1 105 170 20120621 18.00 2012
## 998: 7 105 118 20120512 NA 2012
## 999: 7 105 118 19990530 16.20 1999
## 1000: 1 105 170 20120629 6.00 2012
##### ✢ Take the log of the PM values (with base 2)
random_data[,12] <- round(log(random_data[,12], 2),2)
## Warning in lapply(X = x, FUN = .Generic, ...): NaNs produced
random_data
## X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## 1: RD I 11 001 0043 88101 4
## 2: RD I 30 075 0001 88101 3
## 3: RD I 10 003 2004 88101 3
## 4: RD I 46 103 0020 88101 3
## 5: RD I 30 083 0001 88101 3
## ---
## 996: RD I 42 071 0007 88101 1
## 997: RD I 48 141 0044 88101 6
## 998: RD I 37 107 0004 88101 1
## 999: RD I 20 107 0002 88101 1
## 1000: RD I 06 073 1008 88101 3
## Sample.Duration Unit Method Date PM year
## 1: 1 105 170 20120212 3.00 2012
## 2: 1 105 170 20120305 3.10 2012
## 3: 1 105 184 20120323 3.15 2012
## 4: 1 105 170 20120312 1.43 2012
## 5: 1 105 170 20120526 1.68 2012
## ---
## 996: 7 105 118 20120313 3.75 2012
## 997: 1 105 170 20120621 4.17 2012
## 998: 7 105 118 20120512 NA 2012
## 999: 7 105 118 19990530 4.02 1999
## 1000: 1 105 170 20120629 2.58 2012
ggplot(random_data, aes(x= year,y = PM)) +
labs(title = "Boxplot of PM values in 1999 and 2012",x="Year",y="Log2 PM2.5")+
geom_boxplot(aes(color = year))+
theme_bw()
1) In terms of average, the box plot of 1999 shows that it has
the average of around 3-4 while as in 2012 around +- 3. So the average
of year 2012 is lower than year 1999.
2) However the boxplot
shows that 1999’s variance is lower than the 2012’s since the
measurement shows that the data in 2012 is much further spreading from
the average.
NYC<-filter(pm,pm$State.Code=="36")
NYC_select<-select(NYC,State.Code,County.Code,Site.ID)
NYC_select<-unique(NYC_select)
NYC_select
## State.Code County.Code Site.ID
## 1: 36 001 0005
## 2: 36 001 0012
## 3: 36 005 0073
## 4: 36 005 0080
## 5: 36 005 0083
## 6: 36 005 0110
## 7: 36 013 0011
## 8: 36 027 1004
## 9: 36 029 0002
## 10: 36 029 0005
## 11: 36 029 1007
## 12: 36 031 0003
## 13: 36 047 0011
## 14: 36 047 0076
## 15: 36 055 6001
## 16: 36 059 0005
## 17: 36 059 0008
## 18: 36 059 0011
## 19: 36 061 0010
## 20: 36 061 0056
## 21: 36 061 0062
## 22: 36 063 2008
## 23: 36 065 2001
## 24: 36 067 0019
## 25: 36 067 1015
## 26: 36 081 0094
## 27: 36 081 0097
## 28: 36 085 0055
## 29: 36 085 0067
## 30: 36 089 3001
## 31: 36 093 0003
## 32: 36 101 0003
## 33: 36 103 0001
## 34: 36 005 0133
## 35: 36 047 0122
## 36: 36 055 1007
## 37: 36 061 0079
## 38: 36 061 0134
## 39: 36 071 0002
## 40: 36 081 0124
## 41: 36 103 0002
## State.Code County.Code Site.ID
NYC$Site.Code <- paste(NYC$County.Code,NYC$Site.ID,sep='.')
NYC
## X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## 1: RD I 36 001 0005 88101 1
## 2: RD I 36 001 0005 88101 1
## 3: RD I 36 001 0005 88101 1
## 4: RD I 36 001 0005 88101 1
## 5: RD I 36 001 0005 88101 1
## ---
## 3264: RD I 36 103 0002 88101 1
## 3265: RD I 36 103 0002 88101 1
## 3266: RD I 36 103 0002 88101 1
## 3267: RD I 36 103 0002 88101 1
## 3268: RD I 36 103 0002 88101 1
## Sample.Duration Unit Method Date PM year Site.Code
## 1: 7 105 118 19990702 NA 1999 001.0005
## 2: 7 105 118 19990705 NA 1999 001.0005
## 3: 7 105 118 19990708 NA 1999 001.0005
## 4: 7 105 118 19990711 NA 1999 001.0005
## 5: 7 105 118 19990714 11.80 1999 001.0005
## ---
## 3264: 7 105 118 20120319 9.58 2012 103.0002
## 3265: 7 105 118 20120322 10.33 2012 103.0002
## 3266: 7 105 118 20120325 5.41 2012 103.0002
## 3267: 7 105 118 20120328 9.62 2012 103.0002
## 3268: 7 105 118 20120331 6.25 2012 103.0002
New_NYC<-split(NYC,NYC$year)
x<-New_NYC$`1999`
y<-New_NYC$`2012`
combine<-intersect(x$Site.Code,y$Site.Code)
print(combine)
## [1] "001.0005" "001.0012" "005.0080" "013.0011" "029.0005" "031.0003"
## [7] "063.2008" "067.1015" "085.0055" "101.0003"
pm<-mutate(pm,Site.Code = paste(pm$County.Code,pm$Site.ID,sep='.'))
counting<-filter(pm,Site.Code %in% combine)
grouping = counting %>% group_by(Site.Code) %>% summarize(total_count = n())
arrange(grouping,total_count)
## # A tibble: 10 × 2
## Site.Code total_count
## <chr> <int>
## 1 085.0055 38
## 2 001.0012 92
## 3 005.0080 92
## 4 029.0005 94
## 5 063.2008 152
## 6 067.1015 153
## 7 001.0005 186
## 8 031.0003 198
## 9 013.0011 213
## 10 101.0003 527
pmsub<-subset(pm,pm$State.Code == "36" & pm$County.Code == "101" & pm$Site.ID == "0003")
pmsub
## X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## 1: RD I 36 101 0003 88101 1
## 2: RD I 36 101 0003 88101 1
## 3: RD I 36 101 0003 88101 1
## 4: RD I 36 101 0003 88101 1
## 5: RD I 36 101 0003 88101 1
## ---
## 179: RD I 36 101 0003 88101 1
## 180: RD I 36 101 0003 88101 1
## 181: RD I 36 101 0003 88101 1
## 182: RD I 36 101 0003 88101 1
## 183: RD I 36 101 0003 88101 1
## Sample.Duration Unit Method Date PM year Site.Code
## 1: 7 105 118 19990802 NA 1999 101.0003
## 2: 7 105 118 19990803 3.90 1999 101.0003
## 3: 7 105 118 19990804 11.80 1999 101.0003
## 4: 7 105 118 19990805 12.00 1999 101.0003
## 5: 7 105 118 19990806 8.50 1999 101.0003
## ---
## 179: 7 105 118 20120319 NA 2012 101.0003
## 180: 7 105 118 20120322 6.58 2012 101.0003
## 181: 7 105 118 20120325 4.58 2012 101.0003
## 182: 7 105 118 20120328 5.95 2012 101.0003
## 183: 7 105 118 20120331 4.66 2012 101.0003
temp<-ymd(pmsub$Date)
pmsub$yday<-yday(temp)
pmsub
## X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## 1: RD I 36 101 0003 88101 1
## 2: RD I 36 101 0003 88101 1
## 3: RD I 36 101 0003 88101 1
## 4: RD I 36 101 0003 88101 1
## 5: RD I 36 101 0003 88101 1
## ---
## 179: RD I 36 101 0003 88101 1
## 180: RD I 36 101 0003 88101 1
## 181: RD I 36 101 0003 88101 1
## 182: RD I 36 101 0003 88101 1
## 183: RD I 36 101 0003 88101 1
## Sample.Duration Unit Method Date PM year Site.Code yday
## 1: 7 105 118 19990802 NA 1999 101.0003 214
## 2: 7 105 118 19990803 3.90 1999 101.0003 215
## 3: 7 105 118 19990804 11.80 1999 101.0003 216
## 4: 7 105 118 19990805 12.00 1999 101.0003 217
## 5: 7 105 118 19990806 8.50 1999 101.0003 218
## ---
## 179: 7 105 118 20120319 NA 2012 101.0003 79
## 180: 7 105 118 20120322 6.58 2012 101.0003 82
## 181: 7 105 118 20120325 4.58 2012 101.0003 85
## 182: 7 105 118 20120328 5.95 2012 101.0003 88
## 183: 7 105 118 20120331 4.66 2012 101.0003 91
ggplot(pmsub, aes(x=yday, y=PM))+
geom_point()+
facet_grid(. ~ year)+
theme_bw()+
labs(x="Day of the Year",y="PM")