# load packages
library(lubridate)
library(dplyr)
library(ggplot2)
# read in dataset
# getwd()
prsa <- read.csv("../data/PRSA_data_2010.1.1-2014.12.31.csv", row.names = 1, colClasses = c("pm2.5" = "numeric",
"DEWP"="numeric"))
# Explanations for each variable
# No: row number
# year: year of data in this row
# month: month of data in this row
# day: day of data in this row
# hour: hour of data in this row
# pm2.5: PM2.5 concentration (ug/m^3)
# DEWP: Dew Point (℃)
# TEMP: Temperature (℃)
# PRES: Pressure (hPa)
# cbwd: Combined wind direction
# Iws: Cumulated wind speed (m/s)
# Is: Cumulated hours of snow
# Ir: Cumulated hours of rain
# Check distributions
summary(prsa)
year month day hour pm2.5 DEWP
Min. :2010 Min. : 1.000 Min. : 1.00 Min. : 0.00 Min. : 0.00 Min. :-40.000
1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 5.75 1st Qu.: 29.00 1st Qu.:-10.000
Median :2012 Median : 7.000 Median :16.00 Median :11.50 Median : 72.00 Median : 2.000
Mean :2012 Mean : 6.524 Mean :15.73 Mean :11.50 Mean : 98.61 Mean : 1.817
3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:17.25 3rd Qu.:137.00 3rd Qu.: 15.000
Max. :2014 Max. :12.000 Max. :31.00 Max. :23.00 Max. :994.00 Max. : 28.000
NA's :2067
TEMP PRES cbwd Iws Is Ir
Min. :-19.00 Min. : 991 cv: 9387 Min. : 0.45 Min. : 0.00000 Min. : 0.0000
1st Qu.: 2.00 1st Qu.:1008 NE: 4997 1st Qu.: 1.79 1st Qu.: 0.00000 1st Qu.: 0.0000
Median : 14.00 Median :1016 NW:14150 Median : 5.37 Median : 0.00000 Median : 0.0000
Mean : 12.45 Mean :1016 SE:15290 Mean : 23.89 Mean : 0.05273 Mean : 0.1949
3rd Qu.: 23.00 3rd Qu.:1025 3rd Qu.: 21.91 3rd Qu.: 0.00000 3rd Qu.: 0.0000
Max. : 42.00 Max. :1046 Max. :585.60 Max. :27.00000 Max. :36.0000
# pm2.5 has 2067 NA
# change "cv"" in cbwd to "SW"
levels(prsa$cbwd)[1] <- "SW"
# sort it to NE, NW, SE, SW
prsa$cbwd <- factor(prsa$cbwd, levels = c("NE", "NW", "SE", "SW"))
summary(prsa$cbwd)
NE NW SE SW
4997 14150 15290 9387
# create datetime from year, month, day and hour
# sort the dataframe by datetime
prsa <- prsa %>%
mutate(date = make_date(year, month, day),
datetime = make_datetime(year, month, day, hour)) %>%
arrange(datetime)
# plot the distribution of each variable
which(colnames(prsa) == "pm2.5")
[1] 5
which(colnames(prsa) == "Ir")
[1] 12
for(i in 5:12) {
if(is.factor(prsa[,i])) {
print(ggplot(prsa, aes(prsa[, i])) +
geom_histogram(stat = "count") +
xlab(colnames(prsa)[i]))
}
else {
print(ggplot(prsa, aes(prsa[, i])) +
geom_histogram(binwidth = 2) +
xlab(colnames(prsa)[i]))
}
}








# correlations between pm2.5 and other variables
for(i in 6:12) {
if(!is.factor(prsa[,i])) {
print(paste("pm2.5 & ", colnames(prsa)[i], sep = ""))
print(cor(prsa$pm2.5, prsa[, i], use = "complete.obs"))
print(ggplot(prsa, aes(prsa[,i], pm2.5)) +
geom_point() +
xlab(colnames(prsa)[i]))
}
else {
print(ggplot(prsa, aes(prsa[,i], pm2.5)) +
geom_boxplot() +
xlab(colnames(prsa)[i]))
}
}
[1] "pm2.5 & DEWP"
[1] 0.1714233
[1] "pm2.5 & TEMP"
[1] -0.090534
[1] "pm2.5 & PRES"
[1] -0.04728231
[1] "pm2.5 & Iws"
[1] -0.2477844
[1] "pm2.5 & Is"
[1] 0.01926558
[1] "pm2.5 & Ir"
[1] -0.05136871







# Why is there an N-shape in the plot of pm2.5 vs Iws? Are they outliers?
# Are there any correlations between those weather variables?
for(i in 6:12) {
for(j in (i+1):12) {
if(j <= 12 & j > i & !is.factor(prsa[,i]) & !is.factor(prsa[,j])) {
print(paste(colnames(prsa)[i], "&", colnames(prsa)[j], sep = " "))
print(cor(prsa[,i], prsa[,j]))
print(ggplot(prsa, aes(prsa[,i], prsa[,j])) +
geom_point() +
xlab(colnames(prsa)[i]) +
ylab(colnames(prsa)[j]))
}
}
}
[1] "DEWP & TEMP"
[1] 0.8246331
[1] "DEWP & PRES"
[1] -0.7783461
[1] "DEWP & Iws"
[1] -0.2963987
[1] "DEWP & Is"
[1] -0.03441037
[1] "DEWP & Ir"
[1] 0.1250895
[1] "TEMP & PRES"
[1] -0.8266904
[1] "TEMP & Iws"
[1] -0.1546228
[1] "TEMP & Is"
[1] -0.09260097
[1] "TEMP & Ir"
[1] 0.04912147
[1] "PRES & Iws"
[1] 0.1853547
[1] "PRES & Is"
[1] 0.06902795
[1] "PRES & Ir"
[1] -0.07984317
[1] "Iws & Is"
[1] 0.02188285
[1] "Iws & Ir"
[1] -0.01012205
[1] "Is & Ir"
[1] -0.009547625















for(i in 6:12) {
if(!is.factor(prsa[,i])) {
print(ggplot(prsa, aes(cbwd, prsa[,i])) +
geom_boxplot() +
xlab("cbwd") +
ylab(colnames(prsa)[i]))
}
}






# dewp and temp have positive correlation
# dewp and pres have negative correlation
# temp and pres have negative correlation
# plot time series of pm2.5
ggplot(prsa, aes(datetime, pm2.5)) +
geom_line() +
scale_x_datetime(date_breaks = "6 months", limits = c(as.POSIXct("2009-12-01"), as.POSIXct("2015-01-31"))) +
xlab("") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

LS0tCnRpdGxlOiAiRXhwbG9yYXRvcnkgRGF0YSBBbmFseXNpcyBvbiBCZWlqaW5nIFBNMi41IERhdGFzZXQiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyfQojIGxvYWQgcGFja2FnZXMKbGlicmFyeShsdWJyaWRhdGUpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoZ2dwbG90MikKYGBgCgpgYGB7cn0KIyByZWFkIGluIGRhdGFzZXQKIyBnZXR3ZCgpCnByc2EgPC0gcmVhZC5jc3YoIi4uL2RhdGEvUFJTQV9kYXRhXzIwMTAuMS4xLTIwMTQuMTIuMzEuY3N2Iiwgcm93Lm5hbWVzID0gMSwgY29sQ2xhc3NlcyA9IGMoInBtMi41IiA9ICJudW1lcmljIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAiREVXUCI9Im51bWVyaWMiKSkKIyBFeHBsYW5hdGlvbnMgZm9yIGVhY2ggdmFyaWFibGUKIyBObzogcm93IG51bWJlciAKIyB5ZWFyOiB5ZWFyIG9mIGRhdGEgaW4gdGhpcyByb3cgCiMgbW9udGg6IG1vbnRoIG9mIGRhdGEgaW4gdGhpcyByb3cgCiMgZGF5OiBkYXkgb2YgZGF0YSBpbiB0aGlzIHJvdyAKIyBob3VyOiBob3VyIG9mIGRhdGEgaW4gdGhpcyByb3cgCiMgcG0yLjU6IFBNMi41IGNvbmNlbnRyYXRpb24gKHVnL21eMykgCiMgREVXUDogRGV3IFBvaW50ICjDouKAnsaSKSAKIyBURU1QOiBUZW1wZXJhdHVyZSAow6LigJ7GkikgCiMgUFJFUzogUHJlc3N1cmUgKGhQYSkgCiMgY2J3ZDogQ29tYmluZWQgd2luZCBkaXJlY3Rpb24gCiMgSXdzOiBDdW11bGF0ZWQgd2luZCBzcGVlZCAobS9zKSAKIyBJczogQ3VtdWxhdGVkIGhvdXJzIG9mIHNub3cgCiMgSXI6IEN1bXVsYXRlZCBob3VycyBvZiByYWluIApgYGAKCmBgYHtyfQojIGNoZWNrIGRpc3RyaWJ1dGlvbnMKc3VtbWFyeShwcnNhKQojIHBtMi41IGhhcyAyMDY3IE5BCmBgYAoKYGBge3J9CiMgY2hhbmdlICJjdiIiIGluIGNid2QgdG8gIlNXIgpsZXZlbHMocHJzYSRjYndkKVsxXSA8LSAiU1ciCiMgc29ydCBpdCB0byBORSwgTlcsIFNFLCBTVwpwcnNhJGNid2QgPC0gZmFjdG9yKHByc2EkY2J3ZCwgbGV2ZWxzID0gYygiTkUiLCAiTlciLCAiU0UiLCAiU1ciKSkKc3VtbWFyeShwcnNhJGNid2QpCmBgYAoKYGBge3J9CiMgY3JlYXRlIGRhdGV0aW1lIGZyb20geWVhciwgbW9udGgsIGRheSBhbmQgaG91cgojIHNvcnQgdGhlIGRhdGFmcmFtZSBieSBkYXRldGltZQpwcnNhIDwtIHByc2EgJT4lCiAgbXV0YXRlKGRhdGUgPSBtYWtlX2RhdGUoeWVhciwgbW9udGgsIGRheSksCiAgICAgICAgIGRhdGV0aW1lID0gbWFrZV9kYXRldGltZSh5ZWFyLCBtb250aCwgZGF5LCBob3VyKSkgJT4lCiAgYXJyYW5nZShkYXRldGltZSkKYGBgCgpgYGB7ciwgZWNobz1UUlVFfQojIHBsb3QgdGhlIGRpc3RyaWJ1dGlvbiBvZiBlYWNoIHZhcmlhYmxlCndoaWNoKGNvbG5hbWVzKHByc2EpID09ICJwbTIuNSIpCndoaWNoKGNvbG5hbWVzKHByc2EpID09ICJJciIpCmZvcihpIGluIDU6MTIpIHsKICBpZihpcy5mYWN0b3IocHJzYVssaV0pKSB7CiAgICBwcmludChnZ3Bsb3QocHJzYSwgYWVzKHByc2FbLCBpXSkpICsKICAgICAgICAgICAgZ2VvbV9oaXN0b2dyYW0oc3RhdCA9ICJjb3VudCIpICsKICAgICAgICAgICAgeGxhYihjb2xuYW1lcyhwcnNhKVtpXSkpCiAgfQogIGVsc2UgewogICAgcHJpbnQoZ2dwbG90KHByc2EsIGFlcyhwcnNhWywgaV0pKSArCiAgICAgICAgICAgIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMikgKwogICAgICAgICAgICB4bGFiKGNvbG5hbWVzKHByc2EpW2ldKSkKICB9Cn0KYGBgCgpgYGB7ciwgZWNobz1UUlVFfQojIGNvcnJlbGF0aW9ucyBiZXR3ZWVuIHBtMi41IGFuZCBvdGhlciB2YXJpYWJsZXMKZm9yKGkgaW4gNjoxMikgewogIGlmKCFpcy5mYWN0b3IocHJzYVssaV0pKSB7CiAgICBwcmludChwYXN0ZSgicG0yLjUgJiAiLCBjb2xuYW1lcyhwcnNhKVtpXSwgc2VwID0gIiIpKQogICAgcHJpbnQoY29yKHByc2EkcG0yLjUsIHByc2FbLCBpXSwgdXNlID0gImNvbXBsZXRlLm9icyIpKQogICAgcHJpbnQoZ2dwbG90KHByc2EsIGFlcyhwcnNhWyxpXSwgcG0yLjUpKSArCiAgICAgICAgICAgIGdlb21fcG9pbnQoKSArCiAgICAgICAgICAgIHhsYWIoY29sbmFtZXMocHJzYSlbaV0pKQogIH0KICBlbHNlIHsKICAgIHByaW50KGdncGxvdChwcnNhLCBhZXMocHJzYVssaV0sIHBtMi41KSkgKwogICAgICAgICAgICBnZW9tX2JveHBsb3QoKSArCiAgICAgICAgICAgIHhsYWIoY29sbmFtZXMocHJzYSlbaV0pKQogIH0KfQojIFdoeSBpcyB0aGVyZSBhbiBOLXNoYXBlIGluIHRoZSBwbG90IG9mIHBtMi41IHZzIEl3cz8gQXJlIHRoZXkgb3V0bGllcnM/CmBgYAoKYGBge3J9CiMgQXJlIHRoZXJlIGFueSBjb3JyZWxhdGlvbnMgYmV0d2VlbiB0aG9zZSB3ZWF0aGVyIHZhcmlhYmxlcz8KZm9yKGkgaW4gNjoxMikgewogIGZvcihqIGluIChpKzEpOjEyKSB7CiAgICBpZihqIDw9IDEyICYgaiA+IGkgJiAhaXMuZmFjdG9yKHByc2FbLGldKSAmICFpcy5mYWN0b3IocHJzYVssal0pKSB7CiAgICAgIHByaW50KHBhc3RlKGNvbG5hbWVzKHByc2EpW2ldLCAiJiIsIGNvbG5hbWVzKHByc2EpW2pdLCBzZXAgPSAiICIpKQogICAgICBwcmludChjb3IocHJzYVssaV0sIHByc2FbLGpdKSkKICAgICAgcHJpbnQoZ2dwbG90KHByc2EsIGFlcyhwcnNhWyxpXSwgcHJzYVssal0pKSArIAogICAgICAgICAgICAgIGdlb21fcG9pbnQoKSArCiAgICAgICAgICAgICAgeGxhYihjb2xuYW1lcyhwcnNhKVtpXSkgKwogICAgICAgICAgICAgIHlsYWIoY29sbmFtZXMocHJzYSlbal0pKQogICAgfQogIH0KfQoKZm9yKGkgaW4gNjoxMikgewogIGlmKCFpcy5mYWN0b3IocHJzYVssaV0pKSB7CiAgICBwcmludChnZ3Bsb3QocHJzYSwgYWVzKGNid2QsIHByc2FbLGldKSkgKwogICAgICAgICAgICBnZW9tX2JveHBsb3QoKSArCiAgICAgICAgICAgIHhsYWIoImNid2QiKSArCiAgICAgICAgICAgIHlsYWIoY29sbmFtZXMocHJzYSlbaV0pKQogIH0KfQojIGRld3AgYW5kIHRlbXAgaGF2ZSBwb3NpdGl2ZSBjb3JyZWxhdGlvbgojIGRld3AgYW5kIHByZXMgaGF2ZSBuZWdhdGl2ZSBjb3JyZWxhdGlvbgojIHRlbXAgYW5kIHByZXMgaGF2ZSBuZWdhdGl2ZSBjb3JyZWxhdGlvbgpgYGAKCmBgYHtyfQojIHBsb3QgdGltZSBzZXJpZXMgb2YgcG0yLjUKZ2dwbG90KHByc2EsIGFlcyhkYXRldGltZSwgcG0yLjUpKSArCiAgZ2VvbV9saW5lKCkgKyAKICBzY2FsZV94X2RhdGV0aW1lKGRhdGVfYnJlYWtzID0gIjYgbW9udGhzIiwgbGltaXRzID0gYyhhcy5QT1NJWGN0KCIyMDA5LTEyLTAxIiksIGFzLlBPU0lYY3QoIjIwMTUtMDEtMzEiKSkpICsKICB4bGFiKCIiKSArCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA0NSwgaGp1c3QgPSAxKSkKYGBgCgo=