# load packages
library(lubridate)
library(dplyr)
library(ggplot2)
# read in dataset
# getwd()
prsa <- read.csv("../data/PRSA_data_2010.1.1-2014.12.31.csv", row.names = 1, colClasses = c("pm2.5" = "numeric",
                                                                                            "DEWP"="numeric"))
# Explanations for each variable
# No: row number 
# year: year of data in this row 
# month: month of data in this row 
# day: day of data in this row 
# hour: hour of data in this row 
# pm2.5: PM2.5 concentration (ug/m^3) 
# DEWP: Dew Point (℃) 
# TEMP: Temperature (℃) 
# PRES: Pressure (hPa) 
# cbwd: Combined wind direction 
# Iws: Cumulated wind speed (m/s) 
# Is: Cumulated hours of snow 
# Ir: Cumulated hours of rain 
# Check distributions
summary(prsa)
      year          month             day             hour           pm2.5             DEWP        
 Min.   :2010   Min.   : 1.000   Min.   : 1.00   Min.   : 0.00   Min.   :  0.00   Min.   :-40.000  
 1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 5.75   1st Qu.: 29.00   1st Qu.:-10.000  
 Median :2012   Median : 7.000   Median :16.00   Median :11.50   Median : 72.00   Median :  2.000  
 Mean   :2012   Mean   : 6.524   Mean   :15.73   Mean   :11.50   Mean   : 98.61   Mean   :  1.817  
 3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:17.25   3rd Qu.:137.00   3rd Qu.: 15.000  
 Max.   :2014   Max.   :12.000   Max.   :31.00   Max.   :23.00   Max.   :994.00   Max.   : 28.000  
                                                                 NA's   :2067                      
      TEMP             PRES      cbwd            Iws               Is                 Ir         
 Min.   :-19.00   Min.   : 991   cv: 9387   Min.   :  0.45   Min.   : 0.00000   Min.   : 0.0000  
 1st Qu.:  2.00   1st Qu.:1008   NE: 4997   1st Qu.:  1.79   1st Qu.: 0.00000   1st Qu.: 0.0000  
 Median : 14.00   Median :1016   NW:14150   Median :  5.37   Median : 0.00000   Median : 0.0000  
 Mean   : 12.45   Mean   :1016   SE:15290   Mean   : 23.89   Mean   : 0.05273   Mean   : 0.1949  
 3rd Qu.: 23.00   3rd Qu.:1025              3rd Qu.: 21.91   3rd Qu.: 0.00000   3rd Qu.: 0.0000  
 Max.   : 42.00   Max.   :1046              Max.   :585.60   Max.   :27.00000   Max.   :36.0000  
                                                                                                 
# pm2.5 has 2067 NA
# change "cv"" in cbwd to "SW"
levels(prsa$cbwd)[1] <- "SW"
# sort it to NE, NW, SE, SW
prsa$cbwd <- factor(prsa$cbwd, levels = c("NE", "NW", "SE", "SW"))
summary(prsa$cbwd)
   NE    NW    SE    SW 
 4997 14150 15290  9387 
# create datetime from year, month, day and hour
# sort the dataframe by datetime
prsa <- prsa %>%
  mutate(date = make_date(year, month, day),
         datetime = make_datetime(year, month, day, hour)) %>%
  arrange(datetime)
# plot the distribution of each variable
which(colnames(prsa) == "pm2.5")
[1] 5
which(colnames(prsa) == "Ir")
[1] 12
for(i in 5:12) {
  if(is.factor(prsa[,i])) {
    print(ggplot(prsa, aes(prsa[, i])) +
            geom_histogram(stat = "count") +
            xlab(colnames(prsa)[i]))
  }
  else {
    print(ggplot(prsa, aes(prsa[, i])) +
            geom_histogram(binwidth = 2) +
            xlab(colnames(prsa)[i]))
  }
}

# correlations between pm2.5 and other variables
for(i in 6:12) {
  if(!is.factor(prsa[,i])) {
    print(paste("pm2.5 & ", colnames(prsa)[i], sep = ""))
    print(cor(prsa$pm2.5, prsa[, i], use = "complete.obs"))
    print(ggplot(prsa, aes(prsa[,i], pm2.5)) +
            geom_point() +
            xlab(colnames(prsa)[i]))
  }
  else {
    print(ggplot(prsa, aes(prsa[,i], pm2.5)) +
            geom_boxplot() +
            xlab(colnames(prsa)[i]))
  }
}
[1] "pm2.5 & DEWP"
[1] 0.1714233
[1] "pm2.5 & TEMP"
[1] -0.090534
[1] "pm2.5 & PRES"
[1] -0.04728231
[1] "pm2.5 & Iws"
[1] -0.2477844
[1] "pm2.5 & Is"
[1] 0.01926558
[1] "pm2.5 & Ir"
[1] -0.05136871

# Why is there an N-shape in the plot of pm2.5 vs Iws? Are they outliers?
# Are there any correlations between those weather variables?
for(i in 6:12) {
  for(j in (i+1):12) {
    if(j <= 12 & j > i & !is.factor(prsa[,i]) & !is.factor(prsa[,j])) {
      print(paste(colnames(prsa)[i], "&", colnames(prsa)[j], sep = " "))
      print(cor(prsa[,i], prsa[,j]))
      print(ggplot(prsa, aes(prsa[,i], prsa[,j])) + 
              geom_point() +
              xlab(colnames(prsa)[i]) +
              ylab(colnames(prsa)[j]))
    }
  }
}
[1] "DEWP & TEMP"
[1] 0.8246331
[1] "DEWP & PRES"
[1] -0.7783461
[1] "DEWP & Iws"
[1] -0.2963987
[1] "DEWP & Is"
[1] -0.03441037
[1] "DEWP & Ir"
[1] 0.1250895
[1] "TEMP & PRES"
[1] -0.8266904
[1] "TEMP & Iws"
[1] -0.1546228
[1] "TEMP & Is"
[1] -0.09260097
[1] "TEMP & Ir"
[1] 0.04912147
[1] "PRES & Iws"
[1] 0.1853547
[1] "PRES & Is"
[1] 0.06902795
[1] "PRES & Ir"
[1] -0.07984317
[1] "Iws & Is"
[1] 0.02188285
[1] "Iws & Ir"
[1] -0.01012205
[1] "Is & Ir"
[1] -0.009547625

for(i in 6:12) {
  if(!is.factor(prsa[,i])) {
    print(ggplot(prsa, aes(cbwd, prsa[,i])) +
            geom_boxplot() +
            xlab("cbwd") +
            ylab(colnames(prsa)[i]))
  }
}

# dewp and temp have positive correlation
# dewp and pres have negative correlation
# temp and pres have negative correlation
# plot time series of pm2.5
ggplot(prsa, aes(datetime, pm2.5)) +
  geom_line() + 
  scale_x_datetime(date_breaks = "6 months", limits = c(as.POSIXct("2009-12-01"), as.POSIXct("2015-01-31"))) +
  xlab("") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

LS0tCnRpdGxlOiAiRXhwbG9yYXRvcnkgRGF0YSBBbmFseXNpcyBvbiBCZWlqaW5nIFBNMi41IERhdGFzZXQiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyfQojIGxvYWQgcGFja2FnZXMKbGlicmFyeShsdWJyaWRhdGUpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoZ2dwbG90MikKYGBgCgpgYGB7cn0KIyByZWFkIGluIGRhdGFzZXQKIyBnZXR3ZCgpCnByc2EgPC0gcmVhZC5jc3YoIi4uL2RhdGEvUFJTQV9kYXRhXzIwMTAuMS4xLTIwMTQuMTIuMzEuY3N2Iiwgcm93Lm5hbWVzID0gMSwgY29sQ2xhc3NlcyA9IGMoInBtMi41IiA9ICJudW1lcmljIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAiREVXUCI9Im51bWVyaWMiKSkKIyBFeHBsYW5hdGlvbnMgZm9yIGVhY2ggdmFyaWFibGUKIyBObzogcm93IG51bWJlciAKIyB5ZWFyOiB5ZWFyIG9mIGRhdGEgaW4gdGhpcyByb3cgCiMgbW9udGg6IG1vbnRoIG9mIGRhdGEgaW4gdGhpcyByb3cgCiMgZGF5OiBkYXkgb2YgZGF0YSBpbiB0aGlzIHJvdyAKIyBob3VyOiBob3VyIG9mIGRhdGEgaW4gdGhpcyByb3cgCiMgcG0yLjU6IFBNMi41IGNvbmNlbnRyYXRpb24gKHVnL21eMykgCiMgREVXUDogRGV3IFBvaW50ICjDouKAnsaSKSAKIyBURU1QOiBUZW1wZXJhdHVyZSAow6LigJ7GkikgCiMgUFJFUzogUHJlc3N1cmUgKGhQYSkgCiMgY2J3ZDogQ29tYmluZWQgd2luZCBkaXJlY3Rpb24gCiMgSXdzOiBDdW11bGF0ZWQgd2luZCBzcGVlZCAobS9zKSAKIyBJczogQ3VtdWxhdGVkIGhvdXJzIG9mIHNub3cgCiMgSXI6IEN1bXVsYXRlZCBob3VycyBvZiByYWluIApgYGAKCmBgYHtyfQojIGNoZWNrIGRpc3RyaWJ1dGlvbnMKc3VtbWFyeShwcnNhKQojIHBtMi41IGhhcyAyMDY3IE5BCmBgYAoKYGBge3J9CiMgY2hhbmdlICJjdiIiIGluIGNid2QgdG8gIlNXIgpsZXZlbHMocHJzYSRjYndkKVsxXSA8LSAiU1ciCiMgc29ydCBpdCB0byBORSwgTlcsIFNFLCBTVwpwcnNhJGNid2QgPC0gZmFjdG9yKHByc2EkY2J3ZCwgbGV2ZWxzID0gYygiTkUiLCAiTlciLCAiU0UiLCAiU1ciKSkKc3VtbWFyeShwcnNhJGNid2QpCmBgYAoKYGBge3J9CiMgY3JlYXRlIGRhdGV0aW1lIGZyb20geWVhciwgbW9udGgsIGRheSBhbmQgaG91cgojIHNvcnQgdGhlIGRhdGFmcmFtZSBieSBkYXRldGltZQpwcnNhIDwtIHByc2EgJT4lCiAgbXV0YXRlKGRhdGUgPSBtYWtlX2RhdGUoeWVhciwgbW9udGgsIGRheSksCiAgICAgICAgIGRhdGV0aW1lID0gbWFrZV9kYXRldGltZSh5ZWFyLCBtb250aCwgZGF5LCBob3VyKSkgJT4lCiAgYXJyYW5nZShkYXRldGltZSkKYGBgCgpgYGB7ciwgZWNobz1UUlVFfQojIHBsb3QgdGhlIGRpc3RyaWJ1dGlvbiBvZiBlYWNoIHZhcmlhYmxlCndoaWNoKGNvbG5hbWVzKHByc2EpID09ICJwbTIuNSIpCndoaWNoKGNvbG5hbWVzKHByc2EpID09ICJJciIpCmZvcihpIGluIDU6MTIpIHsKICBpZihpcy5mYWN0b3IocHJzYVssaV0pKSB7CiAgICBwcmludChnZ3Bsb3QocHJzYSwgYWVzKHByc2FbLCBpXSkpICsKICAgICAgICAgICAgZ2VvbV9oaXN0b2dyYW0oc3RhdCA9ICJjb3VudCIpICsKICAgICAgICAgICAgeGxhYihjb2xuYW1lcyhwcnNhKVtpXSkpCiAgfQogIGVsc2UgewogICAgcHJpbnQoZ2dwbG90KHByc2EsIGFlcyhwcnNhWywgaV0pKSArCiAgICAgICAgICAgIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMikgKwogICAgICAgICAgICB4bGFiKGNvbG5hbWVzKHByc2EpW2ldKSkKICB9Cn0KYGBgCgpgYGB7ciwgZWNobz1UUlVFfQojIGNvcnJlbGF0aW9ucyBiZXR3ZWVuIHBtMi41IGFuZCBvdGhlciB2YXJpYWJsZXMKZm9yKGkgaW4gNjoxMikgewogIGlmKCFpcy5mYWN0b3IocHJzYVssaV0pKSB7CiAgICBwcmludChwYXN0ZSgicG0yLjUgJiAiLCBjb2xuYW1lcyhwcnNhKVtpXSwgc2VwID0gIiIpKQogICAgcHJpbnQoY29yKHByc2EkcG0yLjUsIHByc2FbLCBpXSwgdXNlID0gImNvbXBsZXRlLm9icyIpKQogICAgcHJpbnQoZ2dwbG90KHByc2EsIGFlcyhwcnNhWyxpXSwgcG0yLjUpKSArCiAgICAgICAgICAgIGdlb21fcG9pbnQoKSArCiAgICAgICAgICAgIHhsYWIoY29sbmFtZXMocHJzYSlbaV0pKQogIH0KICBlbHNlIHsKICAgIHByaW50KGdncGxvdChwcnNhLCBhZXMocHJzYVssaV0sIHBtMi41KSkgKwogICAgICAgICAgICBnZW9tX2JveHBsb3QoKSArCiAgICAgICAgICAgIHhsYWIoY29sbmFtZXMocHJzYSlbaV0pKQogIH0KfQojIFdoeSBpcyB0aGVyZSBhbiBOLXNoYXBlIGluIHRoZSBwbG90IG9mIHBtMi41IHZzIEl3cz8gQXJlIHRoZXkgb3V0bGllcnM/CmBgYAoKYGBge3J9CiMgQXJlIHRoZXJlIGFueSBjb3JyZWxhdGlvbnMgYmV0d2VlbiB0aG9zZSB3ZWF0aGVyIHZhcmlhYmxlcz8KZm9yKGkgaW4gNjoxMikgewogIGZvcihqIGluIChpKzEpOjEyKSB7CiAgICBpZihqIDw9IDEyICYgaiA+IGkgJiAhaXMuZmFjdG9yKHByc2FbLGldKSAmICFpcy5mYWN0b3IocHJzYVssal0pKSB7CiAgICAgIHByaW50KHBhc3RlKGNvbG5hbWVzKHByc2EpW2ldLCAiJiIsIGNvbG5hbWVzKHByc2EpW2pdLCBzZXAgPSAiICIpKQogICAgICBwcmludChjb3IocHJzYVssaV0sIHByc2FbLGpdKSkKICAgICAgcHJpbnQoZ2dwbG90KHByc2EsIGFlcyhwcnNhWyxpXSwgcHJzYVssal0pKSArIAogICAgICAgICAgICAgIGdlb21fcG9pbnQoKSArCiAgICAgICAgICAgICAgeGxhYihjb2xuYW1lcyhwcnNhKVtpXSkgKwogICAgICAgICAgICAgIHlsYWIoY29sbmFtZXMocHJzYSlbal0pKQogICAgfQogIH0KfQoKZm9yKGkgaW4gNjoxMikgewogIGlmKCFpcy5mYWN0b3IocHJzYVssaV0pKSB7CiAgICBwcmludChnZ3Bsb3QocHJzYSwgYWVzKGNid2QsIHByc2FbLGldKSkgKwogICAgICAgICAgICBnZW9tX2JveHBsb3QoKSArCiAgICAgICAgICAgIHhsYWIoImNid2QiKSArCiAgICAgICAgICAgIHlsYWIoY29sbmFtZXMocHJzYSlbaV0pKQogIH0KfQojIGRld3AgYW5kIHRlbXAgaGF2ZSBwb3NpdGl2ZSBjb3JyZWxhdGlvbgojIGRld3AgYW5kIHByZXMgaGF2ZSBuZWdhdGl2ZSBjb3JyZWxhdGlvbgojIHRlbXAgYW5kIHByZXMgaGF2ZSBuZWdhdGl2ZSBjb3JyZWxhdGlvbgpgYGAKCmBgYHtyfQojIHBsb3QgdGltZSBzZXJpZXMgb2YgcG0yLjUKZ2dwbG90KHByc2EsIGFlcyhkYXRldGltZSwgcG0yLjUpKSArCiAgZ2VvbV9saW5lKCkgKyAKICBzY2FsZV94X2RhdGV0aW1lKGRhdGVfYnJlYWtzID0gIjYgbW9udGhzIiwgbGltaXRzID0gYyhhcy5QT1NJWGN0KCIyMDA5LTEyLTAxIiksIGFzLlBPU0lYY3QoIjIwMTUtMDEtMzEiKSkpICsKICB4bGFiKCIiKSArCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA0NSwgaGp1c3QgPSAxKSkKYGBgCgo=