# Bootstrap: make sure pacman is installed, then let it install (if missing)
# and attach the packages used by this report.
# requireNamespace() only tests availability; pacman is always called through
# the `pacman::` prefix below, so it does not need to be attached.
if (!requireNamespace("pacman", quietly = TRUE)) {
    install.packages("pacman")
}
# RColorBrewer is called as RColorBrewer::brewer.pal() when the chart is
# built; listing it here guarantees it is installed before that point.
pacman::p_load(data.table, zoo, dygraphs, RColorBrewer)

# Chunk defaults for the whole document: fixed figure size; warnings, messages
# and printed results are suppressed in the rendered output.
# NOTE(review): knitr documents fig.keep as one of "high", "none", "all",
# "first", "last" or a numeric index vector; the logical value is preserved
# from the original -- confirm the intended setting.
knitr::opts_chunk$set(
    fig.width = 9, fig.height = 5,
    warning = FALSE, message = FALSE, results = "hide", fig.keep = TRUE)

Load Data

Data Source

Latest date: 2017/05/31

# Read the three yearly hourly PM2.5 exports and stack them into one table.
# skip = 3 makes fread() start parsing after the first three lines of each
# file (presumably a preamble above the header row -- confirm against the
# files).
# The 2017 file name contains a plain space: "\ " is not a valid escape
# sequence in an R string literal and prevents the script from parsing.
quality = rbind(
    fread("Beijing_2015_HourlyPM25_created20160201.csv", skip = 3),
    fread("Beijing_2016_HourlyPM25_created20170201.csv", skip = 3),
    fread("Beijing_2017_HourlyPM25_created20170705 (3).csv", skip = 3)
)

Simple Cleaning

There are 3 duplicate entries:

# Count the rows whose "Date (LST)" value already appeared in an earlier row.
# The column name contains a plain space; the "\ " escape used previously is
# rejected by the R parser.
sum(duplicated(quality[["Date (LST)"]]))

For each duplicated timestamp, keep only the last entry in the table:

# Drop duplicated "Date (LST)" rows. fromLast = TRUE scans from the end of the
# table, so for every repeated timestamp the last occurrence is the one kept.
# The column name contains a plain space; "\ " is not a valid R string escape.
quality = quality[!duplicated(quality[["Date (LST)"]], fromLast = TRUE)]

Impute Missing Values

# Blank out every reading whose "QC Name" flag is anything other than "Valid",
# so that it is treated as missing from here on.
quality[`QC Name` != "Valid", Value := NA]
# Fill each gap with the last observation before it. na.rm = FALSE keeps the
# vector the same length as the table: NAs that precede the first valid
# reading have nothing to carry forward and stay NA.
quality[, Value := na.locf(Value, na.rm = FALSE)]

Parse dates

# Derive the calendar day of each reading from its "Date (LST)" string
# (month/day/year hour:minute).
# The time zone is given to strptime(), where it controls how the string is
# interpreted. Passing tz to as.Date() had no effect here: that argument only
# applies to POSIXct input, whereas strptime() returns POSIXlt, so the parse
# silently used the machine's local time zone.
quality[, date := as.Date(strptime(`Date (LST)`, "%m/%d/%Y %H:%M",
                                   tz = "Asia/Shanghai"))]

The Chart

# Collapse the hourly readings into one row per day: median, mean and the
# 10th / 90th percentiles of Value.
# na.rm = TRUE is required because the forward fill cannot replace NAs that
# come before the first valid reading, and quantile() raises an error on NA
# input unless they are removed.
# as.numeric() keeps the median column double for every group (median() can
# return an integer for integer input, and by-group results must share a type).
daily = quality[, .(median = as.numeric(median(Value, na.rm = TRUE)),
                    avg = mean(Value, na.rm = TRUE),
                    upper = quantile(Value, probs = 0.90, na.rm = TRUE),
                    lower = quantile(Value, probs = 0.1, na.rm = TRUE)),
                by = date]


# Interactive chart of the daily aggregates, opened on 2017-01-01 through the
# most recent day in the data.
dygraph(daily[, .(date, lower, avg, upper)],
        main = "Beijing Daily PM2.5 Concentration") %>%
    dyAxis("x", drawGrid = FALSE) %>%
    dyAxis("y", drawGrid = FALSE) %>%
    # Three column names = lower bound, value, upper bound: the daily mean is
    # drawn as the line with the 10th-90th percentile range shaded around it.
    dySeries(c("lower", "avg", "upper"), label = "PM2.5") %>%
    dyLimit(35, "US EPA 24-hr Guideline",
            labelLoc = "right", color = "green") %>%
    # dyLimit(25, "WHO 24-hr Guideline", labelLoc = "right", color = "grey") %>%
    dyOptions(colors = RColorBrewer::brewer.pal(3, "Set1")) %>%
    dyRangeSelector(dateWindow = c(as.Date("2017-01-01"), max(daily$date))) %>%
    dyLegend(width = 200, show = "follow")

References