Original Code

The original code is the first of 26 plots here.

# Pick axis break settings for a time span given in days.
#
# `days` is binned into the right-closed intervals (-1, 7], (7, 12],
# (12, 56], (56, 180], (180, 600], (600, 5000] and (5000, Inf]; the bin
# number selects the major break step, the minor break step and the date
# label format. Values outside every interval (e.g. days <= -1) yield NA
# entries.
#
# Returns a list with character elements `major`, `minor` and `format`,
# each the same length as `days`.
date_breaks <- function(days) {
  span_bounds <- c(-1, 7, 12, 56, 180, 600, 5000, Inf)
  bucket <- cut(days, breaks = span_bounds, labels = FALSE)

  major_steps <- c("days", "2 days", "weeks", "months", "3 months",
                   "years", "5 years")
  minor_steps <- c("days", "days", "days", "weeks", "months", "months",
                   "years")
  label_formats <- c("%d-%b", "%d-%b", "%d-%b", "%b-%Y", "%b-%Y", "%Y",
                     "%Y")

  list(
    major = major_steps[bucket],
    minor = minor_steps[bucket],
    format = label_formats[bucket]
  )
}
# Render `x` as text without scientific notation, using a single space as
# the thousands separator. Any further arguments are passed on to format().
formatter <- function(x, ...) {
  format(x, big.mark = " ", scientific = FALSE, ...)
}
# Original (pre-refactoring) implementation: plot the number of relays and
# bridges between `start` and `end` and save the figure to `path`.
#
# start, end: date strings. They are compared as text against the CSV's
#   date column and parsed with "%Y-%m-%d" below, so zero-padded YYYY-MM-DD
#   input is presumably expected -- TODO confirm.
# path: output file name handed to ggsave().
#
# NOTE(review): melt() and date_format() are not defined in this section;
# presumably they come from reshape/reshape2 and scales -- confirm.
plot_networksize <- function(start, end, path) {
  # Cap the end date at two days before today. Both operands are character
  # strings here, so min() compares them as text.
  end <- min(end, as.character(Sys.Date() - 2))
  s <- read.csv(paste("/srv/metrics.torproject.org/metrics/shared/stats/",
                      "servers.csv", sep = ""), stringsAsFactors = FALSE)
  # Keep rows inside the date range whose flag, country, version, platform
  # and ec2bridge fields are all empty strings.
  s <- s[s$date >= start & s$date <= end & s$flag == '' &
           s$country == '' & s$version == '' & s$platform == '' &
           s$ec2bridge == '', ]
  # Reduce to the three plotted columns, parsing the date text into Date.
  s <- data.frame(date = as.Date(s$date, "%Y-%m-%d"), relays = s$relays,
                  bridges = s$bridges)
  # Every calendar day in the requested range ...
  dates <- seq(from = as.Date(start, "%Y-%m-%d"),
               to = as.Date(end, "%Y-%m-%d"), by="1 day")
  # ... minus the days present in the data. The result is converted back
  # with an explicit origin below, i.e. it is treated as plain day numbers.
  missing <- setdiff(dates, as.Date(s$date, origin = "1970-01-01"))
  # Append an all-NA row for each absent day (presumably so the lines show
  # gaps instead of bridging them -- confirm).
  if (length(missing) > 0)
    s <- rbind(s,
               data.frame(date = as.Date(missing, origin = "1970-01-01"),
                          relays = NA, bridges = NA))
  # Long format keyed by date; the plot below reads the resulting
  # `variable` and `value` columns.
  networksize <- melt(s, id = "date")
  # From here on the local list `date_breaks` shadows the date_breaks()
  # function; it holds the break settings for the span covered, in days.
  date_breaks <- date_breaks(
    as.numeric(max(as.Date(networksize$date, "%Y-%m-%d")) -
                 min(as.Date(networksize$date, "%Y-%m-%d"))))
  # One line per variable; the y axis runs from 0 to the largest non-NA
  # value.
  ggplot(networksize, aes(x = as.Date(date, "%Y-%m-%d"), y = value,
                          colour = variable)) + geom_line(size = 1) +
    scale_x_date(name = paste("\nThe Tor Project - ",
                              "https://metrics.torproject.org/", sep = ""),
                 labels = date_format(date_breaks$format),
                 breaks = date_breaks$major,
                 minor_breaks = date_breaks$minor) +
    scale_y_continuous(name = "", limits = c(0, max(networksize$value,
                                                    na.rm = TRUE))) +
    scale_colour_hue("", breaks = c("relays", "bridges"),
                     labels = c("Relays", "Bridges")) +
    ggtitle("Number of relays\n")
  # No plot object is passed, so ggsave() falls back on its default plot.
  ggsave(filename = path, width = 8, height = 5, dpi = 72)
}

Refactored Code

Libraries

library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(scales)

Data Preparation

# Download the published server statistics and reshape them for plotting.
# Only rows where flag, country, version, platform and ec2bridge are all
# missing are kept (presumably the network-wide totals -- confirm). The
# result is a long table with columns `date`, `device` ("bridges" or
# "relays") and `count`.
servers <- read_csv("https://metrics.torproject.org/stats/servers.csv") %>%
  filter(
    is.na(flag),
    is.na(country),
    is.na(version),
    is.na(platform),
    is.na(ec2bridge)
  ) %>%
  select(date, bridges, relays) %>%
  mutate(bridges = as.integer(bridges), date = as.Date(date)) %>%
  gather(key = device, value = count, -date)

Function: plot_networksize()

# Plot the number of relays and bridges over a date range and save it as a
# PNG file.
#
# start, end: range limits, given as Date objects or as strings that
#   as.Date() can parse (e.g. "2017-5-25"). `end` is capped at two days
#   before today.
# path: file name of the PNG to write.
#
# Reads the global `servers` table (columns date, device, count) and
# returns the result of ggsave().
plot_networksize <- function(start, end, path) {
  # Compare as dates, not as text: a string comparison misorders inputs
  # that are not zero-padded ("2017-5-25" sorts after "2017-11-30").
  start_date <- as.Date(start)
  end_date <- min(as.Date(end), Sys.Date() - 2)

  network_plot <- servers %>%
    filter(date >= start_date, date <= end_date) %>%
    ggplot(aes(date, count, color = device)) +
    geom_line(size = 0.8) +
    scale_x_date(labels = date_format("%b-%Y")) +
    scale_colour_hue("", breaks = c("relays", "bridges")) +
    ggtitle("Number of Relays\n") +
    theme(plot.title = element_text(hjust = 0.5)) +
    labs(y = "", x = "\nThe Tor Project - https://metrics.torproject.org")

  # Pass the plot explicitly so the saved figure does not depend on
  # whatever plot happened to be created last in the session.
  ggsave(path, plot = network_plot, device = "png", width = 8, height = 5,
         dpi = 72)
}

Unit Tests

Tested against the results for the same periods at https://metrics.torproject.org/networksize.html

2007-10-27 to 2017-5-25

# Longest range tested (roughly ten years); the plot is saved to
# ./images/relays1.png (assumes ./images already exists -- TODO confirm).
plot_networksize("2007-10-27", "2017-5-25", "./images/relays1.png")

2016-6-1 to 2017-5-25

# About one year of data; the plot is saved to ./images/relays2.png.
plot_networksize("2016-6-1", "2017-5-25", "./images/relays2.png")

2015-1-1 to 2015-5-25

# Short range of under five months; the plot is saved to
# ./images/relays3.png.
plot_networksize("2015-1-1", "2015-5-25", "./images/relays3.png")

2013-1-1 to 2016-12-31

# Four calendar years, 2013 through 2016; the plot is saved to
# ./images/relays4.png.
plot_networksize("2013-1-1", "2016-12-31", "./images/relays4.png")

2012-5-1 to 2014-6-30

# A little over two years; the plot is saved to ./images/relays5.png.
plot_networksize("2012-5-1", "2014-6-30", "./images/relays5.png")