CRAN download log for tableone

References

Load packages

library(data.table)
library(tidyverse)

## Configure parallelization
## Parallel backend for foreach (also loads foreach and parallel; includes doMC)
library(doParallel)
## Reproducible parallelization
library(doRNG)
## Detect core count, capped at 10 workers.
## parallel::detectCores() returns NA when the count cannot be determined, and
## min(NA, 10) would pass that NA on to every setting below, so fall back to a
## single worker in that case.
nCoresDetected <- parallel::detectCores()
nCores <- if (is.na(nCoresDetected)) 1L else min(nCoresDetected, 10)
## Used by parallel::mclapply() as default
options(mc.cores = nCores)
## Used by doParallel as default
options(cores = nCores)
## Register doParallel as the parallel backend with foreach
## http://stackoverflow.com/questions/28989855/the-difference-between-domc-and-doparallel-in-r
doParallel::registerDoParallel(cores = nCores)
## Report multicore use
cat("### Using", foreach::getDoParWorkers(), "cores\n")
## ### Using 10 cores
cat("### Using", foreach::getDoParName(), "as backend\n")
## ### Using doParallelMC as backend

Download data files and create a unified data frame

## First day of the download window
startDay <- as.Date("2014-02-01")
## Last day of the window: two days before today, so that a file that does
## not exist yet is not requested
endDay   <- Sys.Date() - 2
## One entry per calendar day from startDay to endDay (inclusive)
allDays  <- seq(from = startDay, to = endDay, by = "day")
## Four-digit year of each day; it forms a path segment of the URL
year     <- format(allDays, format = "%Y")

## One URL per day: <base>/<year>/<yyyy-mm-dd>.csv.gz
urls <- paste0("http://cran-logs.rstudio.com/", year, "/", allDays, ".csv.gz")

## Download files
## For every URL whose filtered log is not yet present locally, fetch the raw
## daily log, keep only the lines mentioning tableone, and store the result
## (gzip-compressed) under the URL's base name inside dirName.
dirName <- "./cran.tableone.log.d/"
## download.file() does not create the destination folder, so make sure it
## exists before the first download
if (!dir.exists(dirName)) {
    dir.create(dirName, recursive = TRUE)
}
for (url in urls) {
    ## Destination file name (filtered log)
    destfile <- paste0(dirName, basename(url))
    ## If not already present download
    if (!file.exists(destfile)) {
        ## Download the raw log straight to the backup name ("<destfile>~").
        ## destfile itself only appears once filtering starts, so a failed or
        ## interrupted download is never mistaken for a finished file on the
        ## next run.
        rawfile <- paste0(destfile, "~")
        download.file(url = url, destfile = rawfile, mode = "wb")
        ## Parse outside R for tableone; paths are shell-quoted so that unusual
        ## characters in them cannot break the command line
        system(sprintf("gzip -cd %s | grep tableone | gzip > %s",
                       shQuote(rawfile), shQuote(destfile)))
        ## Delete backup file
        unlink(rawfile)
    }
}

## Load files in the folder into a single data frame
## Only names that literally end in ".csv.gz" are picked up; the dots are
## escaped because an unescaped "." in a regular expression matches any
## character.
filePaths <- paste0(dirName, list.files(dirName, pattern = "\\.csv\\.gz$"))
## Each file is read in parallel with a fixed column layout (the column names
## are supplied here via col_names), then all pieces are stacked row-wise.
dfCran <- mclapply(filePaths,
                   function(file) {
                       ## Load individual file
                       read_csv(file = file,
                                col_names = c("date", "time", "size", "r_version", "r_arch", "r_os", "package",
                                              "version", "country", "ip_id"),
                                col_types = cols(
                                    date = col_date(format = ""),
                                    time = col_time(format = ""),
                                    size = col_integer(),
                                    r_version = col_character(),
                                    r_arch = col_character(),
                                    r_os = col_character(),
                                    package = col_character(),
                                    version = col_character(),
                                    country = col_character(),
                                    ip_id = col_integer()
                                ),
                                progress = FALSE)
                   }) %>%
    bind_rows()

Data table creation

## Build a data.table from the combined data frame
dtCran <- as.data.table(dfCran)

## Fix column types: date as Date, package and country as factors
dtCran[, `:=`(date    = as.Date(date),
              package = factor(package),
              country = factor(country))]
## Derive calendar columns from the date: weekday name and "year-week" label
dtCran[, `:=`(weekday = weekdays(date),
              week    = format(date, "%Y-%W"))]

## Sort and index the table by these columns
setkeyv(dtCran, c("package", "date", "week", "country"))

Check the download history of tableone

## Keep only the rows whose package is tableone
isTableOne <- dtCran$package %in% "tableone"
dtTableOne <- dtCran[isTableOne, ]

## Print a column-by-column summary
summary(dtTableOne)
##       date                time               size         r_version            r_arch              r_os          
##  Min.   :2014-02-19   Length:117366     Min.   :   339   Length:117366      Length:117366      Length:117366     
##  1st Qu.:2017-08-29   Class1:hms        1st Qu.:124382   Class :character   Class :character   Class :character  
##  Median :2018-07-30   Class2:difftime   Median :235978   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2018-04-01   Mode  :numeric    Mean   :183631                                                           
##  3rd Qu.:2019-03-21                     3rd Qu.:242927                                                           
##  Max.   :2019-09-23                     Max.   :278612                                                           
##                                                                                                                  
##      package         version             country          ip_id         weekday              week          
##  tableone:117366   Length:117366      US     :44572   Min.   :    1   Length:117366      Length:117366     
##  A3      :     0   Class :character   GB     : 7918   1st Qu.: 2474   Class :character   Class :character  
##  abbyyR  :     0   Mode  :character   JP     : 7513   Median :10226   Mode  :character   Mode  :character  
##  abc     :     0                      NL     : 5537   Mean   :16385                                        
##  abc.data:     0                      CA     : 4722   3rd Qu.:24704                                        
##  ABC.RAP :     0                      (Other):41249   Max.   :98659                                        
##  (Other) :     0                      NA's   : 5855
## Number of downloads on each day: one row per date, count in column n
dtTableOneByDay <- summarize(group_by(dtTableOne, date), n = n())
## Show the daily counts
dtTableOneByDay
## # A tibble: 2,042 x 2
##    date           n
##    <date>     <int>
##  1 2014-02-19    10
##  2 2014-02-20     5
##  3 2014-02-21     9
##  4 2014-02-22     4
##  5 2014-02-23     4
##  6 2014-02-24     6
##  7 2014-02-25     5
##  8 2014-02-26    12
##  9 2014-02-27     4
## 10 2014-02-28     4
## # … with 2,032 more rows
## Running total of n over the rows in their current order
dtTableOneByDay[["nCum"]] <- cumsum(dtTableOneByDay[["n"]])

Visualize download history of tableone

## Daily download counts over time, drawn as a line with a point per day
dtTableOneByDay %>%
    ggplot(mapping = aes(x = date, y = n)) +
    geom_line() +
    geom_point() +
    theme_bw() +
    theme(legend.key = element_blank()) +
    labs(title = "Daily downloads of tableone package")

## Cumulative download count over time, drawn as a line
dtTableOneByDay %>%
    ggplot(mapping = aes(x = date, y = nCum)) +
    geom_line() +
    theme_bw() +
    theme(legend.key = element_blank()) +
    labs(title = "Cumulative downloads of tableone package")