Finally! Tracking CRAN packages downloads: http://www.r-bloggers.com/finally-tracking-cran-packages-downloads/
CRAN package download logs: http://cran-logs.rstudio.com
tableone package CRAN page: http://cran.r-project.org/package=tableone
tableone package github page: https://github.com/kaz-yos/tableone
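## Aside: a minimal alternative sketch, assuming the cranlogs package is
## installed (it is not part of this workflow). It queries the same RStudio
## logs through a web API and returns daily counts directly, skipping the
## manual download-and-parse steps below:
## cranlogs::cran_downloads(packages = "tableone",
##                          from = "2014-02-01", to = "last-day")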
library(data.table)
library(tidyverse)
## Configure parallelization
## Parallel backend for foreach (loading it also attaches foreach and parallel; it covers doMC-style forking on Unix)
library(doParallel)
## Reproducible parallelization
library(doRNG)
## Detect core count
nCores <- min(parallel::detectCores(), 10)
## Used by parallel::mclapply() as default
options(mc.cores = nCores)
## Used by doParallel as default
options(cores = nCores)
## Register doParallel as the parallel backend with foreach
## http://stackoverflow.com/questions/28989855/the-difference-between-domc-and-doparallel-in-r
doParallel::registerDoParallel(cores = nCores)
## Report multicore use
cat("### Using", foreach::getDoParWorkers(), "cores\n")
## ### Using 10 cores
cat("### Using", foreach::getDoParName(), "as backend\n")
## ### Using doParallelMC as backend
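## doRNG is loaded above but not used again below; a minimal sketch of what
## it provides: the %dorng% operator makes parallel foreach loops
## reproducible under set.seed().
set.seed(20140201)
rng1 <- foreach(i = 1:3) %dorng% rnorm(1)
set.seed(20140201)
rng2 <- foreach(i = 1:3) %dorng% rnorm(1)
## identical(rng1, rng2) should return TRUE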
## Define start date
startDay <- as.Date('2014-02-01')
## Subtract two days so we do not request a log file that does not exist yet
endDay <- Sys.Date() - 2
## Create all dates
allDays <- seq(startDay, endDay, by = 'day')
## Extract years
year <- format(allDays, "%Y")
## Create URLs
urls <- paste0('http://cran-logs.rstudio.com/', year, '/', allDays, '.csv.gz')
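## Quick sanity check on the generated URLs before downloading
head(urls, n = 1)
## [1] "http://cran-logs.rstudio.com/2014/2014-02-01.csv.gz"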
## Download files
dirName <- "./cran.tableone.log.d/"
## Create the destination directory if it does not already exist
if (!dir.exists(dirName)) {
    dir.create(dirName)
}
for (url in urls) {
    ## Destination file name
    destfile <- paste0(dirName, basename(url))
    ## Download only if not already present
    if (!file.exists(destfile)) {
        ## Download (binary mode keeps the gz file intact on all platforms)
        download.file(url = url, destfile = destfile, mode = "wb")
        ## Move to a backup location (the system() calls assume a Unix-like shell)
        system(sprintf("mv %s %s~", destfile, destfile))
        ## Filter outside R, keeping only the lines that mention tableone
        system(sprintf("gzip -cd %s~ | grep tableone | gzip > %s", destfile, destfile))
        ## Delete the backup file
        system(sprintf("rm %s~", destfile))
    }
}
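## The mv/gzip/grep pipeline above assumes a Unix-like shell. A portable
## pure-R sketch of the same filtering step (a hypothetical helper, not part
## of the original workflow):
filterPackageLines <- function(path, pattern = "tableone") {
    ## Decompress the whole log file into memory
    lines <- readLines(gzfile(path))
    ## Overwrite the file with only the lines mentioning the package
    con <- gzfile(path, open = "w")
    writeLines(lines[grepl(pattern, lines, fixed = TRUE)], con)
    close(con)
}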
## Collect the paths of all downloaded log files
filePaths <- list.files(dirName, pattern = "\\.csv\\.gz$", full.names = TRUE)
## Load the files in parallel and combine them into one data frame
dfCran <- mclapply(filePaths,
                   function(file) {
                       ## Load an individual day's log
                       read_csv(file = file,
                                col_names = c("date", "time", "size", "r_version", "r_arch", "r_os",
                                              "package", "version", "country", "ip_id"),
                                col_types = cols(
                                    date = col_date(format = ""),
                                    time = col_time(format = ""),
                                    size = col_integer(),
                                    r_version = col_character(),
                                    r_arch = col_character(),
                                    r_os = col_character(),
                                    package = col_character(),
                                    version = col_character(),
                                    country = col_character(),
                                    ip_id = col_integer()
                                ),
                                progress = FALSE)
                   }) %>%
    bind_rows()
## Convert to a data table
dtCran <- data.table(dfCran)
## Define variable types and add date-derived variables
dtCran[, date := as.Date(date)]
dtCran[, package := factor(package)]
dtCran[, country := factor(country)]
dtCran[, weekday := weekdays(date)]
dtCran[, week := strftime(as.POSIXlt(date), format = "%Y-%W")]
## Set keys for fast subsetting and joins
setkey(dtCran, package, date, week, country)
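## With package as the leading key column, a keyed join gives the same
## subset via fast binary search (a sketch equivalent to the filtering below):
## dtCran["tableone"]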
## Extract tableone
dtTableOne <- dtCran[package == "tableone"]
## Summarize
summary(dtTableOne)
## date time size r_version r_arch r_os
## Min. :2014-02-19 Length:117366 Min. : 339 Length:117366 Length:117366 Length:117366
## 1st Qu.:2017-08-29 Class1:hms 1st Qu.:124382 Class :character Class :character Class :character
## Median :2018-07-30 Class2:difftime Median :235978 Mode :character Mode :character Mode :character
## Mean :2018-04-01 Mode :numeric Mean :183631
## 3rd Qu.:2019-03-21 3rd Qu.:242927
## Max. :2019-09-23 Max. :278612
##
## package version country ip_id weekday week
## tableone:117366 Length:117366 US :44572 Min. : 1 Length:117366 Length:117366
## A3 : 0 Class :character GB : 7918 1st Qu.: 2474 Class :character Class :character
## abbyyR : 0 Mode :character JP : 7513 Median :10226 Mode :character Mode :character
## abc : 0 NL : 5537 Mean :16385
## abc.data: 0 CA : 4722 3rd Qu.:24704
## ABC.RAP : 0 (Other):41249 Max. :98659
## (Other) : 0 NA's : 5855
## Number of downloads for each day
dtTableOneByDay <- dtTableOne %>%
    group_by(date) %>%
    summarize(n = n())
dtTableOneByDay
## # A tibble: 2,042 x 2
## date n
## <date> <int>
## 1 2014-02-19 10
## 2 2014-02-20 5
## 3 2014-02-21 9
## 4 2014-02-22 4
## 5 2014-02-23 4
## 6 2014-02-24 6
## 7 2014-02-25 5
## 8 2014-02-26 12
## 9 2014-02-27 4
## 10 2014-02-28 4
## # … with 2,032 more rows
## Add cumulative numbers
dtTableOneByDay$nCum <- cumsum(dtTableOneByDay$n)
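## The week column added earlier makes a weekly roll-up just as easy
## (a sketch, not in the original post):
dtTableOneByWeek <- dtTableOne %>%
    group_by(week) %>%
    summarize(n = n())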
## Plot daily downloads
ggplot(data = dtTableOneByDay,
       mapping = aes(x = date, y = n)) +
    geom_line() +
    geom_point() +
    theme_bw() +
    theme(legend.key = element_blank()) +
    labs(title = "Daily downloads of tableone package")
## Plot cumulative downloads
ggplot(data = dtTableOneByDay,
       mapping = aes(x = date, y = nCum)) +
    geom_line() +
    theme_bw() +
    theme(legend.key = element_blank()) +
    labs(title = "Cumulative downloads of tableone package")
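## A possible extension (not in the original post): overlay a loess smoother
## to make the long-run trend in the noisy daily counts easier to see.
ggplot(data = dtTableOneByDay,
       mapping = aes(x = date, y = n)) +
    geom_point(alpha = 0.3) +
    geom_smooth(method = "loess", se = FALSE) +
    theme_bw() +
    labs(title = "Smoothed daily downloads of tableone package")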