Finally! Tracking CRAN packages downloads: http://www.r-bloggers.com/finally-tracking-cran-packages-downloads/
CRAN package download logs: http://cran-logs.rstudio.com
tableone package CRAN page: http://cran.r-project.org/package=tableone
tableone package github page: https://github.com/kaz-yos/tableone
library(data.table)
library(dplyr)
library(ggplot2)
## First day of logs to fetch
startDay <- as.Date("2014-02-01")
## Last day to fetch: two days before today, so that a log file which has
## not been published yet is never requested
endDay <- Sys.Date() - 2
## One entry per calendar day from startDay through endDay
allDays <- seq(from = startDay, to = endDay, by = "day")
## Four-digit year of each day; it is a path component of each log URL
year <- format(allDays, format = "%Y")
## Full URL of each daily log file
urls <- paste0("http://cran-logs.rstudio.com/", year, "/", allDays, ".csv.gz")
## Download the daily log files into a local cache directory
dirName <- "./cran.log.d/"
## Make sure the cache directory exists before writing into it; this is a
## no-op (warning suppressed) when the directory is already there
dir.create(dirName, showWarnings = FALSE, recursive = TRUE)
for (url in urls) {
  ## Destination file name
  destfile <- paste0(dirName, basename(url))
  ## Fetch only the files that are not cached yet
  if (!file.exists(destfile)) {
    tryCatch(
      download.file(url = url, destfile = destfile),
      error = function(e) {
        ## Drop any partial file so that a rerun retries this day instead of
        ## treating the leftover as a finished download, then re-raise
        unlink(destfile)
        stop(e)
      }
    )
  }
}
## Read every cached daily log into a list of data frames
listCsv <- lapply(paste0(dirName, allDays, ".csv.gz"), read.csv)
## Stack the daily frames with data.table::rbindlist(), which binds the whole
## list in one pass; do.call(rbind, ...) gets very slow for this many frames.
## use.names = TRUE matches columns by name, as rbind() does for data frames.
dfCran <- as.data.frame(rbindlist(listCsv, use.names = TRUE))
## Convert to a data table
dtCran <- data.table(dfCran)
## Define variable types: turn the date column into Date objects (in place)
dtCran[, `:=`(date = as.Date(date))]
## date time size r_version r_arch r_os package version country ip_id
## 1: 2014-02-01 00:05:57 64091 3.0.2 x86_64 mingw32 gtable 0.1.2 GB 1
## 2: 2014-02-01 00:05:57 1324004 3.0.2 x86_64 darwin10.8.0 plm 1.4-0 US 2
## 3: 2014-02-01 00:33:17 333896 3.0.1 x86_64 darwin10.8.0 scatterplot3d 0.3-34 US 3
## 4: 2014-02-01 00:23:52 17911 3.0.2 x86_64 linux-gnu evaluate 0.5.1 PT 4
## 5: 2014-02-01 00:37:37 1431501 3.0.2 x86_64 linux-gnu maps 2.3-6 FR 5
## ---
## 27025025: 2014-08-04 23:09:00 21439 3.0.3 x86_64 darwin10.8.0 profileModel 0.5-9 US 11583
## 27025026: 2014-08-04 23:08:15 2347297 3.0.2 x86_64 linux-gnu VIF 1.0 US 32
## 27025027: 2014-08-04 23:20:02 2719220 3.0.2 x86_64 linux-gnu Rcpp 0.11.2 DK 11778
## 27025028: 2014-08-04 23:20:04 39829 3.0.1 x86_64 mingw32 labeling 0.2 DK 11778
## 27025029: 2014-08-04 23:23:45 13779 3.1.1 x86_64 mingw32 RJDBC 0.2-4 US 2
dtCran[, `:=`(package = factor(package))]
## date time size r_version r_arch r_os package version country ip_id
## 1: 2014-02-01 00:05:57 64091 3.0.2 x86_64 mingw32 gtable 0.1.2 GB 1
## 2: 2014-02-01 00:05:57 1324004 3.0.2 x86_64 darwin10.8.0 plm 1.4-0 US 2
## 3: 2014-02-01 00:33:17 333896 3.0.1 x86_64 darwin10.8.0 scatterplot3d 0.3-34 US 3
## 4: 2014-02-01 00:23:52 17911 3.0.2 x86_64 linux-gnu evaluate 0.5.1 PT 4
## 5: 2014-02-01 00:37:37 1431501 3.0.2 x86_64 linux-gnu maps 2.3-6 FR 5
## ---
## 27025025: 2014-08-04 23:09:00 21439 3.0.3 x86_64 darwin10.8.0 profileModel 0.5-9 US 11583
## 27025026: 2014-08-04 23:08:15 2347297 3.0.2 x86_64 linux-gnu VIF 1.0 US 32
## 27025027: 2014-08-04 23:20:02 2719220 3.0.2 x86_64 linux-gnu Rcpp 0.11.2 DK 11778
## 27025028: 2014-08-04 23:20:04 39829 3.0.1 x86_64 mingw32 labeling 0.2 DK 11778
## 27025029: 2014-08-04 23:23:45 13779 3.1.1 x86_64 mingw32 RJDBC 0.2-4 US 2
dtCran[, `:=`(country = factor(country))]
## date time size r_version r_arch r_os package version country ip_id
## 1: 2014-02-01 00:05:57 64091 3.0.2 x86_64 mingw32 gtable 0.1.2 GB 1
## 2: 2014-02-01 00:05:57 1324004 3.0.2 x86_64 darwin10.8.0 plm 1.4-0 US 2
## 3: 2014-02-01 00:33:17 333896 3.0.1 x86_64 darwin10.8.0 scatterplot3d 0.3-34 US 3
## 4: 2014-02-01 00:23:52 17911 3.0.2 x86_64 linux-gnu evaluate 0.5.1 PT 4
## 5: 2014-02-01 00:37:37 1431501 3.0.2 x86_64 linux-gnu maps 2.3-6 FR 5
## ---
## 27025025: 2014-08-04 23:09:00 21439 3.0.3 x86_64 darwin10.8.0 profileModel 0.5-9 US 11583
## 27025026: 2014-08-04 23:08:15 2347297 3.0.2 x86_64 linux-gnu VIF 1.0 US 32
## 27025027: 2014-08-04 23:20:02 2719220 3.0.2 x86_64 linux-gnu Rcpp 0.11.2 DK 11778
## 27025028: 2014-08-04 23:20:04 39829 3.0.1 x86_64 mingw32 labeling 0.2 DK 11778
## 27025029: 2014-08-04 23:23:45 13779 3.1.1 x86_64 mingw32 RJDBC 0.2-4 US 2
dtCran[, `:=`(weekday = weekdays(date))]
## date time size r_version r_arch r_os package version country ip_id weekday
## 1: 2014-02-01 00:05:57 64091 3.0.2 x86_64 mingw32 gtable 0.1.2 GB 1 Saturday
## 2: 2014-02-01 00:05:57 1324004 3.0.2 x86_64 darwin10.8.0 plm 1.4-0 US 2 Saturday
## 3: 2014-02-01 00:33:17 333896 3.0.1 x86_64 darwin10.8.0 scatterplot3d 0.3-34 US 3 Saturday
## 4: 2014-02-01 00:23:52 17911 3.0.2 x86_64 linux-gnu evaluate 0.5.1 PT 4 Saturday
## 5: 2014-02-01 00:37:37 1431501 3.0.2 x86_64 linux-gnu maps 2.3-6 FR 5 Saturday
## ---
## 27025025: 2014-08-04 23:09:00 21439 3.0.3 x86_64 darwin10.8.0 profileModel 0.5-9 US 11583 Monday
## 27025026: 2014-08-04 23:08:15 2347297 3.0.2 x86_64 linux-gnu VIF 1.0 US 32 Monday
## 27025027: 2014-08-04 23:20:02 2719220 3.0.2 x86_64 linux-gnu Rcpp 0.11.2 DK 11778 Monday
## 27025028: 2014-08-04 23:20:04 39829 3.0.1 x86_64 mingw32 labeling 0.2 DK 11778 Monday
## 27025029: 2014-08-04 23:23:45 13779 3.1.1 x86_64 mingw32 RJDBC 0.2-4 US 2 Monday
dtCran[, `:=`(week = format(date, "%Y-%W"))]
## date time size r_version r_arch r_os package version country ip_id weekday
## 1: 2014-02-01 00:05:57 64091 3.0.2 x86_64 mingw32 gtable 0.1.2 GB 1 Saturday
## 2: 2014-02-01 00:05:57 1324004 3.0.2 x86_64 darwin10.8.0 plm 1.4-0 US 2 Saturday
## 3: 2014-02-01 00:33:17 333896 3.0.1 x86_64 darwin10.8.0 scatterplot3d 0.3-34 US 3 Saturday
## 4: 2014-02-01 00:23:52 17911 3.0.2 x86_64 linux-gnu evaluate 0.5.1 PT 4 Saturday
## 5: 2014-02-01 00:37:37 1431501 3.0.2 x86_64 linux-gnu maps 2.3-6 FR 5 Saturday
## ---
## 27025025: 2014-08-04 23:09:00 21439 3.0.3 x86_64 darwin10.8.0 profileModel 0.5-9 US 11583 Monday
## 27025026: 2014-08-04 23:08:15 2347297 3.0.2 x86_64 linux-gnu VIF 1.0 US 32 Monday
## 27025027: 2014-08-04 23:20:02 2719220 3.0.2 x86_64 linux-gnu Rcpp 0.11.2 DK 11778 Monday
## 27025028: 2014-08-04 23:20:04 39829 3.0.1 x86_64 mingw32 labeling 0.2 DK 11778 Monday
## 27025029: 2014-08-04 23:23:45 13779 3.1.1 x86_64 mingw32 RJDBC 0.2-4 US 2 Monday
## week
## 1: 2014-04
## 2: 2014-04
## 3: 2014-04
## 4: 2014-04
## 5: 2014-04
## ---
## 27025025: 2014-31
## 27025026: 2014-31
## 27025027: 2014-31
## 27025028: 2014-31
## 27025029: 2014-31
## Key the table on package, date, week and country
setkeyv(dtCran, c("package", "date", "week", "country"))
## Keep only the rows that belong to the tableone package
dtTableOne <- dtCran[package %in% "tableone"]
## Print per-column summary statistics
summary(dtTableOne)
## date time size r_version r_arch r_os
## Min. :2014-02-19 15:20:15: 7 Min. : 5933 3.1.0 :428 x86_64 :1023 mingw32 :834
## 1st Qu.:2014-04-02 14:48:38: 7 1st Qu.:74493 3.0.2 :367 i386 : 251 linux-gnu :268
## Median :2014-05-05 14:32:19: 4 Median :75259 3.0.3 :229 i686 : 16 darwin10.8.0:126
## Mean :2014-05-10 16:48:02: 3 Mean :68680 3.0.1 : 94 i486 : 3 darwin13.1.0: 53
## 3rd Qu.:2014-06-15 21:31:44: 3 3rd Qu.:76231 3.1.1 : 74 arm : 1 darwin9.8.0 : 9
## Max. :2014-08-04 22:18:14: 3 Max. :78223 (Other):102 (Other): 0 (Other) : 4
## (Other) :1410 NA's :143 NA's : 143 NA's :143
## package version country ip_id weekday week
## tableone :1437 0.6.2 :476 US :470 Min. : 1 Length:1437 Length:1437
## A3 : 0 0.5.0 :303 JP :186 1st Qu.: 1676 Class :character Class :character
## abc : 0 0.4.0 :264 DE : 76 Median : 3725 Mode :character Mode :character
## abcdeFBA : 0 0.3.5 :114 FR : 66 Mean : 4554
## ABCExtremes: 0 0.3.4 :112 CN : 65 3rd Qu.: 6813
## ABCoptim : 0 0.3.3 : 86 NL : 57 Max. :18777
## (Other) : 0 (Other): 82 (Other):517
## Number of downloads for each day.
## The chain uses %>%: the older %.% operator was deprecated in dplyr 0.2 and
## later removed, so it is not available in current dplyr releases.
dtTableOneByDay <- dtTableOne %>%
  group_by(date) %>%
  summarize(n = n())
## Print the per-day counts
dtTableOneByDay
## Source: local data table [167 x 2]
##
## date n
## 1 2014-02-19 10
## 2 2014-02-20 5
## 3 2014-02-21 9
## 4 2014-02-22 4
## 5 2014-02-23 4
## 6 2014-02-24 6
## 7 2014-02-25 5
## 8 2014-02-26 12
## 9 2014-02-27 4
## 10 2014-02-28 4
## .. ... ..
## Add cumulative numbers: nCum is the running total of the daily counts n,
## accumulated in the table's current row order.
## NOTE(review): this is a cumulative-by-date total only if the rows are in
## ascending date order -- confirm.
dtTableOneByDay$nCum <- cumsum(dtTableOneByDay$n)
## Plot the cumulative downloads as a line and the daily counts as points.
## geom_line()/geom_point() replace bare layer(geom = ...) calls: since
## ggplot2 2.0.0, layer() also requires explicit stat and position arguments.
ggplot(data = dtTableOneByDay,
       mapping = aes(x = date, y = nCum)) +
  geom_line() +
  ## The points override y to show the per-day count instead of the total
  geom_point(mapping = aes(y = n)) +
  theme_bw() +
  theme(legend.key = element_blank()) +
  labs(title = "Cumulative downloads of tableone package")