library(dlstats)
library(feedeR)
library(ggthemes)
library(httr)
library(jsonlite)
library(scales)
library(stringr)
library(tidyverse)

O’Reilly Data Science Salary Survey

Source: John King & Roger Magoulas (2016). O’Reilly 2016 Data Science Salary Survey. Retrieved from http://www.oreilly.com/data/free/files/2016-data-science-salary-survey.pdf

Original raw data were not available so this is a reconstruction of plots to format them for a presentation.

Programming Languages

Respondents were asked to indicate which tools they used. This section of the report compares programming languages.

# Survey results entered by hand as inline CSV (the raw data were not
# available, so the values are reconstructed from the published report).
# Columns, in order:
#   percent  - share of respondents using the language, as a fraction
#   language - language name
#   not_r    - FALSE only on the R row; used as the fill aesthetic in the
#              plot below so that R is drawn differently from the others
programming_languages <- read_csv('
.70,SQL,TRUE
.57,R,FALSE
.54,Python,TRUE
.24,Bash,TRUE
.18,Java,TRUE
.17,Javascript,TRUE
.13,VBA,TRUE
.09,C++,TRUE
.09,Matlab,TRUE
.08,Scala,TRUE
.08,C,TRUE
.08,C#,TRUE
.05,Perl,TRUE
.05,SAS,TRUE
.03,Ruby,TRUE
.02,Octave,TRUE
.01,Go,TRUE
', col_names=c("percent","language","not_r"))


# Horizontal bar chart of language usage, bars ordered by share. The fill is
# keyed on `not_r` so the R bar gets its own colour; the legend is hidden.
# `labels = percent` refers to the scales formatter, not the data column.
ggplot(
  programming_languages,
  aes(x = reorder(language, percent), y = percent, fill = not_r)
) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(labels = percent) +
  coord_flip() +
  theme_few() +
  labs(
    title = "O'Reilly Data Science Survey 2016: Programming Languages Used",
    x = "",
    y = "Share of Respondents"
  )

# No plot argument is given, so ggsave() writes the last plot created.
ggsave(filename = "oreilly_2016_programming_languages.png")
## Saving 7 x 5 in image

CRAN

Libraries IO

Most Dependent Repos

Initially I thought I would do a word thingy of keywords but most of these projects don’t have keywords. Instead I will do text analysis on their descriptions and extract keywords from that.

# Keep a local copy of the Libraries.io search feed (CRAN projects sorted by
# dependent-repository count, descending) ...
download.file(
  url = "https://libraries.io/search.atom?order=desc&platforms=CRAN&sort=dependent_repos_count",
  destfile = "libsio_dep_repos.rss.atom"
)
# ... and parse the same feed straight from the URL.
# NOTE(review): the parser fetches the feed a second time rather than reading
# the saved copy.
libsio_rss <- feed.extract(
  "https://libraries.io/search.atom?order=desc&platforms=CRAN&sort=dependent_repos_count"
)


# Reduce the feed items to a title plus the Libraries.io path of each project.
# Rows whose `link` is one of the literal strings "text/html" or "alternate"
# are dropped; the remaining links are split on the site prefix so that
# `libsio` holds whatever follows "https://libraries.io/".
libsio_dep_repos <- libsio_rss$items %>%
  filter(link != "text/html" & link != "alternate") %>%
  separate(
    col = link,
    into = c("url", "libsio"),
    sep = "https://libraries.io/",
    extra = "merge",
    fill = "left"
  ) %>%
  select(title, libsio)
# get info on each repo in list

# Query-string parameters sent with every Libraries.io API request.
# NOTE(review): `params` is not defined in this section; presumably it is the
# R Markdown document-parameter list - confirm.
query_params <- list(api_key = params$libsio_api_key)

# Fetch one project's metadata from the Libraries.io API as a one-row data
# frame.
#
# Args:
#   url:   Full API endpoint for the project.
#   query: Named list of query-string parameters, passed to GET().
#
# Returns:
#   A one-row data frame built from the flattened JSON response (the
#   `versions` field is discarded), with a character `keywords` column holding
#   the comma-joined keywords, or NA when the project has none.
#
# Raises:
#   An error if the API responds with an HTTP error status.
get_libsio_repos <- function(url, query) {
  req <- GET(url, query = query)
  print(url)
  # Fail loudly on 4xx/5xx instead of trying to parse an error page as a repo.
  stop_for_status(req)
  json <- content(req, as = "text", encoding = "UTF-8")
  repo_json <- fromJSON(json, flatten = TRUE)
  repo_json$versions <- NULL
  # Keywords are variable-length, so they are pulled out before flattening and
  # re-attached below as a single string.
  repo_json_keywords <- repo_json$keywords
  repo_json$keywords <- NULL
  repo <- repo_json %>%
    unlist() %>%
    as.data.frame.list(stringsAsFactors = FALSE)
  # Always create the column: assigning NULL here would be a no-op and leave
  # rows without keywords structurally different from rows with them.
  repo$keywords <- if (length(repo_json_keywords) > 0) {
    paste(repo_json_keywords, collapse = ",")
  } else {
    NA_character_
  }
  repo
}

# Fetch metadata for every project listed in the feed, one API call each.
# After each call the accumulated table is written to libsio/_repo_<n>.rds so
# that a failure part-way through does not lose the rows fetched so far.
dir.create("libsio", showWarnings = FALSE)  # saveRDS() will not create it
repos <- tibble()
# seq_len() yields an empty sequence for an empty feed, where 1:nrow() would
# iterate over c(1, 0).
for (n in seq_len(nrow(libsio_dep_repos))) {
  url <- paste0("https://libraries.io/api/", libsio_dep_repos$libsio[[n]])
  repo <- get_libsio_repos(url, query_params)
  repos <- bind_rows(repos, repo)
  saveRDS(repos, paste0("libsio/_repo_", n, ".rds"))
}
## [1] "https://libraries.io/api/cran/testthat"
## [1] "https://libraries.io/api/cran/knitr"
## [1] "https://libraries.io/api/cran/ggplot2"
## [1] "https://libraries.io/api/cran/dplyr"
## [1] "https://libraries.io/api/cran/rmarkdown"
## [1] "https://libraries.io/api/cran/Rcpp"
## [1] "https://libraries.io/api/cran/stringr"
## [1] "https://libraries.io/api/cran/plyr"
## [1] "https://libraries.io/api/cran/MASS"
## [1] "https://libraries.io/api/cran/httr"
## [1] "https://libraries.io/api/cran/data.table"
## [1] "https://libraries.io/api/cran/magrittr"
## [1] "https://libraries.io/api/cran/jsonlite"
## [1] "https://libraries.io/api/cran/reshape2"
## [1] "https://libraries.io/api/cran/tidyr"
## [1] "https://libraries.io/api/cran/shiny"
## [1] "https://libraries.io/api/cran/lubridate"
## [1] "https://libraries.io/api/cran/Matrix"
## [1] "https://libraries.io/api/cran/covr"
## [1] "https://libraries.io/api/cran/RCurl"
## [1] "https://libraries.io/api/cran/RColorBrewer"
## [1] "https://libraries.io/api/cran/foreach"
## [1] "https://libraries.io/api/cran/XML"
## [1] "https://libraries.io/api/cran/scales"
## [1] "https://libraries.io/api/cran/devtools"
## [1] "https://libraries.io/api/cran/digest"
## [1] "https://libraries.io/api/cran/readr"
## [1] "https://libraries.io/api/cran/purrr"
## [1] "https://libraries.io/api/cran/tibble"
## [1] "https://libraries.io/api/cran/igraph"
# Persist the complete project table, then load it back from disk so the
# steps that follow work from the saved copy.
saveRDS(object = repos, file = "libsio_repos.rds")
repos <- readRDS(file = "libsio_repos.rds")

Downloads

Package download statistics from the cran_stats() function in the dlstats package

# Look up statistics for every project in `repos` by name and cache the
# result on disk.
pkg_stats <- cran_stats(repos[["name"]])

saveRDS(object = pkg_stats, file = "pkg_stats.rds")

Views from CRAN (lets us group packages into topic areas)

# Download the CRAN Views file and load it.
download.file(
  url = "http://cran.r-project.org/src/contrib/Views.rds",
  destfile = "Views.rds"
)
cran_views_raw <- readRDS(file = "Views.rds")
# Flatten one view record into a three-element list.
#
# Args:
#   x: A list with `packagelist` (which has a `name` element), `name` and
#      `topic`.
#
# Returns:
#   list(package, name, topic), where `package` is all of
#   `x$packagelist$name` joined into one comma-separated string.
func_list_things <- function(x) {
  pkg_names <- x$packagelist$name
  list(
    package = paste(pkg_names, collapse = ","),
    name = x$name,
    topic = x$topic
  )
}

# One row per view, with that view's packages as a comma-separated string.
cran_agg <- lapply(cran_views_raw, func_list_things)
cran_views <- bind_rows(cran_agg)

# Expand to one row per (view, package) pair. The list-column is built with
# mutate() and then unnested by name; passing an expression directly to
# unnest() is the deprecated pre-1.0 tidyr interface.
cran_views <- cran_views %>%
  mutate(package = str_split(package, ",")) %>%
  unnest(package)
saveRDS(cran_views, "cran_views.rds")

Downloads

# Reload the cached inputs.
pkg_stats <- readRDS("pkg_stats.rds")
cran_views <- readRDS("cran_views.rds")

# Attach each package's view(s) to its download rows and add the total
# downloads per (start, view name) as `downloads_topic`.
cran_pkgs <- pkg_stats %>%
  # Make the join key character on both sides instead of relying on the join
  # to coerce a factor (which it does with a warning).
  mutate(package = as.character(package)) %>%
  inner_join(cran_views, by = "package") %>%
  group_by(start, name) %>%
  mutate(downloads_topic = sum(downloads)) %>%
  # Drop the grouping so it does not leak into later operations on cran_pkgs.
  ungroup()
## Warning: Column `package` joining factor and character vector, coercing
## into character vector
# Fetch statistics for the four tf* packages.
# NOTE(review): this result is not used by the plot in this chunk, and the
# identical call appears again before the TensorFlow plot further down.
pkg_stats_tf <- cran_stats(
  c("tfruns", "tensorflow", "tfdatasets", "tfestimators")
)

# Downloads over time, one coloured line per package, limited to rows whose
# `start` lies strictly between 2016-12-31 and 2018-02-01.
ggplot(
  cran_pkgs %>% filter(start > "2016-12-31" & start < "2018-02-01"),
  aes(x = start, y = downloads, colour = package, group = package)
) +
  geom_point() +
  geom_line() +
  scale_y_continuous(labels = comma) +
  theme_few() +
  labs(
    title = 'Downloads of Most Depended On packages on CRAN',
    x = 'Month',
    y = 'Downloads'
  )

# Writes the last plot created.
ggsave(filename = "20180201_top_deps_pkg_downloads_cran.png")
## Saving 7 x 5 in image
# Total downloads per view name over the same date window as the previous
# plot (start strictly between 2016-12-31 and 2018-02-01).
cran_pkgs_summary <- cran_pkgs %>%
  filter(start > "2016-12-31" & start < "2018-02-01") %>%
  group_by(name) %>%
  summarise(total_downloads = sum(downloads))

# Horizontal bars, ordered by total downloads, with unlabelled axes.
ggplot(
  cran_pkgs_summary,
  aes(x = reorder(name, total_downloads), y = total_downloads)
) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  coord_flip() +
  theme_few() +
  labs(
    title = "Downloads of Top CRAN Packages by Topic (2017)",
    x = "",
    y = ""
  )

# Writes the last plot created.
ggsave(filename = "cran_packages_topic.png")
## Saving 7 x 5 in image

Total CRAN package downloads from RStudio’s mirror

CRAN package uploads

R Language Downloads

# Line chart of `index` against the date of `first_release`, with an x-axis
# tick every two years labelled by year.
# NOTE(review): `pkgs` is not created anywhere in this section; it must come
# from a chunk that is not shown - confirm it exists before running.
ggplot(pkgs, aes(x = as.Date(first_release), y = index)) +
  geom_line(size = 2) +
  scale_x_date(date_breaks = '2 year', date_labels = '%Y') +
  scale_y_continuous(breaks = pretty_breaks()) +
  theme_few() +
  labs(
    title = 'Number of R packages ever published on CRAN',
    x = '',
    y = ''
  )


# Writes the last plot created.
ggsave(filename = "cran_r_pkgs.png")

CRAN packages with Tensorflow - https://libraries.io/search?platforms=CRAN&q=tensorflow

# Statistics for the four tf* packages.
pkg_stats_tf <- cran_stats(
  c("tfruns", "tensorflow", "tfdatasets", "tfestimators")
)

# Downloads over time, one coloured line per package, keeping only rows whose
# `start` is before 2018-02-01.
ggplot(
  pkg_stats_tf %>% filter(start < "2018-02-01"),
  aes(x = start, y = downloads, colour = package, group = package)
) +
  geom_point() +
  geom_line() +
  theme_few() +
  labs(
    title = 'Downloads of R Tensorflow packages on CRAN',
    x = 'Month',
    y = 'Downloads'
  )

# Writes the last plot created.
ggsave(filename = "20180201_TF_pkg_downloads_cran.png")
## Saving 7 x 5 in image