library(dlstats)
library(feedeR)
library(ggthemes)
library(httr)
library(jsonlite)
library(scales)
library(stringr)
library(tidyverse)
Source: John King & Roger Magoulas (2016). O’Reilly 2016 Data Science Salary Survey. Retrieved from http://www.oreilly.com/data/free/files/2016-data-science-salary-survey.pdf
Original raw data were not available so this is a reconstruction of plots to format them for a presentation.
Respondents were asked to indicate which tools they used. This section of the report compares programming languages.
# Share of respondents per programming language, entered inline as CSV text.
# Columns: percent (fraction of respondents), language, and not_r, which is
# FALSE only on the R row. The data lines must stay flush left because they
# are part of the string literal.
programming_languages <- read_csv(
  col_names = c("percent", "language", "not_r"),
  file = "
.70,SQL,TRUE
.57,R,FALSE
.54,Python,TRUE
.24,Bash,TRUE
.18,Java,TRUE
.17,Javascript,TRUE
.13,VBA,TRUE
.09,C++,TRUE
.09,Matlab,TRUE
.08,Scala,TRUE
.08,C,TRUE
.08,C#,TRUE
.05,Perl,TRUE
.05,SAS,TRUE
.03,Ruby,TRUE
.02,Octave,TRUE
.01,Go,TRUE
"
)
# Horizontal bar chart of language usage, bars sorted by share of respondents.
# Fill is driven by not_r so the R bar gets a different colour from the rest;
# the legend is hidden.
ggplot(
  programming_languages,
  aes(x = reorder(language, percent), y = percent, fill = not_r)
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_y_continuous(labels = percent) +
  labs(
    title = "O'Reilly Data Science Survey 2016: Programming Languages Used",
    x = "",
    y = "Share of Respondents"
  ) +
  theme_few()
# No plot is passed, so ggsave() writes the most recent plot.
ggsave(filename = "oreilly_2016_programming_languages.png")
## Saving 7 x 5 in image
Initially I thought I would make a word cloud of keywords, but most of these projects don’t have keywords. Instead I will do text analysis on their descriptions and extract keywords from that.
# Keep a local copy of the libraries.io Atom feed of CRAN packages, sorted by
# dependent repository count (descending).
download.file(
  url = "https://libraries.io/search.atom?order=desc&platforms=CRAN&sort=dependent_repos_count",
  destfile = "libsio_dep_repos.rss.atom"
)
# Parse the same feed straight from the URL.
libsio_rss <- feed.extract(
  "https://libraries.io/search.atom?order=desc&platforms=CRAN&sort=dependent_repos_count"
)
# Drop the rows whose link is "text/html" or "alternate", then split each
# link on the site prefix so `libsio` holds what follows it (the part that is
# later appended to the API base URL).
libsio_dep_repos <- libsio_rss$items %>%
  filter(link != "text/html" & link != "alternate") %>%
  separate(
    col = link,
    into = c("url", "libsio"),
    sep = "https://libraries.io/",
    extra = "merge",
    fill = "left"
  ) %>%
  select(title, libsio)
# Query parameters sent with each per-repo API request. The API key is read
# from `params`, which is not defined in this section
# (presumably the R Markdown document parameters -- TODO confirm).
query_params <- list(api_key = params$libsio_api_key)
# Fetch one project's record from the libraries.io API as a one-row data frame.
#
# Args:
#   url:   Full API endpoint for the project.
#   query: Named list of query-string parameters handed to httr::GET().
#
# Returns:
#   A one-row data frame with one character column per (flattened) JSON field.
#   The `versions` field is dropped. `keywords` is always present: the
#   keywords collapsed into one comma-separated string, or NA when the project
#   has none.
#
# Raises:
#   An error (via httr::stop_for_status()) when the API answers with an HTTP
#   error status, so an error payload is never parsed as if it were a project.
get_libsio_repos <- function(url, query) {
  req <- GET(url, query = query)
  print(url)
  stop_for_status(req)

  # Explicit encoding avoids httr guessing (and emitting a message) per call.
  json <- content(req, as = "text", encoding = "UTF-8")
  repo_json <- fromJSON(json, flatten = TRUE)

  # Set `keywords` aside so unlist() does not explode it into several
  # numbered columns.
  repo_json$versions <- NULL
  keywords <- repo_json$keywords
  repo_json$keywords <- NULL

  # unlist() flattens nested fields into a named character vector; each
  # element becomes one column. Keep strings as character so rows from
  # different repos bind without factor-level clashes.
  repo <- as.data.frame(as.list(unlist(repo_json)), stringsAsFactors = FALSE)

  repo$keywords <- if (length(keywords) > 0) {
    paste(keywords, collapse = ",")
  } else {
    NA_character_
  }
  repo
}
# Fetch the API record for every package found in the feed, one request per
# row, accumulating the results in `repos`.
# The checkpoint directory must exist before saveRDS() can write into it.
dir.create("libsio", showWarnings = FALSE)
repos <- tibble()
# seq_len() yields an empty sequence when the feed has no rows, whereas
# 1:nrow() would iterate over c(1, 0).
for (n in seq_len(nrow(libsio_dep_repos))) {
  url <- paste0("https://libraries.io/api/", libsio_dep_repos$libsio[[n]])
  repo <- get_libsio_repos(url, query_params)
  repos <- bind_rows(repos, repo)
  # Checkpoint everything fetched so far, so a failure part-way through
  # does not lose the earlier responses.
  saveRDS(repos, paste0("libsio/_repo_", n, ".rds"))
}
## [1] "https://libraries.io/api/cran/testthat"
## [1] "https://libraries.io/api/cran/knitr"
## [1] "https://libraries.io/api/cran/ggplot2"
## [1] "https://libraries.io/api/cran/dplyr"
## [1] "https://libraries.io/api/cran/rmarkdown"
## [1] "https://libraries.io/api/cran/Rcpp"
## [1] "https://libraries.io/api/cran/stringr"
## [1] "https://libraries.io/api/cran/plyr"
## [1] "https://libraries.io/api/cran/MASS"
## [1] "https://libraries.io/api/cran/httr"
## [1] "https://libraries.io/api/cran/data.table"
## [1] "https://libraries.io/api/cran/magrittr"
## [1] "https://libraries.io/api/cran/jsonlite"
## [1] "https://libraries.io/api/cran/reshape2"
## [1] "https://libraries.io/api/cran/tidyr"
## [1] "https://libraries.io/api/cran/shiny"
## [1] "https://libraries.io/api/cran/lubridate"
## [1] "https://libraries.io/api/cran/Matrix"
## [1] "https://libraries.io/api/cran/covr"
## [1] "https://libraries.io/api/cran/RCurl"
## [1] "https://libraries.io/api/cran/RColorBrewer"
## [1] "https://libraries.io/api/cran/foreach"
## [1] "https://libraries.io/api/cran/XML"
## [1] "https://libraries.io/api/cran/scales"
## [1] "https://libraries.io/api/cran/devtools"
## [1] "https://libraries.io/api/cran/digest"
## [1] "https://libraries.io/api/cran/readr"
## [1] "https://libraries.io/api/cran/purrr"
## [1] "https://libraries.io/api/cran/tibble"
## [1] "https://libraries.io/api/cran/igraph"
# Persist the combined repo table, then load it back from that file so the
# steps below work from the cached copy.
repos %>% saveRDS("libsio_repos.rds")
repos <- "libsio_repos.rds" %>% readRDS()
Package download statistics from the `cran_stats()` function of the dlstats package
# Look up CRAN download statistics for every package name in `repos` and
# cache the result on disk.
pkg_stats <- repos[["name"]] %>% cran_stats()
saveRDS(object = pkg_stats, file = "pkg_stats.rds")
Views from CRAN (lets us group packages into topic areas)
# Download CRAN's serialized Task Views index and read it into memory.
download.file(
  url = "http://cran.r-project.org/src/contrib/Views.rds",
  destfile = "Views.rds"
)
cran_views_raw <- readRDS(file = "Views.rds")
# Reduce one task-view entry to a flat three-element list.
#
# Args:
#   x: A list with elements `packagelist` (which has a `name` element),
#      `name`, and `topic`.
#
# Returns:
#   list(package, name, topic), where `package` is every package name joined
#   into one comma-separated string.
func_list_things <- function(x) {
  package_names <- x$packagelist$name
  list(
    package = paste(package_names, collapse = ","),
    name = x$name,
    topic = x$topic
  )
}
# Flatten every task view, stack them into one table (one row per view), then
# expand to one row per (view, package) pair.
cran_agg <- lapply(cran_views_raw, func_list_things)
# Split the comma-separated package string into a list-column first and then
# unnest it by name; passing a named expression to unnest() is deprecated.
cran_views <- bind_rows(cran_agg) %>%
  mutate(package = str_split(package, ",")) %>%
  unnest(package)
saveRDS(cran_views, "cran_views.rds")
# Reload the cached download statistics and task-view table, then attach each
# package's task view(s) to its download rows.
pkg_stats <- readRDS("pkg_stats.rds")
cran_views <- readRDS("cran_views.rds")
cran_pkgs <- pkg_stats %>%
  # Make the join key character on both sides instead of relying on
  # inner_join() coercing a factor with a warning.
  mutate(package = as.character(package)) %>%
  # Packages that appear in no task view are dropped by the inner join.
  inner_join(cran_views, by = "package") %>%
  # downloads_topic: total downloads per task view (`name`) per `start` value.
  group_by(start, name) %>%
  mutate(downloads_topic = sum(downloads)) %>%
  # Drop the grouping so later verbs do not silently run per group.
  ungroup()
## Warning: Column `package` joining factor and character vector, coercing
## into character vector
# Download counts over time for the most-depended-on packages, one line per
# package, limited to rows with 2016-12-31 < start < 2018-02-01.
ggplot(
  cran_pkgs %>% filter(start > "2016-12-31", start < "2018-02-01"),
  aes(x = start, y = downloads, color = package, group = package)
) +
  geom_point() +
  geom_line() +
  scale_y_continuous(labels = comma) +
  xlab("Month") +
  ylab("Downloads") +
  theme_few() +
  ggtitle("Downloads of Most Depended On packages on CRAN")
# No plot is passed, so ggsave() writes the most recent plot.
ggsave("20180201_top_deps_pkg_downloads_cran.png")
## Saving 7 x 5 in image
# Total downloads per task view (`name`) over rows with
# 2016-12-31 < start < 2018-02-01.
# NOTE(review): if `start` holds month-start dates, this window also includes
# January 2018, while the chart title says 2017 -- confirm which is intended.
cran_pkgs_summary <- cran_pkgs %>%
  filter(start > "2016-12-31" & start < "2018-02-01") %>%
  group_by(name) %>%
  summarise(total_downloads = sum(downloads))
# Horizontal bars, ordered by total downloads.
ggplot(
  cran_pkgs_summary,
  aes(x = reorder(name, total_downloads), y = total_downloads)
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Downloads of Top CRAN Packages by Topic (2017)",
    x = "",
    y = ""
  ) +
  theme_few()
ggsave(filename = "cran_packages_topic.png")
## Saving 7 x 5 in image
Total CRAN package downloads from RStudio’s mirror
CRAN package uploads
# Line chart of `index` against first-release date, with an x-axis tick every
# two years.
# NOTE(review): `pkgs` is not created anywhere in this section -- confirm
# where it is supposed to come from before running this chunk.
ggplot(pkgs, aes(x = as.Date(first_release), y = index)) +
  geom_line(size = 2) +
  scale_y_continuous(breaks = pretty_breaks()) +
  scale_x_date(date_breaks = "2 year", date_labels = "%Y") +
  labs(
    x = "",
    y = "",
    title = "Number of R packages ever published on CRAN"
  ) +
  theme_few()
ggsave(filename = "cran_r_pkgs.png")
CRAN packages with Tensorflow - https://libraries.io/search?platforms=CRAN&q=tensorflow
# Download statistics for four TensorFlow-related CRAN packages, plotted as
# one line per package for rows with start before 2018-02-01.
tf_packages <- c("tfruns", "tensorflow", "tfdatasets", "tfestimators")
pkg_stats_tf <- cran_stats(tf_packages)
ggplot(
  pkg_stats_tf %>% filter(start < "2018-02-01"),
  aes(x = start, y = downloads, color = package, group = package)
) +
  geom_line() +
  geom_point() +
  labs(
    x = "Month",
    y = "Downloads",
    title = "Downloads of R Tensorflow packages on CRAN"
  ) +
  theme_few()
ggsave(filename = "20180201_TF_pkg_downloads_cran.png")
## Saving 7 x 5 in image