Release cycle analysis

Author

Eric Milliman

library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.8     ✔ dplyr   1.0.8
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   1.4.0     ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
devtools::load_all("~/OneDrive - Biogen/packages/riskmetric/")
ℹ Loading riskmetric
# Work from the scratch-space directory; the relative "release_dates.rds" path
# used further down resolves against it.
setwd("~/OneDrive - Biogen/packages/riskmetric_scratch_space/")

# Package index of the CRAN mirror; the matrix row names are the package names.
ap <- available.packages(repos = "https://cran.rstudio.com")
aps <- dimnames(ap)[[1]]

# One pkg_cran() reference object per package name.
ref_pkg <- lapply(aps, pkg_cran)

# Build (or reload from cache) the release history of every package in
# `ref_pkg`, as a list with one 3-column matrix (name, version, date) per
# package, named by `aps`.
#
# The same path is used for the existence check, the write and the read, so
# the cache no longer depends on the current working directory.
release_dates_file <-
  "~/OneDrive - Biogen/packages/riskmetric_scratch_space/release_dates.rds"

if (!file.exists(release_dates_file)) {
  # lapply() keeps one slot per package (NULL for skipped ones), so the result
  # always has the same length as `aps` and can be named safely.
  release_dates <- lapply(ref_pkg, function(pkg) {
    # Accessing the archive can fail; capture the condition object instead of
    # comparing the (matrix-valued) result against a sentinel string, which
    # produced a non-scalar `if` condition.
    archive <- tryCatch(pkg$archive_release_dates, error = function(e) e)
    if (inherits(archive, "error")) {
      return(NULL)
    }

    current <- c(pkg$name, pkg$version, pkg$release_date)
    if (isTRUE(ncol(archive) == 3)) {
      # deparse.level = 0 keeps rbind() from labelling the appended row.
      rbind(archive, current, deparse.level = 0)
    } else {
      # No usable archive table: keep only the current release.
      matrix(current, ncol = 3, byrow = TRUE)
    }
  })
  names(release_dates) <- aps
  saveRDS(release_dates, release_dates_file)
} else {
  release_dates <- readRDS(release_dates_file)
}

# Drop packages without a collected history (NULL entries). vapply() always
# returns a logical vector, so this also works when the list is empty.
release_dates <- release_dates[!vapply(release_dates, is.null, logical(1))]

# Give every per-package table the same column names, then stack them.
release_dates <- lapply(
  release_dates,
  function(x) setNames(as.data.frame(x), c("name", "version", "date"))
)
release_dates_df <- bind_rows(release_dates)

# Per-package release timeline. The result stays grouped by `name`.
#   first_release : earliest release date of the package
#   dT            : days since the previous release of the package (0 for the
#                   first row of each package)
#   ReleaseAge    : package age at the time of the release (difftime)
#   PackageAge    : package age today (difftime)
#   rel_num       : ordinal release number; ties.method = "first" keeps it an
#                   integer sequence even when two releases share a date
release_dates_df <- release_dates_df %>%
  group_by(name) %>%
  arrange(date) %>%
  mutate(
    date = as.Date(date),
    first_release = min(date),
    dT = c(0, diff(date)),
    ReleaseAge = date - first_release,
    PackageAge = Sys.Date() - first_release,
    rel_num = rank(date, ties.method = "first")
  )
# Histogram of the gap between consecutive releases (dT), leaving out rows
# dated at the package's first release; the x axis is square-root scaled.
ggplot(
  data = filter(release_dates_df, date > first_release),
  mapping = aes(x = dT)
) +
  geom_histogram(bins = 50) +
  scale_x_sqrt() +
  ggtitle("Time between releases")

# Scatterplot of the gap since the previous release (dT) against the package's
# age at that release, skipping rows dated at the first release.
# ReleaseAge is a difftime; converting it to numeric days gives ggplot a scale
# it knows instead of the "type difftime ... Defaulting to continuous" fallback.
release_dates_df %>%
  filter(date > first_release) %>%
  mutate(ReleaseAge = as.numeric(ReleaseAge, units = "days")) %>%
  ggplot(aes(x = ReleaseAge, y = dT)) +
  geom_point(alpha = 0.05) +
  labs(title = "Scatterplot of time between releases with respect to package age at time of release")
Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Same scatterplot, restricted to groups with more than two rows and dropping
# each package's first release (date == first_release) and the rows where the
# gap equals the package age (dT == ReleaseAge).
# ReleaseAge is converted to numeric days after filtering so ggplot gets a
# plain continuous scale instead of falling back for the difftime class.
release_dates_df %>%
  filter(date != first_release & dT != ReleaseAge & n() > 2) %>%
  mutate(ReleaseAge = as.numeric(ReleaseAge, units = "days")) %>%
  ggplot(aes(x = ReleaseAge, y = dT)) +
  geom_point(alpha = 0.05) +
  labs(
    caption = "Excluding first and second release",
    title = "Scatterplot of time between releases with respect to package age at time of release"
  )
Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Hex-binned count of releases per package against the package's current age.
# Grouping by `name` is stated explicitly rather than inherited from upstream,
# and the difftime age is converted to numeric days so ggplot can pick a scale.
release_dates_df %>%
  group_by(name) %>%
  summarise(
    num_rel = n(),
    PackageAge = as.numeric(max(PackageAge), units = "days")
  ) %>%
  ggplot(aes(x = PackageAge, y = num_rel)) +
  geom_hex()
Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Histogram of the number of rows (releases) per group of release_dates_df,
# drawn on a square-root x axis.
ggplot(
  data = summarise(
    release_dates_df,
    num_rel = n(),
    PackageAge = max(PackageAge)
  ),
  mapping = aes(x = num_rel)
) +
  geom_histogram(bins = 50) +
  scale_x_sqrt()

# Histogram of package ages, one value per distinct (name, PackageAge) pair,
# on a log10 x axis.
ggplot(
  data = distinct(release_dates_df, name, PackageAge),
  mapping = aes(x = as.numeric(PackageAge))
) +
  geom_histogram(bins = 50) +
  scale_x_log10() +
  xlab("Package Ages (days)")

# Hex-binned median release gap per package against its release count.
# Rows dated at the first release are removed before summarising, hence the
# "+ 1" when counting releases.
release_dates_df %>%
  filter(date > first_release) %>%
  group_by(name) %>%
  summarise(
    num_rel = 1 + n(),
    MediandT = median(dT)
  ) %>%
  ggplot(mapping = aes(x = num_rel, y = MediandT)) +
  geom_hex()

# Hex-binned median release gap per package against the package's current age,
# ignoring rows dated at the first release.
# The summary column is a median, so it is named MediandT (it was labelled
# "AvgdT"); PackageAge is converted from difftime to numeric days so ggplot
# can pick a scale for it.
release_dates_df %>%
  group_by(name) %>%
  filter(date > first_release) %>%
  summarise(
    PackageAge = as.numeric(max(PackageAge), units = "days"),
    MediandT = median(dT)
  ) %>%
  ggplot(aes(x = PackageAge, y = MediandT)) +
  geom_hex()
Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Mean release gap for each release number, with error bars of one standard
# deviation either side of the mean.
release_dates_df %>%
  group_by(rel_num) %>%
  summarise(
    mdT = mean(dT),
    sd = sd(dT),
    meddT = median(dT)
  ) %>%
  ggplot(mapping = aes(x = rel_num, y = mdT)) +
  geom_point() +
  geom_errorbar(mapping = aes(ymin = mdT - sd, ymax = mdT + sd)) +
  ylab("Avg dT")

# Histogram of the per-package standard deviation of dT, for packages with
# more than 10 rows (n() > 10), on a log10 x axis.
# The labels now describe what is plotted: standard deviations (not
# variances) and "more than 10" to match the filter.
release_dates_df %>%
  group_by(name) %>%
  filter(n() > 10) %>%
  summarise(
    mdT = mean(dT),
    sd = sd(dT),
    meddT = median(dT)
  ) %>%
  ggplot(aes(x = sd)) +
  geom_histogram(binwidth = 0.01) +
  scale_x_log10() +
  labs(
    x = "Release interval std. dev.",
    title = "Distribution of package release interval standard deviations",
    subtitle = "Packages with more than 10 releases"
  )

# Mean release gap by package age in whole years (PackageAge / 365.25, rounded
# up), excluding rows dated at the first release; error bars span one standard
# deviation either side of the mean.
release_dates_df %>%
  filter(date > first_release) %>%
  mutate(Years = ceiling(as.numeric(PackageAge) / 365.25)) %>%
  group_by(Years) %>%
  summarise(
    meddT = median(dT),
    sddT = sd(dT),
    AvgdT = mean(dT)
  ) %>%
  ggplot(mapping = aes(x = Years, y = AvgdT)) +
  geom_point() +
  geom_errorbar(mapping = aes(ymin = AvgdT - sddT, ymax = AvgdT + sddT))

# Release timeline (package age at each release) for a hand-picked set of
# well-known packages, one row of points per package.
# The data is ungrouped first so that `name` is not modified while it is a
# grouping column, the unused group index is no longer computed, and
# ReleaseAge is converted from difftime to numeric days for the x scale.
release_dates_df %>%
  ungroup() %>%
  filter(name %in% c("ggplot2","arules", "lme4","dplyr","tidyr","reshape2","lubridate","devtools", "data.table","pheatmap")) %>%
  mutate(
    name = factor(name),
    ReleaseAge = as.numeric(ReleaseAge, units = "days")
  ) %>%
  ggplot(aes(x = ReleaseAge, y = name)) +
  geom_point()
Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Standard deviation of the release gap per (package, age-in-years) group,
# keeping only groups with more than 3 rows.
# A single point layer is drawn: the former translucent layer was completely
# covered by the opaque one on top of it, so the plot is unchanged.
# `.groups = "drop"` states the grouping of the summary explicitly.
release_dates_df %>%
  mutate(Years = ceiling(as.numeric(PackageAge) / 365.25)) %>%
  group_by(name, Years) %>%
  filter(n() > 3) %>%
  summarise(sd = sd(dT), .groups = "drop") %>%
  ggplot(aes(x = Years, y = sd)) +
  geom_point() +
  labs(
    x = "Age of Package",
    title = "StdDev of version release cycle",
    y = "Standard Deviation"
  )
`summarise()` has grouped output by 'name'. You can override using the
`.groups` argument.

# Frequency of version-string layouts: every digit is replaced by "0" so that
# versions collapse to their shape, listed most common first.
sort(
  table(gsub("\\d", "0", release_dates_df$version)),
  decreasing = TRUE
)

          0.0.0             0.0          0.0.00          0.00.0            0.00 
          85730           17677            7921            4222            3080 
        0.0.0.0           0.0-0         0.00.00          0.0-00         0.0.000 
           2400            1451             564             256             255 
       0.00.0.0        0.0.00.0        0000.0.0       0000.0.00        0.0.0.00 
            221             193             186             155             154 
        0000.00         0.000.0       0.0.0.0.0          000.00        0.0.0000 
            127              88              86              85              81 
    0.000000.00          00.0.0       0000.00.0      0000.00.00         0.0-0.0 
             65              65              64              60              59 
    0.0.000.0.0         0.0.0-0           0.000        0.00.000          0.00-0 
             57              55              54              52              51 
       0.000.00          0000.0       0.0.000.0        0.0000.0      0.0.00.0.0 
             39              35              28              27              26 
      000.00000         00.0.00       0.0.00.00       0.00.0.00      0.00.0.0.0 
             26              25              24              24              22 
      0.00.00.0   0.00.00000000        00.0.0.0       0.0.0.000      0.0.0.0000 
             21              21              20              18              17 
         0.0000        0000.000       0.00000.0            00.0        0.0-00.0 
             17              17              15              15              14 
        0.00-00       0.00.0000      0.0.00.000    0.0.00000000      0.000000.0 
             14              13              12              12              12 
        00.00.0        0.0.0-00       0.0.00000        0000.0-0      0.0.0.00.0 
             11               9               9               9               8 
      0.000.0.0    0.0.0.0.00.0        0.00.0-0      0.00.000.0       0.0000.00 
              8               7               7               6               6 
     0000.0.0.0       0000.00-0      0.0.0.0.00  0.0.0.00000000      0.00.00.00 
              6               6               5               5               5 
       00.00.00        0.0-00-0         0.0-000       0.0.0-0.0     0.0.0.0.0.0 
              5               4               4               4               4 
       0.0.00-0     0000.0.00.0      0000.00-00        0.0-0.00     0.0000000.0 
              4               4               4               3               3 
      0000.0-00      0000.000.0       0.0-0.0.0       0.0.00-00      0.0.000000 
              3               3               2               2               2 
      0.00-0.00          00.0-0        0000-0.0       0000-0.00             0-0 
              2               2               2               2               1 
           0-00         0.0-0-0     0.0-0-0.0.0        0.0-0000    0.0-00000000 
              1               1               1               1               1 
    0.0.0-0.0.0    0.0.0.0-00.0      0.0.00-0.0  0.0.00.00.0.00     0.0.00.0000 
              1               1               1               1               1 
       0.00-0-0        0.00-0.0        0.00-000      0.00.000-0     0.00.000.00 
              1               1               1               1               1 
0.00.00000000.0         0.000-0      0.000.00.0      0.00000.00        00.0-0.0 
              1               1               1               1               1 
       00.0.0-0       00.0.0.00      00.0.0.000         00.00-0     000.00000.0 
              1               1               1               1               1 
    0000-0.00.0      0000-00-00    0000-00-00-0       0000-00.0      0000-00.00 
              1               1               1               1               1 
     0000.0-0.0      0000.0.000     0000.00.0.0    0000.00.00.0      00000000.0 
              1               1               1               1               1 
# Keep versions that start with a one- or two-digit component followed by a
# dot, then split the version string into numeric major / minor / patch
# columns (the original `version` column is kept).
# Components beyond the third are dropped (extra = "drop"); versions with
# fewer than three components get NA on the right (fill = "right"), which is
# the same result as before but stated explicitly instead of via a warning.
release_dates_df <- release_dates_df %>%
  filter(grepl("^\\d{1,2}\\.", version)) %>%
  group_by(name) %>%
  separate(
    version,
    into = c("major", "minor", "patch"),
    remove = FALSE,
    extra = "drop",
    fill = "right"
  ) %>%
  mutate(across(major:patch, as.numeric))
Warning: Expected 3 pieces. Missing pieces filled with `NA` in 20843 rows [3,
20, 25, 26, 27, 28, 38, 39, 40, 41, 42, 48, 50, 80, 90, 107, 127, 131, 133,
134, ...].
# Time between the first releases of successive major versions, per package,
# shown as one boxplot per major version number.
# with_ties = FALSE keeps exactly one row per (package, major) even when
# several releases share the earliest date, so n() counts major versions.
release_dates_df %>%
  group_by(name, major) %>%
  slice_min(date, with_ties = FALSE) %>%
  group_by(name) %>%
  filter(n() > 2) %>%
  mutate(dT.majRel = c(0, diff(date))) %>%
  # Drops the leading 0 of each package (and any non-positive gap).
  filter(dT.majRel > 0) %>%
  ggplot(aes(x = major, y = dT.majRel, group = major)) +
  geom_boxplot()

# dT at the first release of every (package, major, minor) series, shown as
# one boxplot per major version on a square-root y axis.
# with_ties = FALSE keeps a single row per series when several releases share
# the earliest date, instead of also plotting the tied duplicates.
release_dates_df %>%
  group_by(name, major, minor) %>%
  slice_min(date, with_ties = FALSE) %>%
  filter(date > first_release) %>%
  ggplot(aes(x = major, y = dT, group = major)) +
  geom_boxplot() +
  scale_y_sqrt() +
  labs(title = "Days between (minor) releases")

# Number of minor-version series per (package, major version), shown as one
# boxplot per major version.
# with_ties = FALSE keeps one row per (package, major, minor) so tied earliest
# dates do not inflate the count; `.groups = "drop"` states the grouping of
# the summary explicitly.
release_dates_df %>%
  group_by(name, major, minor) %>%
  slice_min(date, with_ties = FALSE) %>%
  filter(date > first_release) %>%
  group_by(name, major) %>%
  summarise(n = n(), .groups = "drop") %>%
  ggplot(aes(x = major, y = n, group = major)) +
  geom_boxplot() +
  #scale_y_sqrt() +
  labs(title = "Number of (minor) releases between major releases")
`summarise()` has grouped output by 'name'. You can override using the
`.groups` argument.