My GitHub commits

Introduction

This doc explores my use of GitHub since the beginning of 2016. We’ll start by loading a whole bunch of packages we’ll need to get the data.

library(gh) # devtools::install_github("gaborcsardi/gh")
library(purrr)
library(tibble)
library(dplyr)
library(readr)
library(lubridate)

Data import

I start by getting a list of the 100 repos that I’ve touched most recently:

# List repositories visible to the authenticated user, ordered by when they
# were last updated.
#
# type:  which kind of repos to request; one of "all", "owner", "public",
#        "private" or "member" (defaults to the first, "all").
# limit: maximum number of repos to request (passed to gh() as `.limit`).
#
# Returns whatever gh() returns for the "GET /user/repos" endpoint.
my_repos <- function(type = c("all", "owner", "public", "private", "member"),
                     limit = 100) {
  # match.arg() resolves the default vector to "all" and rejects unknown values
  repo_type <- match.arg(type)

  gh("GET /user/repos", type = repo_type, sort = "updated", .limit = limit)
}
# Request up to 100 repos that I own, then check how many came back
repos <- my_repos(type = "owner", limit = 100)
length(repos)
## [1] 100

# Pull the "full_name" field out of every repo record and peek at the first 20
full_name <- map_chr(repos, "full_name")
head(full_name, n = 20)
##  [1] "hadley/dplyr"        "hadley/profr"        "hadley/pryr"        
##  [4] "hadley/packman"      "hadley/modelr"       "hadley/ggplot2-book"
##  [7] "hadley/layers"       "hadley/plyr"         "hadley/ggplot2"     
## [10] "hadley/reshape"      "hadley/testthat"     "hadley/adv-r"       
## [13] "hadley/lubridate"    "hadley/shinySignals" "hadley/r-pkgs"      
## [16] "hadley/memoise"      "hadley/forcats"      "hadley/odbconnect"  
## [19] "hadley/readxl"       "hadley/monads"

(If you’re doing this yourself, you’ll need to make sure you’ve set up an environment variable GITHUB_PAT with a GitHub personal access token.)

And then, for each repo, I get all the commits since the start of the year. I didn’t necessarily make every commit to my own repos (because I collaborate with lots of other people), so I make sure to extract the author of each commit.

# Fetch all commits made to one repository since a given date.
#
# full_name: repository identifier in "owner/repo" form, e.g. "hadley/dplyr".
# since:     lower bound passed to the GitHub commits endpoint as `since`.
#
# Returns a tibble with one row per commit and the columns `full_name`,
# `author` (the GitHub login of the commit author) and `datetime` (the author
# date as a character string), or NULL when no commits were returned.
repo_commits <- function(full_name, since = "2016-01-01") {
  message("Requesting commits for ", full_name)
  
  commits <- gh("GET /repos/:full_name/commits", 
    full_name = full_name, 
    since = since,
    .limit = Inf
  )
  
  # Nothing came back for this repo: signal that with NULL rather than
  # building a zero-row tibble
  if (length(commits) == 0) {
    return(NULL)
  }
  
  # Use `.default` (the replacement for purrr's deprecated `.null` argument)
  # so that commits lacking an author login or date yield NA instead of
  # making map_chr() fail.
  tibble(
    full_name = full_name,
    author = commits %>% map_chr(c("author", "login"), .default = NA_character_),
    datetime = commits %>% map_chr(c("commit", "author", "date"), .default = NA_character_)
  )
}

# Fetch the commits for every repo, discard the repos that returned NULL,
# and stack the remaining per-repo tibbles into a single table
commits <- full_name %>%
  map(repo_commits) %>%
  compact() %>%
  bind_rows()
commits
## # A tibble: 4,879 × 3
##       full_name          author             datetime
##           <chr>           <chr>                <chr>
## 1  hadley/dplyr nicholasjhorton 2016-08-15T12:29:51Z
## 2  hadley/dplyr   Robinlovelace 2016-06-27T12:38:37Z
## 3  hadley/dplyr          hadley 2016-06-24T15:26:07Z
## 4  hadley/dplyr          hadley 2016-06-23T21:54:59Z
## 5  hadley/dplyr          krlmlr 2016-06-23T13:27:13Z
## 6  hadley/dplyr          hadley 2016-06-23T12:25:49Z
## 7  hadley/dplyr          hadley 2016-06-23T12:25:42Z
## 8  hadley/dplyr          krlmlr 2016-06-23T12:19:12Z
## 9  hadley/dplyr          krlmlr 2016-06-23T12:18:22Z
## 10 hadley/dplyr          hadley 2016-06-23T12:17:14Z
## # ... with 4,869 more rows

Next, I parse the commit date, and set my timezone. I break the datetime into separate date and time pieces as that will make plotting easier later on.

# Parse the timestamp strings and express them in my local timezone, then
# derive `date` (the timestamp floored to the day) and `time` (the timestamp
# with its day-of-year set to 1, so that only the clock time differs
# between rows)
commits <- mutate(
  commits,
  datetime = datetime %>%
    readr::parse_datetime() %>%
    lubridate::with_tz(tzone = "America/Chicago"),
  date = floor_date(datetime, unit = "day"),
  time = update(datetime, yday = 1)
)
commits
## # A tibble: 4,879 × 5
##       full_name          author            datetime       date
##           <chr>           <chr>              <dttm>     <dttm>
## 1  hadley/dplyr nicholasjhorton 2016-08-15 07:29:51 2016-08-15
## 2  hadley/dplyr   Robinlovelace 2016-06-27 07:38:37 2016-06-27
## 3  hadley/dplyr          hadley 2016-06-24 10:26:07 2016-06-24
## 4  hadley/dplyr          hadley 2016-06-23 16:54:59 2016-06-23
## 5  hadley/dplyr          krlmlr 2016-06-23 08:27:13 2016-06-23
## 6  hadley/dplyr          hadley 2016-06-23 07:25:49 2016-06-23
## 7  hadley/dplyr          hadley 2016-06-23 07:25:42 2016-06-23
## 8  hadley/dplyr          krlmlr 2016-06-23 07:19:12 2016-06-23
## 9  hadley/dplyr          krlmlr 2016-06-23 07:18:22 2016-06-23
## 10 hadley/dplyr          hadley 2016-06-23 07:17:14 2016-06-23
## # ... with 4,869 more rows, and 1 more variables: time <dttm>

Next, I do a couple of quick checks to make sure the data looks reasonable:

# Number of commits per repo, largest first; show the top 20 rows
commits %>%
  count(full_name, sort = TRUE) %>%
  print(n = 20)
## # A tibble: 43 × 2
##            full_name     n
##                <chr> <int>
## 1      hadley/tibble   688
## 2        hadley/r4ds   670
## 3    hadley/devtools   562
## 4    hadley/testthat   453
## 5       hadley/dplyr   387
## 6       hadley/readr   223
## 7     hadley/ggplot2   209
## 8        hadley/xml2   176
## 9    hadley/lazyeval   159
## 10       hadley/httr   144
## 11      hadley/haven   143
## 12  hadley/lubridate   130
## 13      hadley/tidyr    98
## 14     hadley/dtplyr    93
## 15 hadley/odbconnect    91
## 16      hadley/purrr    66
## 17     hadley/modelr    62
## 18    hadley/forcats    57
## 19     hadley/ggstat    54
## 20    hadley/svglite    48
## # ... with 23 more rows
# Number of commits per author, largest first
count(commits, author, sort = TRUE)
## # A tibble: 167 × 2
##            author     n
##             <chr> <int>
## 1          hadley  2398
## 2          krlmlr   943
## 3       jimhester   641
## 4          vspinu    96
## 5     garrettgman    54
## 6  romainfrancois    54
## 7           heike    47
## 8      kevinushey    44
## 9         lionel-    40
## 10     sibusiso16    39
## # ... with 157 more rows

Exploration

I need a few more packages to support my exploration.

library(ggplot2)
library(forcats)  # devtools::install_github("hadley/forcats")
library(ggbeeswarm) # devtools::install_github("eclarke/ggbeeswarm")

And then I pull out just the commits that I made. For some reason, I get a couple of commits before the start of the year, so I get rid of those:

# Keep only my own commits from the start of 2016 onwards. `date` has been
# floored to midnight, so the comparison has to be inclusive (>=): a strict
# `>` would also throw away every commit made on January 1st itself.
hadley <- commits %>% 
  filter(
    author == "hadley",
    date >= ymd(20160101, tz = "America/Chicago")
  )

hadley
## # A tibble: 2,393 × 5
##       full_name author            datetime       date                time
##           <chr>  <chr>              <dttm>     <dttm>              <dttm>
## 1  hadley/dplyr hadley 2016-06-24 10:26:07 2016-06-24 2016-01-01 10:26:07
## 2  hadley/dplyr hadley 2016-06-23 16:54:59 2016-06-23 2016-01-01 16:54:59
## 3  hadley/dplyr hadley 2016-06-23 07:25:49 2016-06-23 2016-01-01 07:25:49
## 4  hadley/dplyr hadley 2016-06-23 07:25:42 2016-06-23 2016-01-01 07:25:42
## 5  hadley/dplyr hadley 2016-06-23 07:17:14 2016-06-23 2016-01-01 07:17:14
## 6  hadley/dplyr hadley 2016-06-20 06:48:06 2016-06-20 2016-01-01 06:48:06
## 7  hadley/dplyr hadley 2016-06-17 15:38:58 2016-06-17 2016-01-01 15:38:58
## 8  hadley/dplyr hadley 2016-06-17 15:38:26 2016-06-17 2016-01-01 15:38:26
## 9  hadley/dplyr hadley 2016-06-17 15:32:26 2016-06-17 2016-01-01 15:32:26
## 10 hadley/dplyr hadley 2016-06-15 06:40:18 2016-06-15 2016-01-01 06:40:18
## # ... with 2,383 more rows

To start, let’s figure out what I’ve been working on. I’ve touched a lot of repos this year, so I’ll just look at the top 25.

# One row per repo on the y axis: order the repos by their commit dates,
# reverse that order, and lump everything outside the 25 most common repos
# into a single level before plotting each commit as a point
hadley %>%
  mutate(
    repo = full_name %>%
      fct_reorder(date) %>%
      fct_rev() %>%
      fct_lump(n = 25)
  ) %>%
  ggplot(aes(x = date, y = repo)) +
  geom_quasirandom(size = 0.5)

What times of day do I usually work on things? I hardly ever work on weekends, so here I round the date down to the start of its week. This makes it easy to see when I’m travelling!

# Time of day of each commit, grouped by the week (date floored to the
# start of the week) in which it was made
hadley %>%
  mutate(week = floor_date(date, unit = "week")) %>%
  ggplot(aes(x = week, y = time)) +
  geom_quasirandom()

Finally, we can look at my average work week by breaking down by day of week, and focussing on my usual working hours.

# Time of day of each commit by day of the week, restricted to commits made
# from the 6 o'clock hour through the 18 o'clock hour (both inclusive).
# The weekday levels are rotated by one position and then reversed; with
# lubridate's default Sunday-first labels that should put Monday at the top
# of the y axis (depends on the week-start setting in use).
hadley %>%
  mutate(
    wday = date %>%
      wday(label = TRUE) %>%
      fct_shift(n = 1) %>%
      fct_rev()
  ) %>%
  filter(between(hour(time), 6, 18)) %>%
  ggplot(aes(x = time, y = wday)) +
  geom_quasirandom()