library(knitr)
library(data.table)
# Load the per-project commit exports (comma-separated, first row holds the
# column names) into one data.table per project.
camel <- fread("camel_commit_data.csv", header = TRUE, sep = ",")
cassandra <- fread("cassandra_commit_data.csv", header = TRUE, sep = ",")
flink <- fread("flink_commit_data.csv", header = TRUE, sep = ",")
storm <- fread("storm_commit_data.csv", header = TRUE, sep = ",")

Commits

The data I currently have on commits is the following table for each of the 4 projects. The one below displays Apache Storm.

library(DT)
# Build a display copy of the Storm commit table, renaming the raw columns to
# more descriptive headers, and render it as an interactive table showing
# five rows per page.
example <- storm[, list(
  commitAuthor = author,
  commitID = id,
  commitDate,
  commitDescription = description,
  modifiedFilePath = entity,
  fileChurn = size
)]
datatable(example, options = list(pageLength = 5))

Each row represents a file modified by an author through a commit with an associated churn (lines added + lines removed). A row is therefore uniquely identified by the commitID and the file.

Count Missing Issue IDs within Time Window (12 months)

Remember that there is also a sub-setting step from this intersection to the VCS, for which I do not have the data.

First we define a function that counts the number of commit labels that contain an issue ID.

# Count how many commit labels contain an issue key of the form
# "<UPPERCASE LETTERS>-<digits>" (e.g. "STORM-123").
#
# Note: despite the name, the value returned is the number of labels that DO
# match the pattern, not the number that are missing an issue key.
#
# commitLabels: character vector of commit descriptions.
# Returns: a single integer, the number of matching labels (NA labels never
#          match).
nrMissingMatching <- function(commitLabels) {
  issue_key_pattern <- "[A-Z]+-[0-9]+"
  sum(grepl(issue_key_pattern, commitLabels))
}

Then we apply it to the commit labels of all 4 projects.

# Per-project overview: how many rows carry an issue key in their description,
# how many rows there are in total, and the share of rows without an issue key.
# NOTE(review): the input tables hold one row per modified file, so both
# counts are per file-modification row rather than per distinct commit --
# confirm that this is the intended denominator.
summary <- data.table(
  Project = c("Camel", "Cassandra", "Flink", "Storm"),
  nrCommitsContainIssueID = vapply(
    list(camel, cassandra, flink, storm),
    function(project) nrMissingMatching(project$description),
    integer(1)
  ),
  nrAllCommits = vapply(list(camel, cassandra, flink, storm), nrow, integer(1))
)
# Share of rows whose description has no issue key.
summary$percentMissingLabels <- with(
  summary,
  1 - (nrCommitsContainIssueID / nrAllCommits)
)
# Worst-labelled project first.
summary <- summary[order(-percentMissingLabels)]
kable(summary)
Project nrCommitsContainIssueID nrAllCommits percentMissingLabels
Storm 2197 5658 0.6117002
Flink 22833 30587 0.2535064
Camel 12739 15312 0.1680381
Cassandra 2673 2908 0.0808116

Apache Storm's proportion of missing mappings is a bit alarming (over half!). The others seem to be OK.

Number of Commits Over Time in the 12-Month Time Window

library(lubridate)
library(dygraphs)

# Extract Date: parse the textual commit timestamp of every project into a
# date-time column named `Date`, used by the time-series sections below.
set(camel, j = "Date", value = ymd_hms(camel$commitDate))
set(cassandra, j = "Date", value = ymd_hms(cassandra$commitDate))
set(flink, j = "Date", value = ymd_hms(flink$commitDate))
set(storm, j = "Date", value = ymd_hms(storm$commitDate))

Camel

# Number of distinct commits per calendar day for Camel.
# The grouping key used to be year-month-HOUR, which was then parsed as a
# year-month-day date: commits were attributed to the wrong day and every
# commit made during hour 0 turned into NA and was dropped. Group on the
# calendar day of the commit timestamp instead.
# Rows whose timestamp could not be parsed (NA Date) are excluded.
# NOTE(review): the example table exposes the commit ID as column `id`;
# confirm that `commitHash` is also present in the data.
camel1 <- camel[
  !is.na(Date),
  .(nrCommits = length(unique(commitHash))),
  by = .(Date = as.Date(Date))
][order(Date)]
dygraph(camel1, main = "Number of Commits over time")

Cassandra

# Number of distinct commits per calendar day for Cassandra.
# The grouping key used to be year-month-HOUR, which was then parsed as a
# year-month-day date: commits were attributed to the wrong day and every
# commit made during hour 0 turned into NA and was dropped. Group on the
# calendar day of the commit timestamp instead.
# Rows whose timestamp could not be parsed (NA Date) are excluded.
# NOTE(review): the example table exposes the commit ID as column `id`;
# confirm that `commitHash` is also present in the data.
cassandra1 <- cassandra[
  !is.na(Date),
  .(nrCommits = length(unique(commitHash))),
  by = .(Date = as.Date(Date))
][order(Date)]
dygraph(cassandra1, main = "Number of Commits over time")

Storm

# Number of distinct commits per calendar day for Storm.
# The grouping key used to be year-month-HOUR, which was then parsed as a
# year-month-day date: commits were attributed to the wrong day and every
# commit made during hour 0 turned into NA and was dropped. Group on the
# calendar day of the commit timestamp instead.
# Rows whose timestamp could not be parsed (NA Date) are excluded.
# NOTE(review): the example table exposes the commit ID as column `id`;
# confirm that `commitHash` is also present in the data.
storm1 <- storm[
  !is.na(Date),
  .(nrCommits = length(unique(commitHash))),
  by = .(Date = as.Date(Date))
][order(Date)]
dygraph(storm1, main = "Number of Commits over time")

Authors Rank

Extract the IssueID:

library(stringr)
# Pattern for an issue key of the form "<UPPERCASE LETTERS>-<digits>".
r <- "[A-Z]+-[0-9]+"
# Store the first issue key found in each commit description (NA when the
# description contains none). str_extract() is used rather than str_match()
# because str_match() returns a character matrix, which is not a suitable
# column value; str_extract() returns a plain character vector.
camel$issueId <- str_extract(camel$description, r)
cassandra$issueId <- str_extract(cassandra$description, r)
flink$issueId <- str_extract(flink$description, r)
storm$issueId <- str_extract(storm$description, r)

Camel

# Per-author activity for Camel: distinct commits, distinct files touched and
# distinct issue keys referenced in commit descriptions.
# Rows without an issue key have issueId == NA; those are removed before
# counting so that "no issue" is not counted as one extra distinct issue.
camel1 <- camel[, .(
  nrCommits = length(unique(commitHash)),
  nrDistinctModifiedFiles = length(unique(entity)),
  nrDistinctIssuesCommented = length(unique(issueId[!is.na(issueId)]))
), by = author]
datatable(camel1, options = list(pageLength = 5))

Amount of Authors: 24

Cassandra

# Per-author activity for Cassandra: distinct commits, distinct files touched
# and distinct issue keys referenced in commit descriptions.
# Rows without an issue key have issueId == NA; those are removed before
# counting so that "no issue" is not counted as one extra distinct issue.
cassandra1 <- cassandra[, .(
  nrCommits = length(unique(commitHash)),
  nrDistinctModifiedFiles = length(unique(entity)),
  nrDistinctIssuesCommented = length(unique(issueId[!is.na(issueId)]))
), by = author]
datatable(cassandra1, options = list(pageLength = 5))

Amount of Authors: 61

Storm

# Per-author activity for Storm: distinct commits, distinct files touched and
# distinct issue keys referenced in commit descriptions.
# Rows without an issue key have issueId == NA; those are removed before
# counting so that "no issue" is not counted as one extra distinct issue.
storm1 <- storm[, .(
  nrCommits = length(unique(commitHash)),
  nrDistinctModifiedFiles = length(unique(entity)),
  nrDistinctIssuesCommented = length(unique(issueId[!is.na(issueId)]))
), by = author]
datatable(storm1, options = list(pageLength = 5))

Amount of Authors: 74

Number of authors involved in the commits over time

Camel

# Number of distinct authors committing per calendar day for Camel.
# The grouping key used to be year-month-HOUR, which was then parsed as a
# year-month-day date: activity was attributed to the wrong day and everything
# committed during hour 0 turned into NA and was dropped. Group on the
# calendar day of the commit timestamp instead.
# Rows whose timestamp could not be parsed (NA Date) are excluded.
camel1 <- camel[
  !is.na(Date),
  .(nrAuthors = length(unique(author))),
  by = .(Date = as.Date(Date))
][order(Date)]
dygraph(camel1, main = "Number of Authors over time")

Cassandra

# Number of distinct authors committing per calendar day for Cassandra.
# The grouping key used to be year-month-HOUR, which was then parsed as a
# year-month-day date: activity was attributed to the wrong day and everything
# committed during hour 0 turned into NA and was dropped. Group on the
# calendar day of the commit timestamp instead.
# Rows whose timestamp could not be parsed (NA Date) are excluded.
cassandra1 <- cassandra[
  !is.na(Date),
  .(nrAuthors = length(unique(author))),
  by = .(Date = as.Date(Date))
][order(Date)]
dygraph(cassandra1, main = "Number of Authors over time")

Storm

# Number of distinct authors committing per calendar day for Storm.
# The grouping key used to be year-month-HOUR, which was then parsed as a
# year-month-day date: activity was attributed to the wrong day and everything
# committed during hour 0 turned into NA and was dropped. Group on the
# calendar day of the commit timestamp instead.
# Rows whose timestamp could not be parsed (NA Date) are excluded.
storm1 <- storm[
  !is.na(Date),
  .(nrAuthors = length(unique(author))),
  by = .(Date = as.Date(Date))
][order(Date)]
dygraph(storm1, main = "Number of Authors over time")