Commits

The data I currently have on commits is the following table for each of the 4 projects. The one below displays Apache Storm.

library(DT)
# Preview of the Storm commit table, with columns renamed to self-describing
# labels; the widget shows five rows per page.
example <- storm[, .(
  commitAuthor      = author,
  commitID          = id,
  commitDate,
  commitDescription = description,
  modifiedFilePath  = entity,
  fileChurn         = size
)]
datatable(example, options = list(pageLength = 5))

Each row represents a file modified by an author through a commit with an associated churn (lines added + lines removed). A row is therefore uniquely identified by the commitID and the file.

Count Missing Issue IDs within Time Window (12 months)

Remember that there is also a sub-setting step from the intersection to the VCS, for which I do not have the data.

First we define a function that counts the number of commit labels that contain an issue ID.

# Count the commit labels that contain an issue ID.
#
# An issue ID is one or more upper-case letters, a hyphen, then one or more
# digits (e.g. "STORM-123"). Despite the function's name, the returned value
# is the number of labels that DO contain such an ID.
#
# commitLabels: character vector of commit messages.
# Returns: integer count of elements in which the pattern occurs.
nrMissingMatching <- function(commitLabels) {
  issueIdPattern <- '[A-Z]+-[0-9]+'
  hasIssueId <- grepl(issueIdPattern, commitLabels)
  length(which(hasIssueId))
}

Then we apply it to the commit labels of all 4 projects.

# Per-project share of commits whose message carries no issue ID.
#
# The raw tables hold one row per (commit, modified file) pair, so the commit
# messages are de-duplicated on commitHash (the commit identifier used for the
# commit counts elsewhere in this report) before counting. Counting raw rows
# would weight every commit by the number of files it touches.
projects <- list(Camel = camel, Cassandra = cassandra, Flink = flink, Storm = storm)
commitLabels <- lapply(projects, function(p) {
  # Fail loudly rather than silently producing zero counts.
  stopifnot("commitHash" %in% names(p))
  p$description[!duplicated(p$commitHash)]
})

summary <- data.table(
  Project = names(projects),
  # nrMissingMatching() returns the number of labels that contain an issue ID.
  nrCommitsContainIssueID = vapply(commitLabels, nrMissingMatching, integer(1),
                                   USE.NAMES = FALSE),
  nrAllCommits = vapply(commitLabels, length, integer(1), USE.NAMES = FALSE)
)
summary$percentMissingLabels <- 1 - (summary$nrCommitsContainIssueID / summary$nrAllCommits)
# Worst offender (highest share of unlabeled commits) first.
summary <- summary[order(-percentMissingLabels)]
kable(summary)

Project	nrCommitsContainIssueID	nrAllCommits	percentMissingLabels
Storm	2197	5658	0.6117002
Flink	22833	30587	0.2535064
Camel	12739	15312	0.1680381
Cassandra	2673	2908	0.0808116

Apache Storm's proportion of missing mappings is a bit alarming (over half!). The others seem to be OK.

Number of Commits Over Time in 12-Month Time Window

library(lubridate)
library(dygraphs)

# Parse each project's commitDate strings into date-times with
# lubridate::ymd_hms() and store the result in a new Date column.
camel$Date     <- ymd_hms(camel[["commitDate"]])
cassandra$Date <- ymd_hms(cassandra[["commitDate"]])
flink$Date     <- ymd_hms(flink[["commitDate"]])
storm$Date     <- ymd_hms(storm[["commitDate"]])

Camel

# Distinct commits per calendar day for Camel.
# The grouping key is the day of each commit timestamp (not its hour), so every
# point on the chart is a real date. Rows whose commitDate failed to parse
# (NA Date) are dropped before plotting.
camel1 <- camel[, .(nrCommits = length(unique(commitHash))), by = .(Date = as.Date(Date))]
camel1 <- camel1[!is.na(Date)][order(Date)]
dygraph(camel1, main = "Number of Commits over time")

Cassandra

# Distinct commits per calendar day for Cassandra.
# The grouping key is the day of each commit timestamp (not its hour), so every
# point on the chart is a real date. Rows whose commitDate failed to parse
# (NA Date) are dropped before plotting.
cassandra1 <- cassandra[, .(nrCommits = length(unique(commitHash))), by = .(Date = as.Date(Date))]
cassandra1 <- cassandra1[!is.na(Date)][order(Date)]
dygraph(cassandra1, main = "Number of Commits over time")

Flink

# Distinct commits per calendar day for Flink.
# The grouping key is the day of each commit timestamp (not its hour), so every
# point on the chart is a real date. Rows whose commitDate failed to parse
# (NA Date) are dropped before plotting.
flink1 <- flink[, .(nrCommits = length(unique(commitHash))), by = .(Date = as.Date(Date))]
flink1 <- flink1[!is.na(Date)][order(Date)]
dygraph(flink1, main = "Number of Commits over time")

Storm

# Distinct commits per calendar day for Storm.
# The grouping key is the day of each commit timestamp (not its hour), so every
# point on the chart is a real date. Rows whose commitDate failed to parse
# (NA Date) are dropped before plotting.
storm1 <- storm[, .(nrCommits = length(unique(commitHash))), by = .(Date = as.Date(Date))]
storm1 <- storm1[!is.na(Date)][order(Date)]
dygraph(storm1, main = "Number of Commits over time")

Authors Rank

Extract the IssueID:

library(stringr)
# Issue-ID pattern: upper-case project key, hyphen, issue number (e.g. "STORM-123").
r <- '[A-Z]+-[0-9]+'
# str_extract() returns a plain character vector holding the first match per
# commit message (NA when there is none), which is the shape a table column
# needs; str_match() would return a character matrix instead.
camel$issueId <- str_extract(camel$description, r)
cassandra$issueId <- str_extract(cassandra$description, r)
flink$issueId <- str_extract(flink$description, r)
storm$issueId <- str_extract(storm$description, r)

Camel

# Per-author activity for Camel: distinct commits, distinct files touched, and
# distinct issue IDs referenced. Commit messages without an issue ID carry NA
# in issueId; NA is excluded so it is not counted as an issue of its own.
camel1 <- camel[, .(
  nrCommits = length(unique(commitHash)),
  nrDistinctModifiedFiles = length(unique(entity)),
  nrDistinctIssuesCommented = length(unique(issueId[!is.na(issueId)]))
), by = author]
datatable(camel1, options = list(pageLength = 5))

Amount of Authors: 24

Cassandra

# Per-author activity for Cassandra: distinct commits, distinct files touched,
# and distinct issue IDs referenced. Commit messages without an issue ID carry
# NA in issueId; NA is excluded so it is not counted as an issue of its own.
cassandra1 <- cassandra[, .(
  nrCommits = length(unique(commitHash)),
  nrDistinctModifiedFiles = length(unique(entity)),
  nrDistinctIssuesCommented = length(unique(issueId[!is.na(issueId)]))
), by = author]
datatable(cassandra1, options = list(pageLength = 5))

Amount of Authors: 61

Flink

# Per-author activity for Flink: distinct commits, distinct files touched, and
# distinct issue IDs referenced. Commit messages without an issue ID carry NA
# in issueId; NA is excluded so it is not counted as an issue of its own.
flink1 <- flink[, .(
  nrCommits = length(unique(commitHash)),
  nrDistinctModifiedFiles = length(unique(entity)),
  nrDistinctIssuesCommented = length(unique(issueId[!is.na(issueId)]))
), by = author]
datatable(flink1, options = list(pageLength = 5))

Amount of Authors: 91

Storm

# Per-author activity for Storm: distinct commits, distinct files touched, and
# distinct issue IDs referenced. Commit messages without an issue ID carry NA
# in issueId; NA is excluded so it is not counted as an issue of its own.
storm1 <- storm[, .(
  nrCommits = length(unique(commitHash)),
  nrDistinctModifiedFiles = length(unique(entity)),
  nrDistinctIssuesCommented = length(unique(issueId[!is.na(issueId)]))
), by = author]
datatable(storm1, options = list(pageLength = 5))

Amount of Authors: 74

Number of authors involved in the commits over time

Camel

# Distinct authors per calendar day for Camel.
# The grouping key is the day of each commit timestamp (not its hour), so every
# point on the chart is a real date. Rows whose commitDate failed to parse
# (NA Date) are dropped before plotting.
camel1 <- camel[, .(nrAuthors = length(unique(author))), by = .(Date = as.Date(Date))]
camel1 <- camel1[!is.na(Date)][order(Date)]
dygraph(camel1, main = "Number of Authors over time")

Cassandra

# Distinct authors per calendar day for Cassandra.
# The grouping key is the day of each commit timestamp (not its hour), so every
# point on the chart is a real date. Rows whose commitDate failed to parse
# (NA Date) are dropped before plotting.
cassandra1 <- cassandra[, .(nrAuthors = length(unique(author))), by = .(Date = as.Date(Date))]
cassandra1 <- cassandra1[!is.na(Date)][order(Date)]
dygraph(cassandra1, main = "Number of Authors over time")

Flink

# Distinct authors per calendar day for Flink.
# The grouping key is the day of each commit timestamp (not its hour), so every
# point on the chart is a real date. Rows whose commitDate failed to parse
# (NA Date) are dropped before plotting.
flink1 <- flink[, .(nrAuthors = length(unique(author))), by = .(Date = as.Date(Date))]
flink1 <- flink1[!is.na(Date)][order(Date)]
dygraph(flink1, main = "Number of Authors over time")

Storm

# Distinct authors per calendar day for Storm.
# The grouping key is the day of each commit timestamp (not its hour), so every
# point on the chart is a real date. Rows whose commitDate failed to parse
# (NA Date) are dropped before plotting.
storm1 <- storm[, .(nrAuthors = length(unique(author))), by = .(Date = as.Date(Date))]
storm1 <- storm1[!is.na(Date)][order(Date)]
dygraph(storm1, main = "Number of Authors over time")

Conway Data Sanity Check

Carlos V. A. Silva

March 5, 2016

Commits

Count Missing Issue IDs within Time Window (12 months)

Number of Commits Over Time in 12-Month Time Window

Camel

Cassandra

Flink

Storm

Authors Rank

Camel

Cassandra

Flink

Storm

Number of authors involved in the commits over time

Camel

Cassandra

Flink

Storm