Loading the libraries

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(openxlsx)
library(dplyr)
library(zoo)

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(varhandle)

Let’s first define our problem

For this project, we want to look at the number of pull requests by year and quarter for various programming languages. This should give an interesting indicator of how activity in each language changes over time.

A first look at the data set

# Location of the pull-request counts file (a relative path, so it is
# resolved against the current working directory).
url.data <- "prs.csv"

# Read the CSV into a data frame, then display it.
raw <- read.csv(file = url.data)
raw

Now let’s look into the data set

First things first, let’s start by looking at the raw data just to see what things look like. An interesting note is that this data covers 2011 through 2022, with approximately 200 languages.

# Distinct language names present in the raw data, in sorted order.
unique(sort(raw$name))

##   [1] "1C Enterprise"            "ABAP"                    
##   [3] "ActionScript"             "Ada"                     
##   [5] "AGS Script"               "ApacheConf"              
##   [7] "Apex"                     "API Blueprint"           
##   [9] "Arduino"                  "ASP"                     
##  [11] "Assembly"                 "AutoHotkey"              
##  [13] "AutoIt"                   "Batchfile"               
##  [15] "BitBake"                  "Blade"                   
##  [17] "BlitzBasic"               "C"                       
##  [19] "C#"                       "C++"                     
##  [21] "Ceylon"                   "Clojure"                 
##  [23] "CMake"                    "COBOL"                   
##  [25] "CodeQL"                   "CoffeeScript"            
##  [27] "ColdFusion"               "Common Lisp"             
##  [29] "Common Workflow Language" "Component Pascal"        
##  [31] "Coq"                      "Crystal"                 
##  [33] "CSS"                      "Cucumber"                
##  [35] "Cuda"                     "CWeb"                    
##  [37] "D"                        "Dart"                    
##  [39] "Delphi"                   "DIGITAL Command Language"
##  [41] "DM"                       "Dockerfile"              
##  [43] "Eagle"                    "Eiffel"                  
##  [45] "Elixir"                   "Elm"                     
##  [47] "Emacs Lisp"               "Erlang"                  
##  [49] "F#"                       "F*"                      
##  [51] "Fortran"                  "FORTRAN"                 
##  [53] "FreeMarker"               "Game Maker Language"     
##  [55] "GAP"                      "GCC Machine Description" 
##  [57] "GDScript"                 "Genshi"                  
##  [59] "Gherkin"                  "GLSL"                    
##  [61] "Go"                       "Groff"                   
##  [63] "Groovy"                   "Hack"                    
##  [65] "Haskell"                  "Haxe"                    
##  [67] "HCL"                      "HTML"                    
##  [69] "IDL"                      "Java"                    
##  [71] "JavaScript"               "Jinja"                   
##  [73] "JSON"                     "Jsonnet"                 
##  [75] "Julia"                    "Jupyter Notebook"        
##  [77] "KiCad"                    "Kotlin"                  
##  [79] "LabVIEW"                  "Lean"                    
##  [81] "Liquid"                   "LiveScript"              
##  [83] "LLVM"                     "Logos"                   
##  [85] "Lua"                      "M4"                      
##  [87] "Makefile"                 "Mako"                    
##  [89] "Markdown"                 "Mathematica"             
##  [91] "Matlab"                   "MATLAB"                  
##  [93] "Meson"                    "mIRC Script"             
##  [95] "Modelica"                 "MoonScript"              
##  [97] "Mustache"                 "nesC"                    
##  [99] "Nginx"                    "Nim"                     
## [101] "Nix"                      "NSIS"                    
## [103] "Nunjucks"                 "Objective-C"             
## [105] "Objective-C++"            "Objective-J"             
## [107] "OCaml"                    "ooc"                     
## [109] "OpenEdge ABL"             "OpenSCAD"                
## [111] "Pan"                      "Pascal"                  
## [113] "Pawn"                     "Perl"                    
## [115] "Perl 6"                   "Perl6"                   
## [117] "PHP"                      "PLpgSQL"                 
## [119] "PLSQL"                    "PostScript"              
## [121] "POV-Ray SDL"              "PowerShell"              
## [123] "Processing"               "Prolog"                  
## [125] "Protocol Buffer"          "Pug"                     
## [127] "Puppet"                   "PureBasic"               
## [129] "PureScript"               "Python"                  
## [131] "q"                        "QML"                     
## [133] "R"                        "Racket"                  
## [135] "Raku"                     "Rascal"                  
## [137] "Reason"                   "Rich Text Format"        
## [139] "RobotFramework"           "Roff"                    
## [141] "Ruby"                     "Rust"                    
## [143] "SaltStack"                "SAS"                     
## [145] "Scala"                    "Scheme"                  
## [147] "SCSS"                     "Shell"                   
## [149] "Slash"                    "Smalltalk"               
## [151] "Smarty"                   "SourcePawn"              
## [153] "SQF"                      "SQL"                     
## [155] "SQLPL"                    "Standard ML"             
## [157] "Starlark"                 "Stylus"                  
## [159] "Svelte"                   "Swift"                   
## [161] "SWIG"                     "SystemVerilog"           
## [163] "Tcl"                      "TeX"                     
## [165] "Thrift"                   "TSQL"                    
## [167] "Twig"                     "TypeScript"              
## [169] "Uno"                      "UnrealScript"            
## [171] "Vala"                     "Verilog"                 
## [173] "VHDL"                     "Vim script"              
## [175] "Vim Snippet"              "VimL"                    
## [177] "Visual Basic"             "Visual Basic .NET"       
## [179] "Vue"                      "Web Ontology Language"   
## [181] "WebAssembly"              "XQuery"                  
## [183] "XSLT"                     "Xtend"                   
## [185] "YAML"                     "YARA"                    
## [187] "Zephir"

# Distinct years covered by the raw data, in ascending order.
unique(sort(raw$year))

##  [1] 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022

# Distinct quarter values present in the raw data, in ascending order.
unique(sort(raw$quarter))

## [1] 1 2 3 4

Let’s tidy the data a bit

For the most part, I want to draw a bottom line under the current data, keeping only entries with more than 100 pull requests in a quarter, since smaller entries simply do not carry enough weight.

# Keep only the language/quarter rows with more than 100 pull requests.
# which() discards rows whose count is NA; indexing with the bare logical
# vector would instead turn every NA comparison into an all-NA row.
cleanData <- raw[which(raw$count > 100), ]
cleanData

Let’s compute the first and second derivatives

# Add the first and second differences of `count` to one language's rows.
#
# The rows are sorted newest-first (descending year, then quarter).
#   growth[i]     = count[i]  - count[i + 1]   (change versus the next-older row)
#   dervgrowth[i] = growth[i] - growth[i + 1]  (change in growth)
# Rows without enough older neighbours keep 0: the oldest row for growth and
# the two oldest rows for dervgrowth. The `count` column is dropped from the
# result.
#
# Building the columns as whole vectors (rather than `df$col <- 0` followed by
# a `1:(n - 2)` loop) keeps this correct for small groups:
#   * 0 rows: assigning a scalar column to a zero-row data frame is an error;
#   * 2 rows: `1:0` is c(1, 0), which would write a bogus dervgrowth[1].
add_growth_columns <- function(language_rows) {
  newest_first <- order(-language_rows$year, -language_rows$quarter)
  ordered <- language_rows[newest_first, , drop = FALSE]

  n <- nrow(ordered)
  growth <- numeric(n)
  dervgrowth <- numeric(n)

  if (n >= 2) {
    # diff() returns x[i + 1] - x[i]; negate it to get newer minus older.
    growth[seq_len(n - 1)] <- -diff(ordered$count)
  }
  if (n >= 3) {
    # Only the first n - 2 rows have two real growth values to compare.
    dervgrowth[seq_len(n - 2)] <- -diff(growth)[seq_len(n - 2)]
  }

  ordered$growth <- growth
  ordered$dervgrowth <- dervgrowth
  ordered[, names(ordered) != "count", drop = FALSE]
}

# Process every language in the order it first appears in the raw data.
# Languages whose rows were all filtered out of cleanData yield zero-row
# pieces, which rbind() ignores.
language_names <- as.character(unique(raw$name))

per_language <- lapply(language_names, function(item) {
  # which() ignores NA names instead of producing all-NA rows.
  add_growth_columns(cleanData[which(cleanData$name == item), , drop = FALSE])
})

# Bind once at the end rather than growing the result inside the loop.
if (length(per_language) > 0) {
  final_data <- do.call(rbind, per_language)
} else {
  # No languages at all: keep an empty frame with the expected columns.
  final_data <- add_growth_columns(cleanData[0, , drop = FALSE])
}

# Display the combined per-language growth table.
final_data

The Conclusion

At this point we have the data massaged and tidy, with the first and second derivatives calculated for each reporting period. An easy extension to this would be to look at the time elapsed between consecutive quarters for each language, to ensure that each time period is weighted the same.

At this point we can simply save the data and finish project 2!

# Save the growth table to disk without the row-name column.
write.csv(
  x = final_data,
  file = "Coding_Language_growth.csv",
  row.names = FALSE
)

Data 607 Project-Git Data