library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openxlsx)
library(dplyr)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(varhandle)
For this project, we want to look at the number of pull requests by year and quarter for various programming languages. This should give an interesting indicator of how activity in each language changes over time.
# Path of the pull-request dataset (a relative path, so it is resolved against
# the current working directory).
url.data <- "prs.csv"
# Load the whole CSV into a data frame and echo it for a first look.
raw <- read.csv(file = url.data)
raw
First things first, let's start by looking at the raw data just to see what things look like. An interesting note is that this data covers 2011 through 2022, with 187 distinct language names.
# Distinct language names present in the raw data, in sorted order.
raw$name %>% unique() %>% sort()
## [1] "1C Enterprise" "ABAP"
## [3] "ActionScript" "Ada"
## [5] "AGS Script" "ApacheConf"
## [7] "Apex" "API Blueprint"
## [9] "Arduino" "ASP"
## [11] "Assembly" "AutoHotkey"
## [13] "AutoIt" "Batchfile"
## [15] "BitBake" "Blade"
## [17] "BlitzBasic" "C"
## [19] "C#" "C++"
## [21] "Ceylon" "Clojure"
## [23] "CMake" "COBOL"
## [25] "CodeQL" "CoffeeScript"
## [27] "ColdFusion" "Common Lisp"
## [29] "Common Workflow Language" "Component Pascal"
## [31] "Coq" "Crystal"
## [33] "CSS" "Cucumber"
## [35] "Cuda" "CWeb"
## [37] "D" "Dart"
## [39] "Delphi" "DIGITAL Command Language"
## [41] "DM" "Dockerfile"
## [43] "Eagle" "Eiffel"
## [45] "Elixir" "Elm"
## [47] "Emacs Lisp" "Erlang"
## [49] "F#" "F*"
## [51] "Fortran" "FORTRAN"
## [53] "FreeMarker" "Game Maker Language"
## [55] "GAP" "GCC Machine Description"
## [57] "GDScript" "Genshi"
## [59] "Gherkin" "GLSL"
## [61] "Go" "Groff"
## [63] "Groovy" "Hack"
## [65] "Haskell" "Haxe"
## [67] "HCL" "HTML"
## [69] "IDL" "Java"
## [71] "JavaScript" "Jinja"
## [73] "JSON" "Jsonnet"
## [75] "Julia" "Jupyter Notebook"
## [77] "KiCad" "Kotlin"
## [79] "LabVIEW" "Lean"
## [81] "Liquid" "LiveScript"
## [83] "LLVM" "Logos"
## [85] "Lua" "M4"
## [87] "Makefile" "Mako"
## [89] "Markdown" "Mathematica"
## [91] "Matlab" "MATLAB"
## [93] "Meson" "mIRC Script"
## [95] "Modelica" "MoonScript"
## [97] "Mustache" "nesC"
## [99] "Nginx" "Nim"
## [101] "Nix" "NSIS"
## [103] "Nunjucks" "Objective-C"
## [105] "Objective-C++" "Objective-J"
## [107] "OCaml" "ooc"
## [109] "OpenEdge ABL" "OpenSCAD"
## [111] "Pan" "Pascal"
## [113] "Pawn" "Perl"
## [115] "Perl 6" "Perl6"
## [117] "PHP" "PLpgSQL"
## [119] "PLSQL" "PostScript"
## [121] "POV-Ray SDL" "PowerShell"
## [123] "Processing" "Prolog"
## [125] "Protocol Buffer" "Pug"
## [127] "Puppet" "PureBasic"
## [129] "PureScript" "Python"
## [131] "q" "QML"
## [133] "R" "Racket"
## [135] "Raku" "Rascal"
## [137] "Reason" "Rich Text Format"
## [139] "RobotFramework" "Roff"
## [141] "Ruby" "Rust"
## [143] "SaltStack" "SAS"
## [145] "Scala" "Scheme"
## [147] "SCSS" "Shell"
## [149] "Slash" "Smalltalk"
## [151] "Smarty" "SourcePawn"
## [153] "SQF" "SQL"
## [155] "SQLPL" "Standard ML"
## [157] "Starlark" "Stylus"
## [159] "Svelte" "Swift"
## [161] "SWIG" "SystemVerilog"
## [163] "Tcl" "TeX"
## [165] "Thrift" "TSQL"
## [167] "Twig" "TypeScript"
## [169] "Uno" "UnrealScript"
## [171] "Vala" "Verilog"
## [173] "VHDL" "Vim script"
## [175] "Vim Snippet" "VimL"
## [177] "Visual Basic" "Visual Basic .NET"
## [179] "Vue" "Web Ontology Language"
## [181] "WebAssembly" "XQuery"
## [183] "XSLT" "Xtend"
## [185] "YAML" "YARA"
## [187] "Zephir"
# Distinct years covered by the raw data, in ascending order.
raw$year %>% unique() %>% sort()
## [1] 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022
# Distinct quarter values present in the raw data, in ascending order.
raw$quarter %>% unique() %>% sort()
## [1] 1 2 3 4
For the most part, I want to set a floor on the current data: entries with 100 or fewer pull requests in a quarter are excluded, as they simply do not carry enough weight.
# Keep only the rows with more than 100 pull requests; smaller counts are
# treated as carrying too little weight for the growth figures.
# which() discards positions where the comparison is NA, so a missing `count`
# cannot come through as an all-NA row (indexing a data frame with an NA
# logical value would add one).
cleanData <- raw[which(raw$count > 100), ]
cleanData
# Add growth columns to the rows of a single language.
#
# `lang_rows` is a data frame with `year`, `quarter` and `count` columns.
# The result holds the same rows ordered newest quarter first, with `count`
# dropped and two columns added:
#   growth     - count minus the count of the next-older retained row
#   dervgrowth - growth minus the growth of the next-older retained row
# Rows without enough older history keep 0: the oldest row for `growth`, the
# two oldest rows for `dervgrowth`.
# NOTE: differences are taken between adjacent *retained* rows; if a quarter
# was filtered out upstream, neighbours can be more than one quarter apart.
add_growth_columns <- function(lang_rows) {
  newest_first <- order(-lang_rows$year, -lang_rows$quarter)
  lang_rows <- lang_rows[newest_first, , drop = FALSE]
  n_rows <- nrow(lang_rows)

  # On newest-first data, -diff() yields x[i] - x[i + 1].
  first_diff <- -diff(lang_rows$count)   # length max(n_rows - 1, 0)
  second_diff <- -diff(first_diff)       # length max(n_rows - 2, 0)

  # Pad with zeros for the rows lacking history, then trim to n_rows.
  # seq_len() (rather than 1:n) keeps 0-, 1- and 2-row inputs correct: a
  # 2-row language gets no second difference instead of one computed from
  # the zero placeholder.
  lang_rows$growth <- c(first_diff, 0)[seq_len(n_rows)]
  lang_rows$dervgrowth <- c(second_diff, 0, 0)[seq_len(n_rows)]

  lang_rows[, names(lang_rows) != "count", drop = FALSE]
}

# Languages in their order of first appearance in `raw`, restricted to those
# that still have rows in `cleanData`. Languages with no remaining rows are
# skipped rather than processed as empty data frames.
language_names <- unique(raw$name)
language_names <- language_names[
  !is.na(language_names) & language_names %in% cleanData$name
]

# Build one data frame per language, then bind them all in a single rbind()
# instead of growing `final_data` inside a loop.
per_language <- lapply(language_names, function(item) {
  add_growth_columns(cleanData[which(cleanData$name == item), , drop = FALSE])
})

if (length(per_language) > 0) {
  final_data <- do.call(rbind, per_language)
} else {
  # Nothing to report: keep an empty table with the expected columns.
  final_data <- add_growth_columns(cleanData[0, , drop = FALSE])
}
final_data
At this point the data is massaged and tidy, with the first and second derivatives (differences) calculated for each reporting period. An easy extension would be to also look at the time delta between consecutive quarters, to ensure that each time period is weighted the same.
At this point we can simply save the data and finish project 2!
# Save the per-language growth table as CSV, without a row-name column.
write.csv(
  x = final_data,
  file = "Coding_Language_growth.csv",
  row.names = FALSE
)