library("knitr")
## Warning: package 'knitr' was built under R version 3.2.3
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.2.4
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 3.2.3
library(RColorBrewer)
# Load the skill-mention counts recorded for each data gathering session.
# The CSV is fetched directly from the project's GitHub repository.
freq.url <- url("https://raw.githubusercontent.com/RobertSellers/SlackProjects/master/data/frequency_results.csv")

freq <- read.csv(freq.url, stringsAsFactors = FALSE)
# Keep the three analysis columns by name rather than by position (2:4), so a
# reordered or extended CSV cannot silently select the wrong columns.
freq <- freq[, c("skill_id", "t_freq", "dates")]

# View() opens a data viewer and can fail when no display is available (for
# example when knitting or running under Rscript), so only call it in an
# interactive session.
if (interactive()) {
  View(freq)
}
kable(head(freq))
## | skill_id| t_freq|dates      |
## |--------:|------:|:----------|
## |        1|     18|2016-03-16 |
## |        2|      0|2016-03-16 |
## |        3|     30|2016-03-16 |
## |        4|      0|2016-03-16 |
## |        5|     81|2016-03-16 |
## |        6|      0|2016-03-16 |
# Preview the final six rows as well, to see how the combined table ends.
kable(tail(freq, n = 6L))
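## |    | skill_id| t_freq|dates      |
## |:---|--------:|------:|:----------|
## |487 |      118|      0|2016-03-20 |
## |488 |      119|      0|2016-03-20 |
## |489 |      120|     53|2016-03-20 |
## |490 |      121|      0|2016-03-20 |
## |491 |      122|      0|2016-03-20 |
## |492 |      123|      4|2016-03-20 |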
# Split the combined table into the four data gathering sessions. The rows
# dated 2016-03-19 are divided by position into two consecutive blocks of
# `skillsPerSession` rows (that date appears to hold two sessions).
skillsPerSession <- 123
freqMarch19 <- subset(freq, dates == "2016-03-19")

freqPart1 <- subset(freq, dates == "2016-03-16")
freqPart2 <- freqMarch19[seq_len(skillsPerSession), ]
freqPart3 <- freqMarch19[skillsPerSession + seq_len(skillsPerSession), ]
freqPart4 <- subset(freq, dates == "2016-03-20")

# Indexing past the end of a data frame pads the result with NA rows instead of
# failing, and sessions of unequal size would be recycled when their counts are
# added together later. Verify the layout the positional split relies on.
stopifnot(
  nrow(freqMarch19) == 2 * skillsPerSession,
  nrow(freqPart1) == skillsPerSession,
  nrow(freqPart4) == skillsPerSession
)

# Load the skill titles. Only the first 123 rows are kept, the same count as
# the number of skills reported for the frequency data.
skillTitle.url <- url("https://raw.githubusercontent.com/RobertSellers/SlackProjects/master/data/skills.csv") # The URL where the file listing the skill titles is located.
skillTitle <- read.csv(skillTitle.url, stringsAsFactors = FALSE)[1:123, ] # Reads the skill titles into R.

# Overwrite the first-column entry of rows 79 and 3 with short labels
# (presumably to tidy the names shown in the results -- confirm against
# skills.csv).
skillTitle[79, 1] <- "NLP"
skillTitle[3, 1] <- "MapReduce"

# View() opens a data viewer and can fail when no display is available (for
# example when knitting or running under Rscript), so only call it in an
# interactive session.
if (interactive()) {
  View(skillTitle)
}


# The totals below add the four sessions element by element, which is only
# meaningful when every session lists the same skills in the same order and
# the title table has one row per skill. Fail loudly rather than let R recycle
# or misalign the vectors.
stopifnot(
  identical(freqPart1$skill_id, freqPart2$skill_id),
  identical(freqPart1$skill_id, freqPart3$skill_id),
  identical(freqPart1$skill_id, freqPart4$skill_id),
  nrow(skillTitle) == nrow(freqPart1)
)

# Creates a dataframe containing all the skills, with the frequency counts from
# each data gathering session summed together. Titles are attached by row
# position.
freqAllDates <- data.frame(
  skill_id = freqPart1$skill_id,
  t_freq = freqPart1$t_freq + freqPart2$t_freq + freqPart3$t_freq + freqPart4$t_freq,
  skill_title = skillTitle$Skill
)

# View() opens a data viewer and can fail when no display is available (for
# example when knitting or running under Rscript), so only call it in an
# interactive session.
if (interactive()) {
  View(freqAllDates)
}
# Skills whose summed count is zero, i.e. never mentioned in any session.
# Column names are resolved inside subset(), so no `freqAllDates$` prefix is
# needed in the condition.
zeroFreqs <- subset(freqAllDates, t_freq == 0)
nrow(freqAllDates) #The total number of skills
## [1] 123
nrow(zeroFreqs) #The number of skills with zero mentions
## [1] 49
nrow(zeroFreqs)/nrow(freqAllDates) #The proportion of skills with zero mentions
## [1] 0.398374
# Keep only the skills mentioned at least once and rank them. order() on the
# negated counts sorts descending while leaving ties in their original order.
freqPositive <- subset(freqAllDates, t_freq > 0)
freqSort <- freqPositive[order(-freqPositive$t_freq), ] #Sort results by frequency, descending
# View() opens a data viewer and can fail when no display is available (for
# example when knitting or running under Rscript), so only call it in an
# interactive session.
if (interactive()) {
  View(freqSort)
}

# One bar per mentioned skill, in descending order of mentions.
barplot(freqSort$t_freq, main = "Mentions of Data Science Skills", xlab = "Skill", ylab = "# of mentions")

# Print the ranked table.
freqSort
##     skill_id t_freq             skill_title
## 88        88   1881                  pandas
## 87        87   1808               Oracle BI
## 57        57   1100              javascript
## 56        56    546                    Java
## 104      104    540               reporting
## 37        37    516                   Excel
## 108      108    398                     SAS
## 5          5    397                  apache
## 29        29    337               Curiosity
## 120      120    262            Story teller
## 106      106    240                    Ruby
## 43        43    198      Google Chart Tools
## 3          3    189               MapReduce
## 45        45    182                  Hadoop
## 79        79    105                     NLP
## 112      112    102            Social Graph
## 96        96     98                  Python
## 9          9     92 artificial intelligence
## 10        10     88                     AVT
## 99        99     85                   RDBMS
## 25        25     78           collaborative
## 86        86     78                  Oracle
## 93        93     76             Probability
## 84        84     74                   Numpy
## 83        83     71                   NoSQL
## 59        59     60                 libcurl
## 44        44     59            Graph Models
## 100      100     58                   Redis
## 74        74     55               motivated
## 1          1     49            Adaptability
## 39        39     49           Fusion Tables
## 47        47     46                    Hive
## 41        41     39                Geometry
## 75        75     36              motivation
## 123      123     36               Team work
## 65        65     35        Machine Learning
## 54        54     31              innovation
## 80        80     30           Network Graph
## 91        91     27              PostgreSQL
## 97        97     21                       R
## 60        60     18                  libsvm
## 49        49     17             Hortonworks
## 95        95     15             programming
## 77        77     14                   MySQL
## 14        14     13                BigQuery
## 34        34     12           data security
## 46        46     12                   Hbase
## 58        58     12              leadership
## 23        23     10                Cloudera
## 82        82     10         neural networks
## 20        20      9                     C++
## 33        33      9             Data Mining
## 48        48      9               Homegrown
## 85        85      9               Open Mind
## 105      105      9                Research
## 24        24      6           Collaboration
## 81        81      6          neural network
## 114      114      5                    SPSS
## 22        22      4               Cassandra
## 61        61      4          Linear Algebra
## 62        62      3                  Linusx
## 21        21      2                Calculus
## 28        28      2              creativity
## 64        64      2                Mac OS X
## 73        73      2                 MongoDB
## 4          4      1     Amazon Web Services
## 17        17      1   Business Intelligence
## 19        19      1                      C#
## 69        69      1                   Maths
## 90        90      1                     Pig
## 101      101      1                   regex
## 102      102      1              regression
## 109      109      1                   Scala
## 117      117      1                   Stata
# Draw a word cloud of the mentioned skills, passing each skill's total
# mention count as its frequency and colouring words with the 8-colour
# "Dark2" ColorBrewer palette. At most 100 words are drawn.
wordcloud(
  words = freqSort$skill_title,
  freq = freqSort$t_freq,
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)