library("knitr")
## Warning: package 'knitr' was built under R version 3.2.3
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.2.4
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 3.2.3
library(RColorBrewer)
# Read the skill-frequency results CSV from GitHub.
freq.url <- url("https://raw.githubusercontent.com/RobertSellers/SlackProjects/master/data/frequency_results.csv")
freq <- read.csv(freq.url, stringsAsFactors = FALSE, sep = ",")
# Keep columns 2-4 only; later code refers to them as skill_id, t_freq and
# dates.
freq <- freq[, 2:4]
# View() opens a GUI data viewer, so only call it in an interactive session;
# in a non-interactive run (Rscript, knitr) it may fail or do nothing useful.
if (interactive()) {
  View(freq)
}
# Preview the first rows as a formatted table.
kable(head(freq))
## | skill_id| t_freq|dates      |
## |--------:|------:|:----------|
## |        1|     18|2016-03-16 |
## |        2|      0|2016-03-16 |
## |        3|     30|2016-03-16 |
## |        4|      0|2016-03-16 |
## |        5|     81|2016-03-16 |
## |        6|      0|2016-03-16 |
# Show the last six rows of the frequency table as a formatted table.
kable(tail(freq, n = 6L))
## |    | skill_id| t_freq|dates      |
## |:---|--------:|------:|:----------|
## |487 |      118|      0|2016-03-20 |
## |488 |      119|      0|2016-03-20 |
## |489 |      120|     53|2016-03-20 |
## |490 |      121|      0|2016-03-20 |
## |491 |      122|      0|2016-03-20 |
## |492 |      123|      4|2016-03-20 |
# Split the combined table into four parts, presumably one per data-gathering
# session. The rows dated 2016-03-19 are divided by position: the first 123
# matching rows form part 2 and matching rows 124-246 form part 3.
freqPart1 <- freq[which(freq$dates == "2016-03-16"), ]
freqPart2 <- freq[which(freq$dates == "2016-03-19")[1:123], ]
freqPart3 <- freq[which(freq$dates == "2016-03-19")[124:246], ]
freqPart4 <- freq[which(freq$dates == "2016-03-20"), ]
# The URL where the file listing the skill titles is located.
skillTitle.url <- url("https://raw.githubusercontent.com/RobertSellers/SlackProjects/master/data/skills.csv")
# Read the skill titles into R, keeping the first 123 rows.
skillTitle <- read.csv(skillTitle.url, stringsAsFactors = FALSE)[1:123, ]
# Replace the first-column value of rows 79 and 3 with fixed titles.
skillTitle[79, 1] <- "NLP"
skillTitle[3, 1] <- "MapReduce"
# View() opens a GUI data viewer, so only call it in an interactive session;
# in a non-interactive run (Rscript, knitr) it may fail or do nothing useful.
if (interactive()) {
  View(skillTitle)
}
# Build a data frame with one row per skill: its id, the frequency counts from
# the four data-gathering sessions summed together, and its title.
# The totals are keyed on skill_id rather than on row position, so the result
# stays correct if the sessions list the skills in different orders, and
# sessions of different lengths can no longer be recycled silently by `+`.
freqAllDates <- local({
  sessions <- rbind(freqPart1, freqPart2, freqPart3, freqPart4)
  totals <- tapply(sessions$t_freq, sessions$skill_id, sum)
  # Output rows follow the skill order of the first session.
  ids <- freqPart1$skill_id
  # Titles are still attached by row position.
  # NOTE(review): assumes row i of skillTitle describes the i-th skill of
  # freqPart1 -- confirm against skills.csv.
  stopifnot(length(ids) == nrow(skillTitle))
  data.frame(
    skill_id = ids,
    # as.vector() drops the names so they do not become row names.
    t_freq = as.vector(totals[as.character(ids)]),
    skill_title = skillTitle$Skill
  )
})
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(freqAllDates)
}
# Skills whose summed frequency is zero.
zeroFreqs <- freqAllDates[which(freqAllDates$t_freq == 0), ]
# The total number of skills
nrow(freqAllDates)
## [1] 123
# The number of skills with zero mentions
nrow(zeroFreqs)
## [1] 49
# The proportion of skills with zero mentions
nrow(zeroFreqs) / nrow(freqAllDates)
## [1] 0.398374
# Keep only the skills that were mentioned at least once.
freqPositive <- subset(freqAllDates, t_freq > 0)
# Sort results by frequency, descending; tied skills keep their existing
# relative order.
freqSort <- freqPositive[order(-freqPositive$t_freq), ]
# View() opens a GUI data viewer, so only call it in an interactive session;
# in a non-interactive run (Rscript, knitr) it may fail or do nothing useful.
if (interactive()) {
  View(freqSort)
}
# One bar per mentioned skill, tallest first.
# NOTE(review): the bars carry no per-skill labels; consider passing
# names.arg = freqSort$skill_title if the skills should be identifiable.
barplot(freqSort$t_freq, main = "Mentions of Data Science Skills", xlab = "Skill", ylab = "# of mentions")

# Print the sorted table.
freqSort
## skill_id t_freq skill_title
## 88 88 1881 pandas
## 87 87 1808 Oracle BI
## 57 57 1100 javascript
## 56 56 546 Java
## 104 104 540 reporting
## 37 37 516 Excel
## 108 108 398 SAS
## 5 5 397 apache
## 29 29 337 Curiosity
## 120 120 262 Story teller
## 106 106 240 Ruby
## 43 43 198 Google Chart Tools
## 3 3 189 MapReduce
## 45 45 182 Hadoop
## 79 79 105 NLP
## 112 112 102 Social Graph
## 96 96 98 Python
## 9 9 92 artificial intelligence
## 10 10 88 AVT
## 99 99 85 RDBMS
## 25 25 78 collaborative
## 86 86 78 Oracle
## 93 93 76 Probability
## 84 84 74 Numpy
## 83 83 71 NoSQL
## 59 59 60 libcurl
## 44 44 59 Graph Models
## 100 100 58 Redis
## 74 74 55 motivated
## 1 1 49 Adaptability
## 39 39 49 Fusion Tables
## 47 47 46 Hive
## 41 41 39 Geometry
## 75 75 36 motivation
## 123 123 36 Team work
## 65 65 35 Machine Learning
## 54 54 31 innovation
## 80 80 30 Network Graph
## 91 91 27 PostgreSQL
## 97 97 21 R
## 60 60 18 libsvm
## 49 49 17 Hortonworks
## 95 95 15 programming
## 77 77 14 MySQL
## 14 14 13 BigQuery
## 34 34 12 data security
## 46 46 12 Hbase
## 58 58 12 leadership
## 23 23 10 Cloudera
## 82 82 10 neural networks
## 20 20 9 C++
## 33 33 9 Data Mining
## 48 48 9 Homegrown
## 85 85 9 Open Mind
## 105 105 9 Research
## 24 24 6 Collaboration
## 81 81 6 neural network
## 114 114 5 SPSS
## 22 22 4 Cassandra
## 61 61 4 Linear Algebra
## 62 62 3 Linusx
## 21 21 2 Calculus
## 28 28 2 creativity
## 64 64 2 Mac OS X
## 73 73 2 MongoDB
## 4 4 1 Amazon Web Services
## 17 17 1 Business Intelligence
## 19 19 1 C#
## 69 69 1 Maths
## 90 90 1 Pig
## 101 101 1 regex
## 102 102 1 regression
## 109 109 1 Scala
## 117 117 1 Stata
# Draw a word cloud of the mentioned skills, weighted by total mentions and
# coloured with eight colours from the "Dark2" ColorBrewer palette.
# NOTE(review): min.freq is left at wordcloud's default, which may hide the
# least-mentioned skills -- confirm this is intended.
wordcloud(
  words = freqSort$skill_title,
  freq = freqSort$t_freq,
  max.words = 100,
  colors = brewer.pal(n = 8, name = "Dark2")
)
