Libraries used.
library(data.table)
## Warning: package 'data.table' was built under R version 3.2.4
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RCurl)
## Warning: package 'RCurl' was built under R version 3.2.3
## Loading required package: bitops
library(stringr)
library(twitteR)
## Warning: package 'twitteR' was built under R version 3.2.4
##
## Attaching package: 'twitteR'
## The following objects are masked from 'package:dplyr':
##
## id, location
require(wordcloud)
## Loading required package: wordcloud
## Warning: package 'wordcloud' was built under R version 3.2.4
## Loading required package: RColorBrewer
require(tm)
## Loading required package: tm
## Warning: package 'tm' was built under R version 3.2.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.3
Twitter OAuth. Will not be evaluated. This is only required when generating new datasets. Both included, only one needed.
# Twitter OAuth setup (kept commented out; only needed when harvesting new data).
# Credentials are read from environment variables instead of being written in
# this document. Each person running the harvest sets their own values, e.g. in
# ~/.Renviron:
#   TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
#   TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_SECRET
# NOTE(review): the two sets of keys that used to be pasted here (Robert's and
# Chirag's) were identical and were published with this document; they should be
# treated as compromised and regenerated.
#consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
#consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
#access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
#access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")
#setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
Sample data mining code. The following code was used to generate the .csv files that you will find inside: https://github.com/RobertSellers/SlackProjects/tree/master/data
# Search Twitter for up to 10,000 "#datascience" tweets, convert every returned
# element with as.data.frame() and stack the pieces into a single table.
twitter_results_Feb_March_19 <- rbindlist(
  lapply(searchTwitter("#datascience", n = 10000), as.data.frame)
)
# Export step, left disabled (machine-specific path):
#write.csv(twitter_results_Feb_March_19 , file = "C:/Users/Robert/Desktop/CUNY/GitHub/R/data/twitter_results_Feb_March_19.csv")
Loading the data sources. The datasets currently date from 3/16, 3/18, and 3/19. The word cloud is run on 3/16 only; the frequency comparison below uses 3/16 and 3/18.
# Read the three archived tweet exports straight from the project's GitHub
# repository, one data frame per harvest date.
twitter_results_march_16 <- read.csv(
  file = "https://raw.githubusercontent.com/RobertSellers/SlackProjects/master/data/twitter_results_March_16.csv",
  header = TRUE,
  sep = ","
)
twitter_results_march_18 <- read.csv(
  file = "https://raw.githubusercontent.com/RobertSellers/SlackProjects/master/data/twitter_results_March_18.csv",
  header = TRUE,
  sep = ","
)
twitter_results_march_19 <- read.csv(
  file = "https://raw.githubusercontent.com/RobertSellers/SlackProjects/master/data/twitter_results_March_19.csv",
  header = TRUE,
  sep = ","
)
Word Cloud function.
To Do: We may want a second “stopWords” variable input.
To Do: The results from this word cloud (that are relevant) should be added to the skills.csv.
To Do: Find a way to export a frequency table from this data?
# Draw a word cloud of the terms found in a set of tweets.
#
# Args:
#   twitterData:    data frame (or list) whose `text` element holds the tweet
#                   text.
#   extraStopWords: optional character vector of additional words to drop on
#                   top of the standard English stop words. They are
#                   lower-cased because the corpus is lower-cased before stop
#                   words are removed. Defaults to none, so existing calls
#                   behave as before.
#
# Returns the value of wordcloud(); the function is called for its plot.
dataScienceWordCloud <- function(twitterData, extraStopWords = character(0)) {
  # as.character() turns a factor column into plain strings before the corpus
  # is built (the original used toupper() only to be lower-cased again later).
  tweets <- as.character(twitterData$text)

  # Replace every match of `pattern` with a single space.
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

  corpus <- Corpus(VectorSource(tweets))
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  # One pass is enough: "http\\S+" already covers everything the former
  # repeated pass and the "http\\w+" / "https\\w+" passes could match.
  corpus <- tm_map(corpus, toSpace, "http\\S+\\s*")
  corpus <- tm_map(corpus, toSpace, "#")
  corpus <- tm_map(corpus, toSpace, "@\\w+")
  # NOTE(review): "uselect" looks like an artifact in the harvested text -
  # confirm it is still needed.
  corpus <- tm_map(corpus, toSpace, "uselect")
  corpus <- tm_map(
    corpus,
    removeWords,
    c(stopwords("english"), tolower(extraStopWords))
  )

  wordcloud(
    corpus,
    random.order = TRUE,
    min.freq = 1,
    colors = brewer.pal(8, "Dark2"),
    scale = c(4, 0.2)
  )
}
Running the word cloud on March 16
#head(twitter_results_march_16)
#dataScienceWordCloud(twitter_results_march_16)
ToDo: This uses “skills.csv” as a lookup table, which is also applied to the other team’s URL method. We ought to add new values to that CSV based on what we find in the Twitter dataset.
# Load the skills lookup table from the project's GitHub repository.
lutSkills <- read.csv(
  file = "https://raw.githubusercontent.com/RobertSellers/SlackProjects/master/data/skills.csv",
  header = TRUE,
  sep = ","
)
# Copy Skill into a `text` column, the column dataScienceWordCloud() reads.
lutSkills[["text"]] <- lutSkills$Skill
#lutSkills$text
#dataScienceWordCloud(lutSkills)
# Count, for every skill in a lookup table, how many tweets mention it.
#
# Args:
#   twitterData: data frame (or list) whose `text` element holds the tweet text.
#   lookupTable: table with a `Skill` column of skill names.
#
# Returns:
#   lookupTable as a data frame with
#     * `Skill` replaced by the regular expression that was searched for
#       ("\\<name\\>" for plain names, as before), and
#     * a new integer `counts` column: the number of elements of
#       twitterData$text that match that expression (case-sensitive).
#
# Skill names are matched literally: regex metacharacters are escaped, so names
# such as "C++" work without the former hard-coded placeholder for row 7.
lookupFrequencies <- function(twitterData, lookupTable) {
  lookupTable <- as.data.frame(lookupTable)
  skills <- as.character(lookupTable$Skill)
  tweets <- as.character(twitterData$text)

  # Escape regex metacharacters. The backslash is handled first so that the
  # backslashes inserted for the other characters are not escaped again.
  metaChars <- c("\\", ".", "|", "(", ")", "[", "]", "{", "}",
                 "^", "$", "*", "+", "?")
  escaped <- skills
  for (ch in metaChars) {
    escaped <- gsub(ch, paste0("\\", ch), escaped, fixed = TRUE)
  }

  # A word-boundary anchor can only match next to a word character, so add
  # "\\<" / "\\>" only on the sides where the name starts / ends with one
  # (otherwise e.g. "C++" followed by "\\>" would never match).
  startsWithWordChar <- grepl("^[[:alnum:]_]", skills)
  endsWithWordChar <- grepl("[[:alnum:]_]$", skills)
  patterns <- paste0(
    ifelse(startsWithWordChar, "\\<", ""),
    escaped,
    ifelse(endsWithWordChar, "\\>", "")
  )

  lookupTable$Skill <- patterns
  lookupTable$counts <- vapply(
    patterns,
    function(p) sum(grepl(p, tweets)),
    integer(1),
    USE.NAMES = FALSE
  )
  lookupTable
}
Running the function on March 16th
To Do - Fix plot & refine the function. Sort the data and continue to update the skills lookup table to get better data.
To Do - We ultimately want to run this on each date to ensure that we have consistency between dates and then ultimately to combine all of the data.
# Skill mention counts for the 16th and the 18th.
job1 <- lookupFrequencies(twitter_results_march_16, lutSkills)
job2 <- lookupFrequencies(twitter_results_march_18, lutSkills)
#job3<-lookupFrequencies(twitter_results_march_19,lutSkills)

# Join the two days on Skill and keep only the skill and its two counts
# (counts.x comes from job1, counts.y from job2).
jobs <- merge(job1, job2, by = "Skill")
jobs <- jobs[, c("Skill", "counts.x", "counts.y")]

# Keep the skills mentioned at least 100 times on both days.
jobsz <- jobs[which(jobs$counts.x >= 100 & jobs$counts.y >= 100), ]
#head(jobsz,1)
suppressWarnings(library(plotly))
#plot_ly(x = jobsz$Skill, y = jobsz$counts.x, type = "bar", color = toRGB("black"))
suppressWarnings(library(tidyr))

# Label the count columns by day, then reshape to one row per skill and day.
names(jobsz)[2:3] <- c("On16th", "On18th")
final <- gather(jobsz, "Date", "count", 2, 3)
suppressWarnings(library(stringr))

# Replace every non-alphanumeric character in the skill labels with a space.
final$Skill <- str_replace_all(final$Skill, "[^[:alnum:]]", " ")
final
## Skill Date count
## 1 Hadoop On16th 125
## 2 Machine Learning On16th 123
## 3 machinelearning On16th 275
## 4 Python On16th 427
## 5 R On16th 482
## 6 Statistics On16th 130
## 7 Hadoop On18th 122
## 8 Machine Learning On18th 129
## 9 machinelearning On18th 243
## 10 Python On18th 447
## 11 R On18th 409
## 12 Statistics On18th 118
# Bar chart of the counts per skill, coloured by date.
plot_ly(
  x = final$Skill,
  y = final$count,
  color = final$Date,
  type = "bar"
)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels