For more details on topic modeling with R, see http://tidytextmining.com/twitter.html.
Latent Dirichlet allocation (LDA) is a particularly popular method for fitting a topic model. It treats each document as a mixture of topics, and each topic as a mixture of words. This allows documents to “overlap” each other in terms of content, rather than being separated into discrete groups, in a way that mirrors typical use of natural language.
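To make the "mixture of topics" idea concrete: once you have a fitted model (such as the ones built later in this walkthrough), topicmodels' posterior() returns the estimated per-document topic proportions and per-topic word distributions. The snippet below is only a sketch; it assumes a fitted object named fit and the document-term matrix dtm it was fit on, neither of which exists yet at this point.
# Sketch only: `fit` is a hypothetical fitted LDA/CTM object, `dtm` the matrix it was fit on
post <- posterior(fit)
round(head(post$topics, 3), 2)  # each row: one document's mixture over the k topics (rows sum to 1)
round(post$terms[, 1:5], 3)     # each row: one topic's probabilities over (the first 5) vocabulary terms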
Here we apply this to Twitter data. You need to use your own API keys, which you can obtain from Twitter when you create an app on its developer site.
library(twitteR)
## Warning: package 'twitteR' was built under R version 3.3.3
library(NLP)
library(tm)
library(RColorBrewer)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.3
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 3.3.3
library(SnowballC)
library(httr)
## Warning: package 'httr' was built under R version 3.3.3
##
## Attaching package: 'httr'
## The following object is masked from 'package:NLP':
##
## content
# api_key <- "xxxxxxxxxxxxxxxxxxxxxxxxx"
# api_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# access_token <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# access_token_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
## [1] "Using direct authentication"
# Grab latest tweets
tweet_text <- function(x) x$getText()
# Submit a search query (terms separated by "+") and get a return
# set of data (corpus).
tweet_corpus <- function(search, n = 5000, ...) {
  payload <- searchTwitter(search, n = n, ...)
  sapply(payload, tweet_text)
}
# Search for some key terms; grab as many tweets as you like, but Twitter will
# rate-limit you as it sees fit. searchTwitter() also has spatial options, e.g.
# geocode = '38.630404,-121.293535,50mi' (Sacramento).
tweets <- tweet_corpus("machine learning", n = 5000, lang = 'en')
# Save your corpus (because you're limited in how often you can do this for free!)
saveRDS(tweets, file = "C:/Users/Janpu/Documents/R_files/tweets.Rds", compress = 'xz')
# Okay, read that corpus back in from disk. I'm sure you have a
# different save location, right?
tweets <- readRDS("C:/Users/Janpu/Documents/R_files/tweets.Rds")
# Here we pre-process the data in some standard ways; each step is explained in its comment
tweets <- iconv(tweets, to = "ASCII", sub = " ") # Convert to basic ASCII text to avoid silly characters
tweets <- tolower(tweets) # Make everything consistently lower case
tweets <- gsub("rt", " ", tweets) # Remove the "RT" (retweet) so duplicates are duplicates
tweets <- gsub("@\\w+", " ", tweets) # Remove user names (all proper names if you're wise!)
tweets <- gsub("http.+ |http.+$", " ", tweets) # Remove links
tweets <- gsub("[[:punct:]]", " ", tweets) # Remove punctuation
tweets <- gsub("[ |\t]{2,}", " ", tweets) # Remove tabs
tweets <- gsub("amp", " ", tweets) # "&" is "&" in HTML, so after punctuation removed ...
tweets <- gsub("^ ", "", tweets) # Leading blanks
tweets <- gsub(" $", "", tweets) # Lagging blanks
tweets <- gsub(" +", " ", tweets) # General spaces (should just do all whitespaces no?)
tweets <- unique(tweets) # Now get rid of duplicates!
corpus <- Corpus(VectorSource(tweets)) # Create corpus object
# Remove English stop words. This list could be greatly expanded!
# mc.cores = 1 avoids parallel-processing problems with tm_map on some platforms
corpus <- tm_map(corpus, removeWords, stopwords("en"), mc.cores=1)
# Remove numbers. This could have been done earlier, of course.
corpus <- tm_map(corpus, removeNumbers, mc.cores=1)
# Stem the words so variants collapse to a common root (e.g. "learning" -> "learn")
corpus <- tm_map(corpus, stemDocument, mc.cores=1)
# Remove stems associated with your search terms. These particular stems ('energi', 'electr')
# appear to be left over from a different, energy-related search; for "machine learning" you
# might remove 'machin' and 'learn' instead.
corpus <- tm_map(corpus, removeWords, c("energi", "electr"), mc.cores=1)
pal <- brewer.pal(8, "Dark2")
wordcloud(corpus, min.freq=2, max.words = 150, random.order = TRUE, col = pal)
# Get the lengths and make sure we only create a DTM for tweets with
# some actual content
doc.lengths <- rowSums(as.matrix(DocumentTermMatrix(corpus)))
dtm <- DocumentTermMatrix(corpus[doc.lengths > 0])
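# Optional sanity check on the DTM (a sketch; the frequency threshold of 25 is arbitrary)
dim(dtm)                          # number of kept tweets x vocabulary size
findFreqTerms(dtm, lowfreq = 25)  # terms with at least 25 occurrences in total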
# model <- LDA(dtm, 10) # Go ahead and test a simple model if you want
# Now for some topics
SEED = sample(1:1000000, 1) # Pick a seed (note its value if you want to reproduce the run)
k = 10 # Let's start with 10 topics
# This might take a minute!
models <- list(
  CTM       = CTM(dtm, k = k, control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3))),
  VEM       = LDA(dtm, k = k, control = list(seed = SEED)),
  VEM_Fixed = LDA(dtm, k = k, control = list(estimate.alpha = FALSE, seed = SEED)),
  Gibbs     = LDA(dtm, k = k, method = "Gibbs",
                  control = list(seed = SEED, burnin = 1000, thin = 100, iter = 1000))
)
# There you have it. models now holds 4 fitted topic models (each with k = 10 topics).
# See the topicmodels documentation for details.
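# Optional sketch: compare the fitted models by their log-likelihoods (higher is better,
# though values are only roughly comparable across estimation methods)
sapply(models, logLik)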
# Top 10 terms of each topic for each model
# Do you see any themes you can label to these "topics" (lists of words)?
lapply(models, terms, 10)
## $CTM
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "learn" "learn" "machin" "machin"
## [2,] "machinelearn" "machin" "use" "machinelearn"
## [3,] "help" "deep" "learn" "data"
## [4,] "machin" "data" "machinelearn" "learn"
## [5,] "will" "work" "googl" "busi"
## [6,] "can" "new" "tech" "bigdata"
## [7,] "practic" "machinelearn" "intellig" "analyt"
## [8,] "market" "excel" "advanc" "python"
## [9,] "intellig" "big" "transform" "tech"
## [10,] "need" "interest" "iot" "algorithm"
## Topic 5 Topic 6 Topic 7 Topic 8
## [1,] "learn" "learn" "machinelearn" "machin"
## [2,] "use" "machin" "will" "learn"
## [3,] "machin" "machinelearn" "machin" "bigdata"
## [4,] "machinelearn" "algorithm" "bigdata" "can"
## [5,] "data" "ifici" "power" "technolog"
## [6,] "real" "data" "new" "big"
## [7,] "can" "intellig" "human" "make"
## [8,] "read" "job" "train" "way"
## [9,] "improv" "scienc" "make" "sta"
## [10,] "time" "can" "mani" "will"
## Topic 9 Topic 10
## [1,] "machin" "machin"
## [2,] "data" "learn"
## [3,] "machinelearn" "can"
## [4,] "learn" "will"
## [5,] "use" "big"
## [6,] "via" "machinelearn"
## [7,] "can" "analyt"
## [8,] "scienc" "ifici"
## [9,] "make" "googl"
## [10,] "skill" "work"
##
## $VEM
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "learn" "machin" "machin" "learn"
## [2,] "machinelearn" "machinelearn" "learn" "data"
## [3,] "machin" "learn" "intellig" "machin"
## [4,] "can" "use" "use" "big"
## [5,] "intellig" "can" "machinelearn" "new"
## [6,] "advanc" "make" "new" "analyt"
## [7,] "tech" "bigdata" "data" "drive"
## [8,] "datasci" "just" "ifici" "ifici"
## [9,] "busi" "ifici" "help" "busi"
## [10,] "ibmml" "will" "python" "algorithm"
## Topic 5 Topic 6 Topic 7 Topic 8
## [1,] "learn" "machin" "machin" "machin"
## [2,] "machinelearn" "learn" "can" "machinelearn"
## [3,] "machin" "data" "use" "data"
## [4,] "bigdata" "use" "data" "learn"
## [5,] "work" "engin" "machinelearn" "will"
## [6,] "way" "algorithm" "algorithm" "googl"
## [7,] "market" "help" "help" "analyt"
## [8,] "ifici" "machinelearn" "bigdata" "help"
## [9,] "fintech" "now" "skill" "top"
## [10,] "will" "deep" "make" "new"
## Topic 9 Topic 10
## [1,] "learn" "learn"
## [2,] "machinelearn" "machin"
## [3,] "data" "data"
## [4,] "use" "use"
## [5,] "technolog" "sta"
## [6,] "will" "will"
## [7,] "scienc" "bigdata"
## [8,] "googl" "tech"
## [9,] "algorithm" "can"
## [10,] "intellig" "scienc"
##
## $VEM_Fixed
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6
## [1,] "learn" "will" "machin" "data" "machinelearn" "learn"
## [2,] "busi" "make" "learn" "drive" "bigdata" "machin"
## [3,] "datasci" "machin" "intellig" "develop" "tech" "deep"
## [4,] "futur" "work" "ifici" "now" "market" "engin"
## [5,] "ibmml" "learn" "new" "join" "fintech" "visual"
## [6,] "better" "way" "comput" "one" "iot" "use"
## [7,] "advanc" "just" "job" "self" "world" "sheet"
## [8,] "differ" "need" "real" "practic" "ech" "cheat"
## [9,] "deeplearn" "thing" "time" "mine" "transform" "compani"
## [10,] "machin" "onlin" "look" "best" "learn" "explor"
## Topic 7 Topic 8 Topic 9 Topic 10
## [1,] "can" "machin" "learn" "learn"
## [2,] "use" "analyt" "machinelearn" "machin"
## [3,] "algorithm" "googl" "use" "python"
## [4,] "help" "top" "technolog" "sta"
## [5,] "big" "predict" "blog" "secur"
## [6,] "machin" "scienc" "like" "icl"
## [7,] "guid" "base" "appli" "enterpris"
## [8,] "improv" "model" "exp" "get"
## [9,] "healthcar" "research" "detect" "scienc"
## [10,] "skill" "introduct" "digit" "talk"
##
## $Gibbs
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## [1,] "learn" "learn" "machin" "learn" "machinelearn"
## [2,] "machin" "machin" "learn" "machin" "bigdata"
## [3,] "busi" "will" "new" "world" "tech"
## [4,] "deep" "analyt" "googl" "icl" "market"
## [5,] "futur" "interest" "job" "power" "analyt"
## [6,] "network" "build" "via" "cloud" "fintech"
## [7,] "visual" "don" "engin" "now" "ech"
## [8,] "chang" "microsoft" "better" "make" "ificialintellig"
## [9,] "sale" "see" "comput" "type" "deeplearn"
## [10,] "train" "join" "system" "check" "industri"
## Topic 6 Topic 7 Topic 8 Topic 9 Topic 10
## [1,] "use" "data" "can" "machin" "machin"
## [2,] "machin" "scienc" "help" "learn" "intellig"
## [3,] "algorithm" "python" "will" "appli" "ifici"
## [4,] "technolog" "big" "make" "today" "work"
## [5,] "datasci" "read" "get" "iot" "way"
## [6,] "drive" "develop" "googl" "healthcar" "just"
## [7,] "predict" "improv" "human" "real" "top"
## [8,] "sta" "exp" "guid" "shape" "need"
## [9,] "secur" "learn" "cybersecur" "sma" "base"
## [10,] "can" "time" "big" "deal" "detect"
# Matrix of tweet assignments to the predominant topic of each tweet,
# for each of the models, in case you want to categorize them
assignments <- sapply(models, topics)
head(assignments, n=10)
## CTM VEM VEM_Fixed Gibbs
## 1 6 6 5 8
## 2 9 7 8 3
## 3 3 8 8 5
## 4 8 3 7 9
## 5 4 3 3 10
## 6 7 7 7 4
## 7 1 5 5 5
## 8 9 7 7 8
## 9 6 8 10 4
## 10 6 10 10 8
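As a final, hedged sketch of how you might use those assignments: the rows of assignments line up with the tweets that survived the doc.lengths > 0 filter, so you can pull back the raw tweets that a given model places predominantly in a given topic. The names kept_tweets and gibbs_topic below are just illustrative helpers introduced here, not part of the original pipeline.
# Sketch: recover the raw tweets whose predominant Gibbs topic is, say, topic 1
kept_tweets <- tweets[doc.lengths > 0]  # same filter used to build the DTM, so rows align
gibbs_topic <- assignments[, "Gibbs"]
head(kept_tweets[gibbs_topic == 1], 5)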