Topic modeling made just simple enough with LDA

For more details on topic modeling in R, see http://tidytextmining.com/twitter.html.

Latent Dirichlet allocation (LDA) is a particularly popular method for fitting a topic model. It treats each document as a mixture of topics, and each topic as a mixture of words. This allows documents to “overlap” each other in terms of content, rather than being separated into discrete groups, in a way that mirrors typical use of natural language.
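
To make the two mixtures concrete, here is a minimal sketch using the AssociatedPress document-term matrix that ships with topicmodels (the 20-document subset, k = 2 and the seed are arbitrary choices for illustration):

library(topicmodels)
data("AssociatedPress", package = "topicmodels")

# Fit a tiny LDA model on the first 20 AP articles
ap_lda <- LDA(AssociatedPress[1:20, ], k = 2, control = list(seed = 123))

# Each document is a mixture of topics: one row per document, rows sum to 1
round(posterior(ap_lda)$topics, 2)

# Each topic is a mixture of words: the most probable words per topic
terms(ap_lda, 5)

These two pieces, per-document topic proportions and per-topic word probabilities, are exactly what we will pull out of the Twitter models below.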

Step 1: Extract Text from Twitter

Here we pull the text from Twitter. You need to use your own API keys, which you can obtain from Twitter by creating an app on its developer site.

library(twitteR)
## Warning: package 'twitteR' was built under R version 3.3.3
library(NLP)
library(tm)
library(RColorBrewer)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.3
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 3.3.3
library(SnowballC)
library(httr)
## Warning: package 'httr' was built under R version 3.3.3
## 
## Attaching package: 'httr'
## The following object is masked from 'package:NLP':
## 
##     content
# api_key <- "xxxxxxxxxxxxxxxxxxxxxxxxx"
# api_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# access_token <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# access_token_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
## [1] "Using direct authentication"
# Grab latest tweets
tweet_text <- function(x) x$getText()
# Submit a search query (terms separated by "+") and get a return
# set of data (corpus).
tweet_corpus <- function(search, n = 5000, ...) {
  payload <- searchTwitter(search, n = n, ...)
  sapply(payload, tweet_text)
}

# Search for some key terms; grab as many tweets as you like, but Twitter
# will rate-limit you as it sees fit. searchTwitter() also has spatial options
# via its geocode argument. Try these Sacramento coordinates: '38.630404,-121.293535,50mi'

tweets <- tweet_corpus("machine learning", n = 5000, lang = 'en')


# Save your corpus (because you're limited in how often you can do this for free!)
saveRDS(tweets, file = "C:/Users/Janpu/Documents/R_files/tweets.Rds", compress = 'xz')

# Okay, read that corpus back in from disk. I'm sure you have a
# different save location, right?
tweets <- readRDS("C:/Users/Janpu/Documents/R_files/tweets.Rds")

Step 2: Clean the Text

# Here we pre-process the data in some standard ways; each step is explained in its trailing comment
tweets <- iconv(tweets, to = "ASCII", sub = " ")  # Convert to basic ASCII text to avoid silly characters
tweets <- tolower(tweets)  # Make everything consistently lower case
tweets <- gsub("rt", " ", tweets)  # Remove the "RT" (retweet) so duplicates are duplicates
tweets <- gsub("@\\w+", " ", tweets)  # Remove user names (all proper names if you're wise!)
tweets <- gsub("http.+ |http.+$", " ", tweets)  # Remove links
tweets <- gsub("[[:punct:]]", " ", tweets)  # Remove punctuation
tweets <- gsub("[ |\t]{2,}", " ", tweets)  # Remove tabs
tweets <- gsub("amp", " ", tweets)  # "&" is "&amp" in HTML, so after punctuation removed ...
tweets <- gsub("^ ", "", tweets)  # Leading blanks
tweets <- gsub(" $", "", tweets)  # Lagging blanks
tweets <- gsub(" +", " ", tweets) # General spaces (should just do all whitespaces no?)
tweets <- unique(tweets)  # Now get rid of duplicates!
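
To see what the whole chain of substitutions does end to end, here is a minimal sketch run on a single made-up tweet (the handle, link and text are invented for illustration):

toy <- "RT @someone: Machine learning &amp; AI rocks!! https://t.co/abc #ML"
toy <- iconv(toy, to = "ASCII", sub = " ")
toy <- tolower(toy)
toy <- gsub("rt", " ", toy)
toy <- gsub("@\\w+", " ", toy)
toy <- gsub("http.+ |http.+$", " ", toy)
toy <- gsub("[[:punct:]]", " ", toy)
toy <- gsub("[ |\t]{2,}", " ", toy)
toy <- gsub("amp", " ", toy)
toy <- gsub("^ ", "", toy)
toy <- gsub(" $", "", toy)
toy <- gsub(" +", " ", toy)
toy  # should come out roughly as "machine learning ai rocks ml"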

corpus <- Corpus(VectorSource(tweets))  # Create corpus object

# Remove English stop words. This list could be greatly expanded!
# (mc.cores = 1 keeps tm_map single-threaded; it's a common workaround for
# parallel-backend errors with tm on some platforms.)
corpus <- tm_map(corpus, removeWords, stopwords("en"), mc.cores=1)  

# Remove numbers. This could have been done earlier, of course.
corpus <- tm_map(corpus, removeNumbers, mc.cores=1)

# Stem the words, i.e. reduce them to a common root ("learning", "learns" -> "learn")
corpus <- tm_map(corpus, stemDocument, mc.cores=1)

# Remove the stems associated with our search terms. (Caveat: "energi" and
# "electr" look like leftovers from a different search; for the "machine
# learning" query you would remove stems such as "machin", "learn" and
# "machinelearn" instead -- this run did not, which is why they dominate the topics below.)
corpus <- tm_map(corpus, removeWords, c("energi", "electr"), mc.cores=1)
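
It is worth spot-checking a few cleaned, stemmed documents before modeling; this sketch just prints the first three:

inspect(corpus[1:3])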

Step 3: Visualization

pal <- brewer.pal(8, "Dark2")
wordcloud(corpus, min.freq=2, max.words = 150, random.order = TRUE, col = pal)
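
A plain frequency table is a useful complement to the word cloud. Here is a sketch using a term-document matrix built on the same corpus (the cutoff of 20 terms is arbitrary):

tdm <- TermDocumentMatrix(corpus)
freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
head(freq, 20)                    # the 20 most frequent stems
barplot(head(freq, 20), las = 2)  # same counts as a quick bar chart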

Step 4: Topic Modeling on Corpus

# Get the lengths and make sure we only create a DTM for tweets with
# some actual content
doc.lengths <- rowSums(as.matrix(DocumentTermMatrix(corpus)))
dtm <- DocumentTermMatrix(corpus[doc.lengths > 0])
# model <- LDA(dtm, 10)  # Go ahead and test a simple model if you want
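
Before fitting anything, it's also worth glancing at the DTM's shape and at which stems occur often (a sketch; the lowfreq cutoff of 50 is arbitrary):

dim(dtm)                          # number of tweets x number of terms
findFreqTerms(dtm, lowfreq = 50)  # stems with a total frequency of at least 50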



# Now for some topics
SEED <- sample(1:1000000, 1)  # Draw a seed at random (record or print it if you want to reproduce a run)
k <- 10  # Let's start with 10 topics

# Fit the same k four ways: a correlated topic model (CTM), LDA by variational EM
# (with alpha estimated and with alpha fixed), and LDA by Gibbs sampling.
# This might take a minute!
models <- list(
  CTM       = CTM(dtm, k = k, control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3))),
  VEM       = LDA(dtm, k = k, control = list(seed = SEED)),
  VEM_Fixed = LDA(dtm, k = k, control = list(estimate.alpha = FALSE, seed = SEED)),
  Gibbs     = LDA(dtm, k = k, method = "Gibbs", control = list(seed = SEED, burnin = 1000,
                                                               thin = 100,    iter = 1000))
)
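
One rough way to compare the four fits is their log-likelihoods on the training data (a sketch; topicmodels also provides perplexity() if you want to score held-out documents):

sapply(models, logLik)  # higher (less negative) = better fit to the training tweets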



# There you have it. models now holds 4 fitted models (each with k = 10 topics). See the topicmodels API documentation for details

# Top 10 terms of each topic for each model
# Do you see any themes you can label to these "topics" (lists of words)?
lapply(models, terms, 10)
## $CTM
##       Topic 1        Topic 2        Topic 3        Topic 4       
##  [1,] "learn"        "learn"        "machin"       "machin"      
##  [2,] "machinelearn" "machin"       "use"          "machinelearn"
##  [3,] "help"         "deep"         "learn"        "data"        
##  [4,] "machin"       "data"         "machinelearn" "learn"       
##  [5,] "will"         "work"         "googl"        "busi"        
##  [6,] "can"          "new"          "tech"         "bigdata"     
##  [7,] "practic"      "machinelearn" "intellig"     "analyt"      
##  [8,] "market"       "excel"        "advanc"       "python"      
##  [9,] "intellig"     "big"          "transform"    "tech"        
## [10,] "need"         "interest"     "iot"          "algorithm"   
##       Topic 5        Topic 6        Topic 7        Topic 8    
##  [1,] "learn"        "learn"        "machinelearn" "machin"   
##  [2,] "use"          "machin"       "will"         "learn"    
##  [3,] "machin"       "machinelearn" "machin"       "bigdata"  
##  [4,] "machinelearn" "algorithm"    "bigdata"      "can"      
##  [5,] "data"         "ifici"        "power"        "technolog"
##  [6,] "real"         "data"         "new"          "big"      
##  [7,] "can"          "intellig"     "human"        "make"     
##  [8,] "read"         "job"          "train"        "way"      
##  [9,] "improv"       "scienc"       "make"         "sta"      
## [10,] "time"         "can"          "mani"         "will"     
##       Topic 9        Topic 10      
##  [1,] "machin"       "machin"      
##  [2,] "data"         "learn"       
##  [3,] "machinelearn" "can"         
##  [4,] "learn"        "will"        
##  [5,] "use"          "big"         
##  [6,] "via"          "machinelearn"
##  [7,] "can"          "analyt"      
##  [8,] "scienc"       "ifici"       
##  [9,] "make"         "googl"       
## [10,] "skill"        "work"        
## 
## $VEM
##       Topic 1        Topic 2        Topic 3        Topic 4    
##  [1,] "learn"        "machin"       "machin"       "learn"    
##  [2,] "machinelearn" "machinelearn" "learn"        "data"     
##  [3,] "machin"       "learn"        "intellig"     "machin"   
##  [4,] "can"          "use"          "use"          "big"      
##  [5,] "intellig"     "can"          "machinelearn" "new"      
##  [6,] "advanc"       "make"         "new"          "analyt"   
##  [7,] "tech"         "bigdata"      "data"         "drive"    
##  [8,] "datasci"      "just"         "ifici"        "ifici"    
##  [9,] "busi"         "ifici"        "help"         "busi"     
## [10,] "ibmml"        "will"         "python"       "algorithm"
##       Topic 5        Topic 6        Topic 7        Topic 8       
##  [1,] "learn"        "machin"       "machin"       "machin"      
##  [2,] "machinelearn" "learn"        "can"          "machinelearn"
##  [3,] "machin"       "data"         "use"          "data"        
##  [4,] "bigdata"      "use"          "data"         "learn"       
##  [5,] "work"         "engin"        "machinelearn" "will"        
##  [6,] "way"          "algorithm"    "algorithm"    "googl"       
##  [7,] "market"       "help"         "help"         "analyt"      
##  [8,] "ifici"        "machinelearn" "bigdata"      "help"        
##  [9,] "fintech"      "now"          "skill"        "top"         
## [10,] "will"         "deep"         "make"         "new"         
##       Topic 9        Topic 10 
##  [1,] "learn"        "learn"  
##  [2,] "machinelearn" "machin" 
##  [3,] "data"         "data"   
##  [4,] "use"          "use"    
##  [5,] "technolog"    "sta"    
##  [6,] "will"         "will"   
##  [7,] "scienc"       "bigdata"
##  [8,] "googl"        "tech"   
##  [9,] "algorithm"    "can"    
## [10,] "intellig"     "scienc" 
## 
## $VEM_Fixed
##       Topic 1     Topic 2  Topic 3    Topic 4   Topic 5        Topic 6  
##  [1,] "learn"     "will"   "machin"   "data"    "machinelearn" "learn"  
##  [2,] "busi"      "make"   "learn"    "drive"   "bigdata"      "machin" 
##  [3,] "datasci"   "machin" "intellig" "develop" "tech"         "deep"   
##  [4,] "futur"     "work"   "ifici"    "now"     "market"       "engin"  
##  [5,] "ibmml"     "learn"  "new"      "join"    "fintech"      "visual" 
##  [6,] "better"    "way"    "comput"   "one"     "iot"          "use"    
##  [7,] "advanc"    "just"   "job"      "self"    "world"        "sheet"  
##  [8,] "differ"    "need"   "real"     "practic" "ech"          "cheat"  
##  [9,] "deeplearn" "thing"  "time"     "mine"    "transform"    "compani"
## [10,] "machin"    "onlin"  "look"     "best"    "learn"        "explor" 
##       Topic 7     Topic 8     Topic 9        Topic 10   
##  [1,] "can"       "machin"    "learn"        "learn"    
##  [2,] "use"       "analyt"    "machinelearn" "machin"   
##  [3,] "algorithm" "googl"     "use"          "python"   
##  [4,] "help"      "top"       "technolog"    "sta"      
##  [5,] "big"       "predict"   "blog"         "secur"    
##  [6,] "machin"    "scienc"    "like"         "icl"      
##  [7,] "guid"      "base"      "appli"        "enterpris"
##  [8,] "improv"    "model"     "exp"          "get"      
##  [9,] "healthcar" "research"  "detect"       "scienc"   
## [10,] "skill"     "introduct" "digit"        "talk"     
## 
## $Gibbs
##       Topic 1   Topic 2     Topic 3  Topic 4  Topic 5          
##  [1,] "learn"   "learn"     "machin" "learn"  "machinelearn"   
##  [2,] "machin"  "machin"    "learn"  "machin" "bigdata"        
##  [3,] "busi"    "will"      "new"    "world"  "tech"           
##  [4,] "deep"    "analyt"    "googl"  "icl"    "market"         
##  [5,] "futur"   "interest"  "job"    "power"  "analyt"         
##  [6,] "network" "build"     "via"    "cloud"  "fintech"        
##  [7,] "visual"  "don"       "engin"  "now"    "ech"            
##  [8,] "chang"   "microsoft" "better" "make"   "ificialintellig"
##  [9,] "sale"    "see"       "comput" "type"   "deeplearn"      
## [10,] "train"   "join"      "system" "check"  "industri"       
##       Topic 6     Topic 7   Topic 8      Topic 9     Topic 10  
##  [1,] "use"       "data"    "can"        "machin"    "machin"  
##  [2,] "machin"    "scienc"  "help"       "learn"     "intellig"
##  [3,] "algorithm" "python"  "will"       "appli"     "ifici"   
##  [4,] "technolog" "big"     "make"       "today"     "work"    
##  [5,] "datasci"   "read"    "get"        "iot"       "way"     
##  [6,] "drive"     "develop" "googl"      "healthcar" "just"    
##  [7,] "predict"   "improv"  "human"      "real"      "top"     
##  [8,] "sta"       "exp"     "guid"       "shape"     "need"    
##  [9,] "secur"     "learn"   "cybersecur" "sma"       "base"    
## [10,] "can"       "time"    "big"        "deal"      "detect"
# Matrix of each tweet's predominant topic under each of the models,
# in case you want to categorize the tweets
assignments <- sapply(models, topics) 

head(assignments, n=10)
##    CTM VEM VEM_Fixed Gibbs
## 1    6   6         5     8
## 2    9   7         8     3
## 3    3   8         8     5
## 4    8   3         7     9
## 5    4   3         3    10
## 6    7   7         7     4
## 7    1   5         5     5
## 8    9   7         7     8
## 9    6   8        10     4
## 10   6  10        10     8
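
From here you could, for example, count how many tweets fall in each Gibbs topic and pull a few tweets whose predominant topic is topic 1. This sketch assumes the rows of dtm still line up with tweets[doc.lengths > 0] (they should, since unique() and the subsetting preserve order):

table(assignments[, "Gibbs"])               # tweets per Gibbs topic
kept <- tweets[doc.lengths > 0]             # the tweets that made it into the DTM
head(kept[assignments[, "Gibbs"] == 1], 3)  # a few tweets assigned to topic 1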