Basic Text Analysis funcs for CBA @ ISB

Hi all,

Let’s take a quick, breezy walk through the workflow for basic text analysis.

There are tasks in text analysis that we’ll keep doing repeatedly. Following the programming dictum of DRY (Don’t Repeat Yourself), I’ve decided to wrap these tasks into functions so that they can be invoked and deployed wherever and whenever required.

So, let’s start with these basic, user-defined functions first.

library(tm)             # loads required package
## Loading required package: NLP
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(RWeka)          
library(textir)
## Loading required package: distrom
## Loading required package: Matrix
## Loading required package: gamlr
## Loading required package: parallel
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(magrittr)       # for the %>% pipe used in basic.textan() below
library(ggplot2)        # for ggplot(), used in the barPlot() func further below

OK. The first user-defined text-analysis func does the basic pre-processing tasks using the tm package.

– build basic TextAn func –

basic.textan <- function(
          x,        # input file of raw text 
          min1,     # if ngram, then min length
          max1      # if ngram, then max length
            ){
  

  x = gsub("<.*?>", "", x)  # cleanup html tags using a simple regex
  
  x1 = Corpus(VectorSource(x))      # Constructs a source for a vector as input
  
  x1 = tm_map(x1, stripWhitespace)  # removes white space
  x1 = tm_map(x1, tolower)      # converts to lower case
  x1 = tm_map(x1, removePunctuation)    # removes punctuation marks
  x1 = tm_map(x1, removeNumbers)    # removes numbers in the documents
  
  x1 = tm_map(x1, removeWords,      # selectively removes words from corpus
            c(stopwords('english'), "phone"))   # these words. Longer lists can be read-in
  
  x1 = tm_map(x1, stemDocument)
  x1 <- tm_map(x1, PlainTextDocument)   # stores corpus as plainText again after pre-processing

t = Sys.time()  # start the clock, to time the tokenizing step below
  
  ngram <- function(x1) NGramTokenizer(x1, Weka_control(min = min1, max = max1)) #invoking RWeka for ngram tokenizing   

  tdm0 <- TermDocumentMatrix(x1,    # tdm0 is the output TDM object for text corpus x1

                control = list(tokenize = ngram,
                               tolower = TRUE,
                               removePunctuation = TRUE,
                               stripWhitespace = TRUE,
                               removeNumbers = TRUE,
                               stopwords = TRUE,
                               stemming = TRUE)   # note: 'stemming' (not 'stemDocument') is the control option tm recognizes

  )     # patience. Takes a minute, longer for big corpora.

print(Sys.time() - t)  # time taken to run the ngram tokenizer (explicit print() needed inside a func)

  ## remove blank documents (i.e. columns with zero sums)
  
  a0 = NULL; 
      for (i1 in 1:ncol(tdm0)){ if (sum(tdm0[, i1]) == 0) {a0 = c(a0, i1)} }  # if a token's freq sum in corpus=0, then index it for dropping
    print(length(a0))      # no. of empty docs in the corpus (explicit print() needed inside a func)
    if (length(a0) > 0) {tdm01 = tdm0[, -a0]} else {tdm01 = tdm0};  print(dim(tdm01))  # under TF weighting
    inspect(tdm01[1:5, 1:10])       # to view elements in tdm01, use inspect()
  
  # convert tdms to dtms
  # change dtm weighting from Tf (term freq) to TfIdf (term freq Inverse Doc freq)
  test = rownames(tdm01);   test1 = gsub(" ", "-", test);   rownames(tdm01) = test1   # hyphenate n-gram tokens so each displays as a single word
    dtm0 = t(tdm01)         # docs are rows and terms are cols
    dtm = t(weightTfIdf(tdm01, normalize = T))          # new dtm with TfIdf weighting using tm package
  

  # rearrange terms in descending order of Tf and view
  a2 = apply(dtm0, 2, sum) %>% 
            sort(decreasing = TRUE, index.return = TRUE)
  
    dtm01 = dtm0[, a2$ix];  inspect(dtm01[1:10, 1:10])
    dtm1 = dtm[, a2$ix];        dtm1[1:10, 1:10]    # inspect() doesn't work after tfidf() is applied
    outp = list(dtm01, dtm1)
  
  outp  }   # basic.textan() func ends

There, basic pre-processing tasks are done. Do note, however, that we already did some NLP here the moment we brought n-grams into the picture.
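
To fix ideas, here’s how one might invoke it (a sketch only, not run here; the file name and the min/max n-gram lengths are placeholders you’d replace with your own):

# speech = readLines(file.choose())                # pick any raw .txt file; one document per line
# out = basic.textan(speech, min1 = 1, max1 = 2)   # unigrams and bigrams
# dtm01 = out[[1]]    # TF-weighted DTM, terms sorted in decreasing frequency order
# dtm1  = out[[2]]    # TfIdf-weighted DTM, same term ordering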

Next, how about a simple func to build wordclouds using the wordcloud package?

– func to make wordcloud and view some freqs for docs —

makewordc = function(a){    # plot wordcloud func opens. a is a DTM object
  
  a.colsum = apply(a, 2, sum);
  
  min1 = min(100, length(a.colsum)) # no more than 100 terms in wordcloud
  
  words = colnames(a)[1:min1]   # top min1 terms (the input DTM is already sorted by term freq)
  
  freq = 10 * a.colsum[1:min1]/mean(a.colsum)   # scaled freqs for just those terms, so lengths match
  
  # if (max(freq) > 100) {freq = log(100* freq/max(freq)) } 
  
  wordcloud(words,  # wordcloud func begins
        freq,           
        scale = c(8, 0.3),  # can change this to adjust font scale
        colors=1:20)        # cycle through a palette of 20 colors

    } # func ends
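
For instance, assuming dtm1 holds the TfIdf-weighted DTM returned by basic.textan() above, the call is simply:

# makewordc(dtm1)   # wordcloud of (up to) the top 100 terms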

Easy, peasy, eh?

Next, for sentiment analysis we’re going to need word lists of sentiment-laden words. These have been sent to you; I picked them from Princeton’s sentiment analysis project.

— Elementary sentiment analysis —

pos=scan(file.choose(), what="character", comment.char=";") # read-in positive-words.txt

neg=scan(file.choose(), what="character", comment.char=";")     # read-in negative-words.txt

pos.words=c(pos,"wow", "kudos", "hurray")           # including our own positive words to the existing list

neg.words = c(neg)
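
Just to show how these lists get used, here’s a minimal, illustrative sketch (assuming dtm01 is the TF-weighted DTM from basic.textan(); it’s crude, since our corpus terms are stemmed n-grams while these word lists are plain unigrams):

# m = as.matrix(dtm01)                                  # ordinary docs x terms matrix
# pos.matches = !is.na(match(colnames(m), pos.words))   # which terms appear in the positive list?
# neg.matches = !is.na(match(colnames(m), neg.words))   # which terms appear in the negative list?
# pos.score = rowSums(m[, pos.matches, drop = FALSE])   # positive word count per document
# neg.score = rowSums(m[, neg.matches, drop = FALSE])   # negative word count per document
# sentiment = pos.score - neg.score                     # crude net sentiment per document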

# positive sentiment wordcloud

makeposwordc = function(a){ # plot wordcloud func opens
  
  pos.matches = match(colnames(a), pos.words)       # match() returns the position of the matched term or NA
  
  pos.matches = !is.na(pos.matches)
  
  b1 = apply(a, 2, sum)[pos.matches];    b1 = as.data.frame(b1);
  
  colnames(b1) = c("freq");
  
  wordcloud(rownames(b1), b1[,1], scale=c(5, 1), colors=1:10)   # wordcloud of positive words
  
    }   # function ends for positive wordcloud


# negative sentiment wordcloud

makenegwordc = function(a){ # plot wordcloud func opens
  
  neg.matches = match(colnames(a), neg.words)       # match() returns the position of the matched term or NA
  
  neg.matches = !is.na(neg.matches)
  
  b1 = apply(a, 2, sum)[neg.matches];    b1 = as.data.frame(b1);
  
  colnames(b1) = c("freq");
  
  wordcloud(rownames(b1), b1[,1], scale=c(5, 1), colors=1:10)   # wordcloud of negative words
  
}   # func ends
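
Sample invocations (again assuming dtm01 is the TF-weighted DTM from basic.textan()):

# makeposwordc(dtm01)   # wordcloud of positive-sentiment terms in the corpus
# makenegwordc(dtm01)   # wordcloud of negative-sentiment terms in the corpus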

Boy, that was easy. Remember, we’re using Princeton’s general-purpose sentiment wordlist.

However, what are the odds that a general-purpose list will serve as well as a customized word list in a particular domain?

Hence, it’s imperative that we make, refine, and update our own domain-specific (and sometimes even project-specific) word lists as we proceed.

Oftentimes, such lists can be considered proprietary information and may need to be protected under copyright etc.
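
As a small, purely illustrative example of such customization (the words below are hypothetical; pick terms that matter in your own domain):

# pos.words = c(pos.words, "ergonomic", "snappy")   # e.g., for a phone-review corpus
# neg.words = c(neg.words, "laggy", "flimsy")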

— Function to build barplots of text tokens —

barPlot <- function(dtm,            # input the dtm (not tdm)
                    number,         # select num of words to plot (usually 20)
                    xlabel, ylabel, 
                    title, 
                    col){
  
  a0 = apply(dtm, 2, sum)
  a1 = a0[order(a0, decreasing = T)]
  # alternately: a1 = apply(dtm, 2, sum) %>% sort(decreasing = TRUE)

  
  data = data.frame(word = names(a1), freq = a1)
  colnames(data) = c("word", "freq")
  data1 = data[1:number, ]
  data = data1[order(data1$word), 1:2]; # head(data)
  
  plot <- ggplot(data, 
                 
                 aes(x = factor(word, levels = word), y = freq)) + 
    
    geom_bar(stat = "identity", fill = col, colour = col) + 
    
    xlab(xlabel) + 
    ylab(ylabel) +
    ggtitle(title) 
  
  
  plot <- plot + 
    theme(axis.text.x  = element_text(angle = 45, vjust = 0.5, color="black"), 
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank())
  
  return(plot)
  
} # func ends

# barPlot(dtm1, 25, "token", "freq", "barplot by TFIDF", "red")

Last but not least, R code to generate co-occurrence graphs (COGs).

Input is a DTM and output is a cleaned, distilled (cross-edges removed) network graph of central nodes connected to their most co-occurring peripheral nodes.

— Func to build cleaned COGs —

  distill.cog = function(dtm1,  # func opens and dtm is first input
            s,  # no of seed or central nodes
            k1,     # max no of connections per central node
            n1)   # restrict to the top n1 words, else graph becomes messy
                {  # func opens

    dtm1 = as.matrix(dtm1)  # coerce to an ordinary matrix (tm's sparse DTM class won't work with colSums() below)

    dtm2 <- rbind.data.frame(dtm1,as.data.frame(t(colSums(dtm1))))

    dtm3 <- dtm2[1:(nrow(dtm2)-1), order(dtm2[nrow(dtm2),],decreasing = TRUE)[1:n1]]

    mat = as.matrix(dtm3)    # doc x term matrix for the top n1 terms
    mat1 = t(mat) %*% mat    # build 1 mode term term matrix

    a = colSums(mat1)  # collect colsums into a vector obj a
    b = order(-a)     # nice syntax for ordering vector in decr order 

    mat2 = mat1[b,b]  
    diag(mat2) =  0

    ## +++ go row by row and find top k adjacencies +++ ##

    wc = NULL

    for (i1 in 1:s){

      thresh1 = mat2[i1,][order(-mat2[i1, ])[k1]]

      mat2[i1, mat2[i1,] < thresh1] = 0   # zero out adjacencies below the top-k1 threshold

      mat2[i1, mat2[i1,] > 0 ] = 1

      word = names(mat2[i1, mat2[i1,] > 0])

      mat2[(i1+1):nrow(mat2), match(word,colnames(mat2))] = 0

      wc = c(wc,word)

        } # i1 loop ends

    mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]

    ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))]  # removed any NAs from the list

    mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]

    graph <- graph.adjacency(mat4, mode = "undirected", weighted=T)    # Create Network object

    graph = simplify(graph) 

    V(graph)$color[1:s] = "gray"    # the s central (seed) nodes in gray

    V(graph)$color[(s+1):length(V(graph))] = "white"    # peripheral nodes in white

    V(graph)$frame.color[(s+1):length(V(graph))] = adjustcolor("white", alpha.f = 0.7)

    plot(graph,  vertex.label.cex = 1.2,
                 layout = layout.kamada.kawai) 

  } # func ends
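
A sample call (the parameter values are placeholders; tune them to taste):

# distill.cog(dtm01,    # TF-weighted DTM from basic.textan()
#             s = 4,    # 4 central (seed) nodes
#             k1 = 5,   # up to 5 connections per central node
#             n1 = 50)  # restrict attention to the top 50 terms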

Those are the basic funcs we’ll need; we’ll keep invoking them as required going forward.

Sudhir