Hi all,
Let’s take a quick, breezy walk through the workflow for basic Text Analysis (TextAn).
There are tasks in TextAn that we’ll keep doing repeatedly. Following the programming dictum of DRY (Don’t Repeat Yourself), I’ve decided to wrap these tasks into functions so that they can be invoked and deployed where and when required.
So, let’s start with these basic, user-defined functions first.
library(tm) # loads required package
## Loading required package: NLP
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(RWeka)
library(textir)
## Loading required package: distrom
## Loading required package: Matrix
## Loading required package: gamlr
## Loading required package: parallel
library(magrittr) # for the %>% pipe used inside basic.textan()
library(ggplot2) # for the barPlot() func defined below
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
OK. The first user-defined TextAn func is one that does basic pre-processing tasks with the tm package.
basic.textan <- function(
x, # input file of raw text
min1, # if ngram, then min length
max1 # if ngram, then max length
){
x = gsub("<.*?>", "", x) # cleanup html tags using a simple regex
x1 = Corpus(VectorSource(x)) # Constructs a source for a vector as input
x1 = tm_map(x1, stripWhitespace) # removes white space
x1 = tm_map(x1, tolower) # converts to lower case
x1 = tm_map(x1, removePunctuation) # removes punctuation marks
x1 = tm_map(x1, removeNumbers) # removes numbers in the documents
x1 = tm_map(x1, removeWords, # selectively removes words from corpus
c(stopwords('english'), "phone")) # these words. Longer lists can be read-in
x1 = tm_map(x1, stemDocument) # stems terms to their root form (via SnowballC)
x1 <- tm_map(x1, PlainTextDocument) # stores corpus as plainText again after pre-processing
t = Sys.time() # set counter for below func
ngram <- function(x1) NGramTokenizer(x1, Weka_control(min = min1, max = max1)) #invoking RWeka for ngram tokenizing
tdm0 <- TermDocumentMatrix(x1, # tdm0 is output TDM object for text corpus x1
control = list(tokenize = ngram,
tolower = TRUE,
removePunctuation = TRUE,
stripWhitespace = TRUE,
removeNumbers = TRUE,
stopwords = TRUE,
stemDocument = TRUE)
) # patience. Takes a minute, longer for big corpora.
Sys.time() - t # time taken to run the ngram tokenizer.
## remove blank documents (i.e. columns with zero sums)
a0 = which(slam::col_sums(tdm0) == 0) # index docs whose token freq sum in corpus = 0 (slam ships as a tm dependency)
length(a0) # no. of empty docs in the corpus
if (length(a0) > 0) { tdm01 = tdm0[, -a0] } else { tdm01 = tdm0 }; dim(tdm01) # under TF weighting
inspect(tdm01[1:5, 1:10]) # to view elements in tdm1, use inspect()
# convert tdms to dtms
# change dtm weighting from Tf (term freq) to TfIdf (term freq Inverse Doc freq)
test = rownames(tdm01); test1 = gsub(" ", "-", test); rownames(tdm01) = test1 # hyphenate multi-word ngram terms
dtm0 = t(tdm01) # docs are rows and terms are cols
dtm = t(weightTfIdf(tdm01, normalize = T)) # new dtm with TfIdf weighting using tm package
# rearrange terms in descending order of Tf and view
a2 = apply(dtm0, 2, sum) %>%
sort(decreasing = TRUE, index.return = TRUE)
dtm01 = dtm0[, a2$ix]; inspect(dtm01[1:10, 1:10])
dtm1 = dtm[, a2$ix]; dtm1[1:10, 1:10] # inspect() doesn't work after weightTfIdf() is applied
outp = list(dtm01, dtm1)
outp } # basic.textan() func ends
There, basic pre-processing tasks are done. Do note, however, that we already did some NLP the moment we brought ngrams into the picture.
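By the way, here’s a minimal usage sketch for the func above. The input file and the ngram range below are placeholders of my choosing, not prescriptions:
temp.text = readLines(file.choose()) # read-in a raw text file, one document per line
out = basic.textan(temp.text, min1 = 1, max1 = 2) # say, unigrams and bigrams
dtm01 = out[[1]] # TF-weighted DTM, terms sorted in decreasing freq order
dtm1 = out[[2]] # TfIdf-weighted DTM, same term ordering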
Next, how about a simple func to build wordclouds using the wordcloud package?
makewordc = function(a){ # plot wordcloud func opens. a is a DTM object
a.colsum = apply(a, 2, sum);
min1 = min(100, length(a.colsum)) # no more than 100 terms in wordcloud
words = colnames(a)[1:min1]
freq = 10 * a.colsum[1:min1]/mean(a.colsum) # keep freq the same length as words
# if (max(freq) > 100) {freq = log(100* freq/max(freq)) }
wordcloud(words, # wordcloud func begins
freq,
scale = c(8, 0.3), # can change this to adjust font scale
colors=1:20) # use a palette of 20 colors
} # func ends
Easy, peasy, eh?
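To see it in action, here’s a quick sketch using the two DTMs that basic.textan() returns:
makewordc(dtm01) # wordcloud under TF weighting
makewordc(dtm1) # wordcloud under TfIdf weighting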
Next, for sentiment analysis, we’re going to need word lists of sentiment-laden words. These have been sent to you; I picked them from Princeton’s sentiment analysis project.
pos=scan(file.choose(), what="character", comment.char=";") # read-in positive-words.txt
neg=scan(file.choose(), what="character", comment.char=";") # read-in negative-words.txt
pos.words = c(pos, "wow", "kudos", "hurray") # adding a few of our own positive words to the existing list
neg.words = c(neg) # negative word list, used as-is
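A quick sanity check on what got read in never hurts:
length(pos.words); length(neg.words) # how many terms in each list
head(pos.words); head(neg.words) # peek at the first few entries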
# positive sentiment wordcloud
makeposwordc = function(a){ # plot wordcloud func opens
pos.matches = match(colnames(a), pos.words) # match() returns the position of the matched term or NA
pos.matches = !is.na(pos.matches)
b1 = apply(a, 2, sum)[pos.matches]; b1 = as.data.frame(b1);
colnames(b1) = c("freq");
wordcloud(rownames(b1), b1[,1], scale=c(5, 1), colors=1:10) # wordcloud of positive words
} # function ends for positive wordcloud
# negative sentiment wordcloud
makenegwordc = function(a){ # plot wordcloud func opens
neg.matches = match(colnames(a), neg.words) # match() returns the position of the matched term or NA
neg.matches = !is.na(neg.matches)
b1 = apply(a, 2, sum)[neg.matches]; b1 = as.data.frame(b1);
colnames(b1) = c("freq");
wordcloud(rownames(b1), b1[,1], scale=c(5, 1), colors=1:10) # wordcloud of negative words
} # func ends
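Again, a minimal usage sketch, assuming dtm01 from basic.textan() above:
makeposwordc(dtm01) # wordcloud of positive sentiment terms in the corpus
makenegwordc(dtm01) # wordcloud of negative sentiment terms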
Boy, that was easy. Remember, we’re using Princeton’s general-purpose sentiment wordlist.
However, what are the odds that a general-purpose list will serve as well as a customized word list in a particular domain?
Hence, it’s imperative that we make, refine and update our own domain-specific (and sometimes even project-specific) word lists as we proceed.
Oftentimes, such lists can be considered proprietary information and may need to be protected under copyright etc.
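As an illustrative sketch only, folding a domain-specific positive-word file (same one-word-per-line format as above, ';' for comment lines) into the list we just built takes a couple of lines:
my.pos = scan(file.choose(), what = "character", comment.char = ";") # read-in your domain-specific positive words
pos.words = unique(c(pos.words, my.pos)) # merge with the general-purpose list and de-duplicate
Next, a small func to barplot the top tokens by frequency, using ggplot2.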
barPlot <- function(dtm, # input the dtm (not tdm)
number, # select num of words to plot (usually 20)
xlabel, ylabel,
title,
col){
a0 = apply(dtm, 2, sum)
a1 = a0[order(a0, decreasing = T)]
# alternately, a1 = apply(dtm, 2, sum) %>% sort(decreasing = T)
data = data.frame(word = names(a1), freq = a1)
colnames(data) = c("word", "freq")
data1 = data[1:number, ]
data = data1[order(data1$word), 1:2]; # head(data)
plot <- ggplot(data,
aes(x = factor(data$word, levels = data$word), y = data$freq)) +
geom_bar(stat = "identity", fill = col, colour = col) +
xlab(xlabel) +
ylab(ylabel) +
ggtitle(title)
plot <- plot +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, color="black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
return(plot)
} # func ends
# barPlot(dtm1, 25, "token", "freq", "barplot by TFIDF", "red")
Last but not least, R code to generate co-occurrence graphs.
Input is a DTM and output is a cleaned, distilled network graph (cross-edges removed) of central nodes connected to their most co-occurring peripheral nodes.
distill.cog = function(dtm1, # func opens and dtm is first input
s, # no of seed or central nodes
k1, # max no of connections per central node
n1) # restrict to the top n1 words, else the graph becomes messy
{ # func opens
dtm1 = as.matrix(dtm1) # coerce to a plain matrix (in case a tm DTM object is passed in)
cs = colSums(dtm1) # total freq of each term across docs
dtm3 = dtm1[, order(cs, decreasing = TRUE)[1:n1]] # keep only the top n1 terms
mat = as.matrix((dtm3)) # input dtm here
mat1 = t(mat) %*% mat # build 1 mode term term matrix
a = colSums(mat1) # collect colsums into a vector obj a
b = order(-a) # nice syntax for ordering vector in decr order
mat2 = mat1[b,b]
diag(mat2) = 0
## +++ go row by row and find top k adjacencies +++ ##
wc = NULL
for (i1 in 1:s){
thresh1 = mat2[i1,][order(-mat2[i1, ])[k1]]
mat2[i1, mat2[i1,] < thresh1] = 0 # wow, didn't need to use () in the subset here
mat2[i1, mat2[i1,] > 0 ] = 1
word = names(mat2[i1, mat2[i1,] > 0])
mat2[(i1+1):nrow(mat2), match(word,colnames(mat2))] = 0
wc = c(wc,word)
} # i1 loop ends
mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]
ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))] # removed any NAs from the list
mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]
graph <- graph.adjacency(mat4, mode = "undirected", weighted=T) # Create Network object
graph = simplify(graph)
V(graph)$color[1:s] = "gray" # central (seed) nodes in gray
V(graph)$color[(s+1):length(V(graph))] = "white" # peripheral nodes in white
V(graph)$frame.color[(s+1):length(V(graph))] = adjustcolor("white", alpha.f = 0.7)
plot(graph, vertex.label.cex = 1.2,
layout = layout.kamada.kawai)
} # func ends
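And a quick usage sketch to close; the parameter values below are illustrative choices of mine, not prescriptions:
distill.cog(dtm1, # the TfIdf-weighted DTM from basic.textan() above
s = 4, # 4 central nodes
k1 = 5, # up to 5 connections per central node
n1 = 25) # restrict the graph to the top 25 terms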
Those are the basic funcs we’ll need; we’ll keep invoking them as required as we go along.
Sudhir