Analysing the extracted data:

Let’s load the libraries and define the helper functions.

rm(list=ls())                   # Clear workspace

#--------------------------------------------------------#
# Step 0 - Assign Library & define functions             #
#--------------------------------------------------------#

library(text2vec)
library(data.table)
library(stringr)
library(tm)
library(RWeka)
library(tokenizers)
library(slam)
library(wordcloud)
library(igraph)
library(maptpx)

text.clean() is the function defined to clean the raw text passed in through the argument x. A regex first removes HTML tags, non-ASCII characters are dropped, everything is converted to lower case, numbers are removed, and extra white space, including leading and trailing spaces, is stripped.

text.clean = function(x)                    # text data
{ require("tm")
  x  =  gsub("<.*?>", " ", x)               # regex for removing HTML tags
  x  =  iconv(x, "latin1", "ASCII", sub="") # Keep only ASCII characters
  x  =  gsub("[^[:alnum:]]", " ", x)        # keep only alpha numeric 
  x  =  tolower(x)                          # convert to lower case characters
  x  =  removeNumbers(x)                    # removing numbers
  x  =  stripWhitespace(x)                  # removing white space
  x  =  gsub("^\\s+|\\s+$", "", x)          # remove leading and trailing white space
  return(x)
}
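As a quick check, here is what text.clean() does to a made-up snippet (the input string is purely illustrative):

text.clean("<p>GST rates were revised in 2017!</p>")
# [1] "gst rates were revised in"

Note that stopwords like "were" and "in" survive; they are removed in a separate step below.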

The function distill.cog() draws the co-occurrence graphs (COGs). It takes four inputs: the TCM-based adjacency matrix (mat1), a title for the graph (title), the number of central nodes (s), and the maximum number of connections per central node (k1). The Kamada-Kawai layout is used to draw the COGs.

distill.cog = function(mat1, # input TCM ADJ MAT
                       title, # title for the graph
                       s,    # no. of central nodes
                       k1){  # max no. of connections  
  
  a = colSums(mat1) # collect colSums into a vector obj a
  b = order(-a)     # order the vector in decreasing order
  
  mat2 = mat1[b,b]  # reorder rows & columns by decreasing column sums
  
  diag(mat2) =  0   # a node is not its own neighbour
  
  ## +++ go row by row and find top k adjacencies +++ ##
  
  wc = NULL
  
  for (i1 in 1:s){ 
    thresh1 = mat2[i1,][order(-mat2[i1, ])[k1]]    # k1-th largest co-occurrence in row i1
    mat2[i1, mat2[i1,] < thresh1] = 0              # zero out links below that threshold
    mat2[i1, mat2[i1,] > 0 ] = 1                   # binarise the links that remain
    word = names(mat2[i1, mat2[i1,] > 0])
    mat2[(i1+1):nrow(mat2), match(word,colnames(mat2))] = 0   # don't re-count these words in later rows
    wc = c(wc,word)
  } # i1 loop ends
  
  
  mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]
  ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))]  # removed any NAs from the list
  mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]
  graph <- graph.adjacency(mat4, mode = "undirected", weighted=T)    # Create Network object
  graph = simplify(graph) 
  V(graph)$color[1:s] = "green"
  V(graph)$color[(s+1):length(V(graph))] = "pink"
  
  graph = delete.vertices(graph, V(graph)[ degree(graph) == 0 ])
  
  plot(graph, 
       layout = layout.kamada.kawai, 
       main = title)
  
} # func ends

Let’s read in the extracted text to perform the analysis. The Google search results for the term ‘gst’ were saved into a CSV file.

#--------------------------------------------------------#
# Step 1 - Reading text data                             #
#--------------------------------------------------------#

search_terms = c('gst')

file.der = read.csv("C:\\Users\\Latitude Owner\\Desktop\\pract\\gst_google_search.csv")

Rows whose text field is missing or empty are dropped, and the remaining documents are collected into a data frame with an id column.

file.der = file.der[!is.na(file.der$text) & file.der$text != '',]   # drop rows with missing or empty text

n = nrow(file.der)

data = data.frame(id = 1:n, 
                  text = file.der$text[1:n],
                  stringsAsFactors = FALSE)

The consolidated text is now in the data frame data. Its dimensions are displayed below:

dim(data)

Let’s read in the stopwords for the analysis. Stopwords are high-frequency function words such as pronouns, auxiliary verbs, conjunctions and articles that hold a sentence together but carry little topical meaning.

# Read Stopwords list
stpw1 = readLines(file.choose())        # custom stopwords list, e.g. from GitHub
stpw2 = tm::stopwords('english')        # tm package stop word list; the tokenizers package has a function of the same name
comn  = unique(c(stpw1, stpw2))         # union of the two lists
stopwords = unique(gsub("'"," ",comn))  # final stop word list after removing apostrophes
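Optionally, inspect the combined list before using it:

length(stopwords)    # size of the combined stop word list
head(stopwords, 10)  # peek at the first few entries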

Now let’s clean the text by invoking text.clean(), remove the stopwords collected above, and strip the remaining extra white space.

x  = text.clean(data$text)             # pre-process text corpus
x  =  removeWords(x,stopwords)         # removing stopwords created above
x  =  stripWhitespace(x)               # removing white space
# x  =  stemDocument(x)

Let’s create the Document Term Matrix (DTM) from the cleaned text using the text2vec package. The text is tokenised and a bigram vocabulary is built; only terms that occur at least ten times are kept when creating the DTM. The retained terms are then sorted in decreasing order of frequency.

#--------------------------------------------------------#
####### Create DTM using text2vec package                #
#--------------------------------------------------------#

t1 = Sys.time()

tok_fun = word_tokenizer

it_0 = itoken( x,
               #preprocessor = text.clean,
               tokenizer = tok_fun,
               ids = data$id,
               # progressbar = T
               )

vocab = create_vocabulary(it_0,
                          ngram = c(2L, 2L)   # bigram vocabulary; top bigrams get recoded as unigrams below
)

pruned_vocab = prune_vocabulary(vocab,
                                term_count_min = 10)
# doc_proportion_max = 0.5,
# doc_proportion_min = 0.001)

vectorizer = vocab_vectorizer(pruned_vocab)

dtm_0  = create_dtm(it_0, vectorizer)

# Sort bi-gram with decreasing order of freq
tsum = as.matrix(t(rollup(dtm_0, 1, na.rm=TRUE, FUN = sum))) # find sum of freq for each term
tsum = tsum[order(tsum, decreasing = T),]       #terms in decreasing order of freq
head(tsum)
tail(tsum)

From the frequency-sorted list, the top 1,000 bigrams are recoded as unigrams in the text: wherever the two words of a bigram occur together, they are joined with an underscore so the tokenizer treats the phrase as a single term. The corpus is then converted into a Document Term Matrix.

# select Top 1000 bigrams to unigram
if (length(tsum) > 1000) {n = 1000} else {n = length(tsum)}
tsum = tsum[1:n]

#-------------------------------------------------------
# Code bi-grams as unigram in clean text corpus

text2 = x
text2 = paste("", text2, "")   # pad with spaces so whole-word matches work at string boundaries

pb <- txtProgressBar(min = 1, max = (length(tsum)), style = 3) ; i = 0

for (term in names(tsum)){
  i = i + 1
  focal.term = gsub("_", " ", term)       # underscore back to space, i.e. the original bigram
  replacement.term = term
  text2 = gsub(paste("",focal.term,""), paste("",replacement.term,""), text2)
  setTxtProgressBar(pb, i)                # update progress bar
}
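To see what the loop does, suppose 'gst_rate' survived as a top bigram (a hypothetical example; the actual bigrams depend on the corpus):

# hypothetical illustration of the recoding above
gsub(" gst rate ", " gst_rate ", " the gst rate was revised ")
# [1] " the gst_rate was revised "

Every occurrence of the phrase is now a single token, so the unigram DTM built below counts it as one term.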


it_m = itoken(text2,
              # preprocessor = text.clean,
              tokenizer = tok_fun,
              ids = data$id,
              # progressbar = T
              )

vocab = create_vocabulary(it_m
                          #ngram = c(2L, 2L),
                          #stopwords = stopwords
)

pruned_vocab = prune_vocabulary(vocab,
                                term_count_min = 1)
# doc_proportion_max = 0.5,
# doc_proportion_min = 0.001)

vectorizer = vocab_vectorizer(pruned_vocab)

dtm_m  = create_dtm(it_m, vectorizer)
dim(dtm_m)

dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)

print(difftime(Sys.time(), t1, units = 'sec'))

From the document term matrix, rows (documents) with five or fewer tokens are removed, and columns (terms) that occur six or fewer times in the corpus are removed. Only the remaining documents and terms are retained.

# some basic clean-up ops
dim(dtm)

a0 = apply(dtm, 1, sum)   # apply sum operation to dtm's rows. i.e. get rowSum
  dtm = dtm[(a0 > 5),]    # retain only those rows with token rowSum >5, i.e. delete empty rows
  dim(dtm); rm(a0)        # delete a0 object

a0 = apply(dtm, 2, sum)   # use apply() to find colSums this time
  dtm = dtm[, (a0 > 6)]     # retain only those terms that occurred more than 6 times in the corpus
  # dtm = dtm[, (a0 > 4)]     # retain only those terms that occurred > 4 times in the corpus
  dim(dtm); rm(a0)
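As a quick check on how sparse the pruned DTM still is: dtm is a tm DocumentTermMatrix, i.e. a slam simple_triplet_matrix, so its non-zero entries are stored in dtm$v.

length(dtm$v) / (nrow(dtm) * ncol(dtm))   # fraction of non-zero cells, i.e. matrix density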

For this pruned DTM, a summary wordcloud of the most frequent terms is displayed below.

# view summary wordcloud
a0 = apply(dtm, 2, sum)     # colSum vector of dtm
  a0[1:5]                   # view what a0 obj is like
  a1 = order(as.vector(a0), decreasing = TRUE)     # vector of token locations
  a0 = a0[a1]     # a0 ordered as per decreasing token frequency
  a0[1:5]         # view a0 now

windows() # opens new image window
wordcloud(names(a0), a0,     # invoke wordcloud() func. Use ?wordcloud for more info
          scale=c(4,1), 
          3, # min.freq 
          max.words = 100,
          colors = brewer.pal(8, "Dark2"))
title(sub = "Quick Summary Wordcloud")

Now let’s create the Term Co-occurrence Matrix (TCM) and view the distilled COG for the full corpus. text2vec returns an upper-triangular TCM, so adding its transpose yields a symmetric adjacency matrix. In the resulting plot, the s central nodes are coloured green and the peripheral nodes pink.

#------------------------------------------------------#
# Step 1a - Term Co-occurrence Matrix                  #
#------------------------------------------------------#

pruned_vocab = prune_vocabulary(vocab,
                                term_count_min = 5)

vectorizer = vocab_vectorizer(pruned_vocab, grow_dtm = FALSE, skip_grams_window = 3L)
# note: in newer text2vec versions, skip_grams_window is passed to create_tcm() instead
tcm = create_tcm(it_m, vectorizer)

tcm.mat = as.matrix(tcm)
adj.mat = tcm.mat + t(tcm.mat)

diag(adj.mat) = 0     # set diagonals of the adj matrix to zero --> a node isn't its own neighbour
a0 = order(apply(adj.mat, 2, sum), decreasing = T)
adj.mat = as.matrix(adj.mat[a0[1:50], a0[1:50]])
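Since adj.mat was ordered by decreasing column sums, its first columns are the most connected terms; it is worth a quick peek before plotting:

colnames(adj.mat)[1:10]   # ten most connected terms in the 50-node graph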

windows()
distill.cog(adj.mat, 'Distilled COG for full corpus',  10,  10)

Now that we have the DTM and the TCM for the full corpus, let’s move on to model-based sentiment analysis. The standalone script below reads a fresh corpus (movie reviews) and scores the polarity of each document.
# rm(list=ls())                   # Clear workspace

#--------------------------------------------------------#
# Step 0 - Assign Library & define functions             #
#--------------------------------------------------------#

library(text2vec)
library(data.table)
library(stringr)
library(tm)
library(RWeka)
library(tokenizers)
library(slam)
library(wordcloud)
library(ggplot2)

text.clean = function(x)                    # text data
{ require("tm")
  x  =  gsub("<.*?>", " ", x)               # regex for removing HTML tags
  x  =  iconv(x, "latin1", "ASCII", sub="") # Keep only ASCII characters
  x  =  gsub("[^[:alnum:]]", " ", x)        # keep only alpha numeric 
  x  =  tolower(x)                          # convert to lower case characters
  x  =  removeNumbers(x)                    # removing numbers
  x  =  stripWhitespace(x)                  # removing white space
  x  =  gsub("^\\s+|\\s+$", "", x)          # remove leading and trailing white space
  return(x)
}


#--------------------------------------------------------#
# Step 1 - Reading text data                             #
#--------------------------------------------------------#

# temp.text = readLines('C:\\Users\\30773\\Desktop\\Data Science\\cba batch 7\\text classification\\testdata.txt')
temp_text = read.csv(file.choose(), stringsAsFactors = FALSE)  #  iron man reviews
temp.text = temp_text$text
# head(temp.text,1)
data = data.frame(id = 1:length(temp.text), text = temp.text, stringsAsFactors = F)
# dim(data)

# Read Stopwords list
#stpw1 = readLines('C:\\Users\\30773\\Desktop\\Data Science\\cba batch 7\\stopwords.txt')      #
stpw1 = readLines(file.choose())  #  stop words
stpw2 = tm::stopwords('english')        # tm package stop word list; the tokenizers package has a function of the same name
comn  = unique(c(stpw1, stpw2))         # union of the two lists
stopwords = unique(gsub("'"," ",comn))  # final stop word list after removing apostrophes

x  = text.clean(data$text)             # pre-process text corpus
x  =  removeWords(x,stopwords)            # removing stopwords created above
x  =  stripWhitespace(x)                  # removing white space
# x  =  stemDocument(x)

#--------------------------------------------------------#
######  Create DTM using text2vec package                #
#--------------------------------------------------------#

t1 = Sys.time()

tok_fun = word_tokenizer

it_m = itoken(x,
              # preprocessor = text.clean,
              tokenizer = tok_fun,
              ids = data$id,
              progressbar = T)

vocab = create_vocabulary(it_m
                          # ngram = c(2L, 2L),
                          #stopwords = stopwords
                          )

pruned_vocab = prune_vocabulary(vocab,
                                term_count_min = 1)
# doc_proportion_max = 0.5,
# doc_proportion_min = 0.001)

vectorizer = vocab_vectorizer(pruned_vocab)

dtm_m  = create_dtm(it_m, vectorizer)
dim(dtm_m)

dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)
  a0 = (apply(dtm, 1, sum) > 0)   # build vector to identify non-empty docs
  dtm = dtm[a0,]                  # drop empty docs


print(difftime(Sys.time(), t1, units = 'sec'))

#--------------------------------------------------------#
#             Sentiment Analysis                         #
#--------------------------------------------------------#
if (!require(qdap)) install.packages("qdap")   # qdap needs a working, up-to-date Java install
library(qdap)

x1 = x[a0]    # remove empty docs from corpus

t1 = Sys.time()   # set timer

pol = polarity(x1)                # calculate polarity using the qdap dictionary
wc  = pol$all[,2]                 # word count in each doc
val = pol$all[,3]                 # polarity score of each doc
p   = pol$all[,4]                 # positive words in each doc
n   = pol$all[,5]                 # negative words in each doc

Sys.time() - t1  # how much time did the above take?

head(pol$all)
head(pol$group)

positive_words = unique(setdiff(unlist(p),"-"))  # Positive words list
negative_words = unique(setdiff(unlist(n),"-"))  # Negative words list

print(positive_words)       # Print all the positive words found in the corpus
print(negative_words)       # Print all neg words
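qdap’s polarity() scores each document as, roughly, the sum of signed dictionary word weights divided by the square root of the document’s word count. With those scores in val, a quick optional look at the extremes:

summary(val)                             # distribution of polarity scores
x1[order(val, decreasing = TRUE)[1:2]]   # the two most positive documents
x1[order(val)[1:2]]                      # the two most negative documents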

#--------------------------------------------------------#
#   Create Postive Words wordcloud                       #
#--------------------------------------------------------#

pos.tdm = dtm[,which(colnames(dtm) %in% positive_words)]
m = as.matrix(pos.tdm)
v = sort(colSums(m), decreasing = TRUE)
windows() # opens new image window
wordcloud(names(v), v, scale=c(4,1),1, max.words=100,colors=brewer.pal(8, "Dark2"))
title(sub = "Positive Words - Wordcloud")

# plot barchart for top tokens
test = data.frame(term = names(v)[1:15], freq = v[1:15])   # named columns keep ggplot happy
windows() # opens new image window
ggplot(test, aes(x = reorder(term, -freq), y = freq)) + 
  geom_bar(stat = "identity", fill = "blue") +
  geom_text(aes(label = freq), vjust = -0.20) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "token", y = "frequency")

#--------------------------------------------------------#
#  Create Negative Words wordcloud                       #
#--------------------------------------------------------#

neg.tdm = dtm[,which(colnames(dtm) %in% negative_words) ]
m = as.matrix(neg.tdm)
v = sort(colSums(m), decreasing = TRUE)
windows()
wordcloud(names(v), v, scale=c(4,1),1, max.words=100,colors=brewer.pal(8, "Dark2"))         
title(sub = "Negative Words - Wordcloud")

# plot barchart for top tokens
test = data.frame(term = names(v)[1:15], freq = v[1:15])
windows()
ggplot(test, aes(x = reorder(term, -freq), y = freq)) + 
  geom_bar(stat = "identity", fill = "red") +
  geom_text(aes(label = freq), vjust = -0.20) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "token", y = "frequency")