Topic003

################################################################################
## topic-networks.R A sample analysis of a document collection using some     ##
##    standard NLP techniques and a new approach with topic networks.         ##
##                                                                            ##
## To run this code you must first download and extract the sample corpus     ##
##  corpus.zip                                                                ##
## and set the corpus.directory variable to point to the corpus directory.    ##
##                                                                            ##
## Requires the following packages:                                           ##
##    igraph                                                                  ##
##    ggplot2                                                                 ##
##    RColorBrewer                                                            ##
##    slam                                                                    ##
##    tm                                                                      ##
##    topicmodels                                                             ##
##    wordcloud                                                               ##
##                                                                            ##
## Much of the analysis below is based on ideas from Solomon Messing's blog   ##  
## post                                                                       ##
##    Working with Bipartite/Affiliation Network Data in R                    ##
## at                                                                         ##
##    http://solomonmessing.wordpress.com/2012/09/30/                         ##
##                                                                            ##
################################################################################
# Set the path to the corpus directory.                                        #
# ---------------------------------------------------------------------------- #
corpus.directory <- 'C:/Users/s-das/Syncplicity/MyProjects_IMP/MY_Papers_V2/HACKING_CONTESTs/Topic-Networks-master/Topic-Networks-master/corpus/corpus'
setwd('C:/Users/s-das/Syncplicity/MyProjects_IMP/MY_Papers_V2/HACKING_CONTESTs/Topic-Networks-master/Topic-Networks-master/doc/')
# ---------------------------------------------------------------------------- #
# Load required libraries.                                                     #
# ---------------------------------------------------------------------------- #
require(igraph, quietly=TRUE)
require(RColorBrewer, quietly=TRUE)
require(ggplot2, quietly=TRUE)
require(slam, quietly=TRUE)
require(tm, quietly=TRUE)
require(topicmodels, quietly=TRUE)
require(wordcloud, quietly=TRUE)
# ---------------------------------------------------------------------------- #
# Load the corpus data.                                                        #
# ---------------------------------------------------------------------------- #
corpus.source <- DirSource(corpus.directory, encoding="UTF-8")
corpus <- Corpus(corpus.source)
corpus.copy <-corpus
# ---------------------------------------------------------------------------- #
# Build the document term matrix.                                              #
# ---------------------------------------------------------------------------- #
corpus.dtm <- DocumentTermMatrix(
  corpus, 
  control = list(
    stemming = TRUE, 
    stopwords = TRUE, 
    minWordLength = 3,
    removeNumbers = TRUE, 
    removePunctuation = TRUE
  )
)
# ---------------------------------------------------------------------------- #
# Visualize frequent words.                                                    #
# ---------------------------------------------------------------------------- #
word.freq <- sort(col_sums(corpus.dtm), decreasing=TRUE)
# barplot with the top 20 most frequent terms
top.terms <- head(word.freq, n=20)
# complete the stems and fix missing values and errors:
completions <- stemCompletion(names(top.terms), dictionary=corpus.copy, type="prevalent")
completions<-ifelse(completions == "", names(top.terms), completions)
names(top.terms) <-completions
names(top.terms)[10] <-"every"
##pdf("mostfreq.pdf")
op <- par(mar = c(4,6.1,.1,.2))
barplot(top.terms, las=2, horiz=TRUE)
##dev.off()
##png("mostfreq.png")
op <- par(mar = c(4,6.1,.1,.2))
barplot(top.terms, las=2, horiz=TRUE)

###dev.off()
# ---------------------------------------------------------------------------- #
# Build the tf-idf matrix and use it to filter the document term matrix.       #
# ---------------------------------------------------------------------------- #
term.tfidf <- tapply(corpus.dtm$v/row_sums(corpus.dtm)[corpus.dtm$i], corpus.dtm$j, mean) * log2(nDocs(corpus.dtm)/col_sums(corpus.dtm > 0))
dtm <- corpus.dtm[,term.tfidf >= 0.01]
dtm <- dtm[row_sums(dtm) > 0,]
# ---------------------------------------------------------------------------- #
# Build a topic model and collect relevant data in data frames.                #
# ---------------------------------------------------------------------------- #
corpus.tm <- LDA(dtm, k = 22)
corpus.tm.terms <- terms(corpus.tm, 3)
corpus.tm.topics <- topics(corpus.tm, 4)
topic.labels <- apply(corpus.tm.terms, 2, function(x) paste(x, collapse=", "))
document.labels <- colnames(corpus.tm.topics)
dt.df <- data.frame(
  document=rep(document.labels, each=4),
  topic=as.vector(corpus.tm.topics)
)
dt.df$document <- as.numeric(gsub(".txt","",dt.df$document))
dt.df <- dt.df[order(dt.df$document),]
# ---------------------------------------------------------------------------- #
# Build the incidence matrice and document-topic network.                      #
# ---------------------------------------------------------------------------- #
dt.matrix <- as.matrix(table(dt.df))
dt.network <- graph.incidence(dt.matrix)
# ---------------------------------------------------------------------------- #
# Visualize the document-topic network.                                        #
# ---------------------------------------------------------------------------- #
n.docs <- nrow(dt.matrix)
n.topics <- ncol(dt.matrix)
n.vertices <- n.docs + n.topics
V(dt.network)$color[1:n.docs] <- rgb(1,0,0,.4)
V(dt.network)$color[(n.docs+1):n.vertices] <- rgb(0,1,0,.5)
V(dt.network)$label <- NA
V(dt.network)$size[1:n.docs] <- 2
V(dt.network)$size[(n.docs+1):n.vertices] <- 6
E(dt.network)$width <- .5
E(dt.network)$color <- rgb(.5,.5,0,.4)
tkplot(dt.network)
## [1] 1

## [1] 2

## [1] 3

2020-02-24