Kostas
21 September 2016
We will examine some text mining tools you can use in R, some visualizations, and a case study:
Topic Detection with Latent Dirichlet Allocation
We will also visualize the results: the sentiment, the most frequent words, how words co-occur, the topic distribution, and the sentiment of the topics.
## IMDB movie reviews
## http://ai.stanford.edu/~amaas/data/sentiment/
## Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). Learning Word Vectors for Sentiment Analysis. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
## The dataset comes pre-separated into train and test sets. I have already read both into one dataset of 50000 reviews and exported it to a txt file.
allreviews=readLines("myreviews.txt")
allreviews=allreviews[-1]
label=read.table("mylabels.txt")
allreviews=data.frame(x=allreviews,y=label$x)
allreviews=allreviews[1:25000,]
positive=allreviews[1:12500,]      # the positive reviews (rows 1-12500)
rownames(positive)=c(1:nrow(positive))
negative=allreviews[12501:25000,]  # the negative reviews (rows 12501-25000)
smp_size <- floor(0.70 * nrow(positive))
train_ind <- sample(seq_len(nrow(positive)), size = smp_size)
postrain=positive[train_ind,]
postest=positive[-train_ind,]
smp_size <- floor(0.70 * nrow(negative))
train_ind <- sample(seq_len(nrow(negative)), size = smp_size)
negtrain=negative[train_ind,]
negtest=negative[-train_ind,]
# So, now we have a train set with 17500 reviews and a test set with 7500 reviews
train=rbind(negtrain,postrain)
test=rbind(negtest,postest)
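As a quick sanity check on the split (a minimal sketch, assuming the objects created above), you can confirm the sizes and the class balance:
nrow(train)    # expected: 17500 reviews (70% of 25000)
nrow(test)     # expected: 7500 reviews
table(train$y) # the two classes should be balanced
table(test$y)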
require(tm)
# clean train dataset ###########
train_corpus=Corpus(DataframeSource(as.matrix(train$x)))
p1=tm_map(train_corpus,tolower)
p2=tm_map(p1,removeNumbers)
p3=tm_map(p2,removeWords,stopwords("en")) # Other options include the SMART stopword list and several other languages (not Greek)
p4=tm_map(p3,removePunctuation)
p5=tm_map(p4,stripWhitespace)
p6=tm_map(p5,PlainTextDocument,lazy=TRUE)
p7=tm_map(p6,stemDocument, language = "en")
# clean test dataset ###########
test_corpus=Corpus(DataframeSource(as.matrix(test$x)))
tp1=tm_map(test_corpus,tolower)
tp2=tm_map(tp1,removeNumbers)
tp3=tm_map(tp2,removeWords,stopwords("en")) # Other options include the SMART stopword list and several other languages (not Greek)
tp4=tm_map(tp3,removePunctuation)
tp5=tm_map(tp4,stripWhitespace)
tp6=tm_map(tp5,PlainTextDocument,lazy=TRUE)
tp7=tm_map(tp6,stemDocument, language = "en")
# The movie reviews are now pre-processed. An exploratory analysis is still needed to check for unusual terms and any replacements worth making.
# Useful note: the "wordnet" library makes it possible to replace words with synonyms. Example:
require("wordnet")
Sys.setenv(WNHOME ="C:/Program Files (x86)/WordNet/2.1/dict")
setDict("C:/Program Files (x86)/WordNet/2.1/dict")
synonyms("company","NOUN")
[1] "caller" "companionship" "company" "fellowship"
[5] "party" "ship's company" "society" "troupe"
## You need to point R to the WordNet dictionary; whether an error occurs depends on your setup, and a quick web search usually resolves it. Now with the tm package:
# tm_map( corpus,replaceWords,synonyms(dict,"company"),by="company")
# You can manually replace any word you like. Example: tm_map(corpus,replaceWords,"burger",by="burgers")
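If replaceWords is not available in your tm version, a minimal sketch of the same idea uses content_transformer with gsub (the replace_word helper and the burger/burgers example are just illustrative):
# a small helper that replaces one word with another inside every document of a corpus
replace_word <- function(corpus, from, to) {
  tm_map(corpus, content_transformer(function(x) gsub(from, to, x, fixed = TRUE)))
}
# example: p7_alt <- replace_word(p7, "burgers", "burger")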
#train_dtm=DocumentTermMatrix(p7,control = list(bounds = list(global = c(10,Inf))))
## The bounds/global option removes terms that appear in fewer than 10 documents of the corpus.
# train_dtm=DocumentTermMatrix(p7,control = list(bounds = list(global = c(10,Inf)),tolower=T,removeNumbers=T,removePunctuation=T,stopwords=T,stripWhitespace=T,stemming=T))
train_dtm=DocumentTermMatrix(p7,control = list(bounds = list(global = c(10,Inf)),wordLengths=c(3,Inf),weighting = function(x) weightTfIdf(x, normalize = FALSE)))
train_dtm1=removeSparseTerms(train_dtm,(0.96)) # Remove sparse terms
train_BoW=as.matrix(train_dtm1)
rownames(train_BoW)<-c(1:length(rownames(train_BoW)))
train_data_m=data.frame(x=train_BoW,y=train$y) # Created the train data.frame
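The 0.96 sparsity threshold keeps only terms that appear in at least roughly 4% of the documents. A quick check of how aggressive this pruning is (assuming the objects created above):
dim(train_dtm)  # documents x terms before removing sparse terms
dim(train_dtm1) # documents x terms after removing sparse terms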
DTM_test= DocumentTermMatrix(tp7,control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE),wordLengths=c(3,Inf),dictionary=Terms(train_dtm1))) # here again you can add the text preprocess commands, as shown before
test1_BoW=as.matrix(DTM_test)
rownames(test1_BoW)<-c(1:length(rownames(test1_BoW)))
test1_data<-data.frame(x=test1_BoW,y= test$y ) # Test set data.frame is ready!
#library(rJava)
#install.packages("RWeka")
#require(RWeka)
#NgramTokenizer = function(x) NGramTokenizer(x,Weka_control(min=1,max=2))
#train_dtm_bigram=DocumentTermMatrix(p7,control = list(bounds = list(global = c(10,Inf)),weighting = function(x) weightTfIdf(x, normalize = FALSE),tokenize=NgramTokenizer))
#test_dtm_bigram=DocumentTermMatrix(tp7,control = list(tokenize=NgramTokenizer,weighting = function(x) weightTfIdf(x, normalize = FALSE),dictionary=Terms(train_dtm_bigram)))
FreqTerms=findFreqTerms(train_dtm1,lowfreq=10000,highfreq=Inf)
term.freq=colSums(train_BoW)
term.freq=subset(term.freq,term.freq>=10000)
df=data.frame(term=names(term.freq),freq=term.freq)
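One simple way to show the most frequent words promised in the introduction is a bar chart of the df built above; a minimal sketch with ggplot2 (note that freq here is a summed tf-idf weight, since train_BoW was built with weightTfIdf):
require(ggplot2)
# bar chart of the most frequent (tf-idf weighted) terms in the training corpus
ggplot(df, aes(x = reorder(term, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  xlab("") + ylab("Weighted frequency") +
  ggtitle("Most frequent terms in the training reviews")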
require(syuzhet)
syuzhet.sentiment<-get_sentiment(as.character(train$x),method="afinn" )
afinn_sentiment=ifelse(syuzhet.sentiment<=0,"N","P")
conf.matrix=table(afinn_sentiment,train$y)
accuracy= sum(diag(conf.matrix))/sum(conf.matrix)
print(accuracy)
[1] 0.7074286
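The same comparison can be repeated with the other lexicons shipped with syuzhet; a minimal sketch for the "bing" lexicon, using the same labelling rule as above:
bing.sentiment <- get_sentiment(as.character(train$x), method = "bing")
bing_label <- ifelse(bing.sentiment <= 0, "N", "P")
bing.conf <- table(bing_label, train$y)
sum(diag(bing.conf)) / sum(bing.conf) # accuracy of the bing lexicon against the train labels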
assocs.cast=findAssocs(DTM_test,"cast",corlimit=0.11)
assocs.horror=findAssocs(DTM_test,"horror",corlimit=0.11)
assocs.action=findAssocs(DTM_test,"action",corlimit=0.11)
print(c(assocs.cast,assocs.horror,assocs.action))
$cast
support role perform actor excel fine play talent
0.19 0.17 0.14 0.13 0.12 0.11 0.11 0.11
$horror
dead hous effect film flick budget genr kill death movi
0.19 0.17 0.14 0.14 0.14 0.12 0.12 0.12 0.11 0.11
$action
sequenc fight scene
0.14 0.13 0.12
action=data.frame(x=names(assocs.action$action),y=assocs.action$action)
require(ggplot2)
ggplot(data=action, aes(x=x, y= y))+geom_bar(colour="purple",fill="brown",width=0.5, stat="identity") +guides(fill=FALSE)+coord_flip()+ggtitle("The terms which associate \n most with \"action\"")+xlab("")
# add +expand_limits(y=c(0,1))+scale_y_continuous(labels=percent) if you want different y limits and labels (percent comes from the scales package)
We use the test set of 7500 reviews to reduce the computation time.
require(tm)
require(RWeka)
topic_corpus=Corpus(DataframeSource(as.matrix(as.character(test$x))))
lt1=tm_map(topic_corpus,tolower)
lt2=tm_map(lt1,removeNumbers)
lt3=tm_map(lt2,removeWords,stopwords("SMART")) # Remove stopwords from SMART list
lt4=tm_map(lt3,removePunctuation)
lt4=tm_map(lt4,removeWords,"br") # remove the leftover "br" from the HTML <br /> tags in the reviews
lt5=tm_map(lt4,stripWhitespace)
lt6=tm_map(lt5,PlainTextDocument,lazy=TRUE)
lt7=tm_map(lt6,stemDocument, language = "en")
NgramTokenizer = function(x) NGramTokenizer(x,Weka_control(min=1,max=2))
DTM_topic=DocumentTermMatrix(lt7,control = list(bounds = list(global = c(10,Inf)),wordLengths=c(3,Inf),tokenize=NgramTokenizer)) # keep terms that appear in at least 10 documents
DTM_topic_s=removeSparseTerms(DTM_topic,(0.97))
require(topicmodels)
k <- 5
t1=Sys.time()
ldaOut2 <-LDA(DTM_topic_s,k, method="Gibbs",control = list(alpha = 1))
t2=Sys.time()
t2-t1
Time difference of 1.19132 mins
ldaOut.terms2 <- as.matrix(terms(ldaOut2,10))
print(ldaOut.terms2)
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
[1,] "movi" "film" "stori" "show" "film"
[2,] "watch" "scene" "life" "great" "scene"
[3,] "bad" "kill" "love" "watch" "charact"
[4,] "good" "guy" "charact" "play" "work"
[5,] "act" "horror" "peopl" "time" "make"
[6,] "make" "thing" "live" "love" "perform"
[7,] "time" "good" "man" "good" "actor"
[8,] "plot" "back" "world" "comedi" "director"
[9,] "thing" "plot" "end" "seri" "direct"
[10,] "actor" "end" "time" "funni" "great"
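Besides the top terms per topic, topicmodels can also report the single most likely topic of each document directly; a quick sketch (a simpler alternative to the thresholded assignment further below):
doc_topics <- topics(ldaOut2, 1) # most likely topic per document
table(doc_topics)                # how many documents fall into each of the 5 topics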
topicmatrix2=as.matrix(DTM_topic_s)
row.names(topicmatrix2)=c(1:length(test$y))
lda_inf2 <- posterior(ldaOut2,topicmatrix2[10:11,])
lda_inf2$topics
1 2 3 4 5
10 0.2500000 0.3500000 0.0500000 0.1000000 0.250000
11 0.2682927 0.1463415 0.2682927 0.1219512 0.195122
# run as.character(test$x[10:11]) to read these two reviews
lda_inf2 <- posterior(ldaOut2,topicmatrix2)
# assign each review to its dominant topic if that topic has probability above 0.39, otherwise label it "Multitopic"
mat=apply(lda_inf2$topics,1,function(p){
  j=which.max(p)
  if(p[j]>0.39) paste("Topic",j) else "Multitopic"
})
#barplotdata=data.frame(topic=mat)
#library(ggplot2)
#qplot(topic, data=barplotdata, geom="bar", fill=topic)+ggtitle("Topic frequencies in 7500 reviews. A review is assigned\nto a topic if that topic has probability >= 0.4")+ylab("Frequency")+scale_fill_brewer(name="",palette="Spectral")+xlab("")
stop_words <- stopwords("SMART")
# pre-processing:
reviews <- gsub("'", "", test$x) # remove apostrophes
reviews <- gsub("[[:punct:]]", " ", reviews) # replace punctuation with space
reviews <- gsub("[[:cntrl:]]", " ", reviews) # replace control characters with space
reviews <- gsub("^[[:space:]]+", "", reviews) # remove whitespace at beginning of documents
reviews <- gsub("[[:space:]]+$", "", reviews) # remove whitespace at end of documents
reviews <- tolower(reviews) # force to lowercase
# tokenize on space and output as a list:
doc.list <- strsplit(reviews, "[[:space:]]+")
# compute the table of terms:
term.table <- table(unlist(doc.list))
term.table <- sort(term.table, decreasing = TRUE)
# remove terms that are stop words or occur fewer than 5 times:
del <- names(term.table) %in% stop_words | term.table < 5
term.table <- term.table[!del]
vocab <- names(term.table)
# now put the documents into the format required by the lda package:
get.terms <- function(x) {
index <- match(x, vocab)
index <- index[!is.na(index)]
rbind(as.integer(index - 1), as.integer(rep(1, length(index))))
}
documents <- lapply(doc.list, get.terms)
# Compute some statistics related to the data set:
D <- length(documents) # number of documents
W <- length(vocab) # number of terms in the vocab
doc.length <- sapply(documents, function(x) sum(x[2, ])) # number of tokens per document
N <- sum(doc.length) # total number of tokens in the data
term.frequency <- as.integer(term.table) # frequencies of terms in the corpus
K <- 5 ## Important: the number of topics. Experiment with it.
G <- 5000   # number of Gibbs sampling iterations
alpha <- 1  # Dirichlet prior on the per-document topic proportions
eta <- 1    # Dirichlet prior on the per-topic term distributions
# Fit the model:
# install.packages("lda")
library(lda)
set.seed(357)
t1 <- Sys.time()
fit <- lda.collapsed.gibbs.sampler(documents = documents, K = K, vocab = vocab,num.iterations = G, alpha = alpha,eta = eta, initial = NULL, burnin = 0,compute.log.likelihood = TRUE)
t2 <- Sys.time()
t2 - t1 # elapsed fitting time on a laptop
Time difference of 12.21508 mins
theta <- t(apply(fit$document_sums + alpha, 2, function(x) x/sum(x)))
phi <- t(apply(t(fit$topics) + eta, 2, function(x) x/sum(x)))
MovieReviews <- list(phi = phi,theta = theta,doc.length = doc.length,
vocab = vocab,term.frequency = term.frequency)
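Here theta holds the per-document topic proportions and phi the per-topic term distributions; a quick sanity check that both are proper probability distributions:
dim(theta)              # D documents x K topics
dim(phi)                # K topics x W vocabulary terms
summary(rowSums(theta)) # every row should sum to 1
summary(rowSums(phi))   # every row should sum to 1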
#install.packages("LDAvis")
library(LDAvis)
# create the JSON object to feed the visualization:
json <- with(MovieReviews,createJSON(phi = MovieReviews$phi,theta = MovieReviews$theta,doc.length = MovieReviews$doc.length,vocab = MovieReviews$vocab, term.frequency = MovieReviews$term.frequency))
#serVis(json,as.gist = TRUE) # opens a browser window where you authorize your GitHub account; it then creates a gist from which you can share the interactive visualization
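If you prefer not to publish a gist, serVis can also write the visualization to a local folder and open it in your browser; a minimal sketch (the folder name is arbitrary):
# write the LDAvis output to a local directory instead of a gist
serVis(json, out.dir = "ldavis_movie_reviews", open.browser = TRUE)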
library(FSelector)
chi_squared_weights= chi.squared(y~.,data=train_data_m)
all=dim(chi_squared_weights)[1]
zeroes=length(chi_squared_weights$attr_importance[chi_squared_weights$attr_importance==0])
final=all-zeroes
subsetx=cutoff.k(chi_squared_weights,final)
train_F=data.frame(train_data_m[,subsetx],y=train$y)
test_F=data.frame(test1_data[subsetx],y=test$y)
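To see which terms the chi-squared filter considers most informative, you can inspect the top of the ranking:
cutoff.k(chi_squared_weights, 20) # the 20 terms with the highest chi-squared importance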
# The Naive Bayes classifier below reaches about 0.79 accuracy on the test set
t1=Sys.time()
library(e1071)
NBclassifier=naiveBayes(y~.,data=train_F)
test1Pred=predict(NBclassifier,newdata=test_F)
t2=Sys.time()
t2-t1 # how fast the NB classifier is
Time difference of 54.69847 secs
library(caret)
confusionMatrix(test1Pred,test_F$y,positive="P",dnn=c("Prediction","True"))
Confusion Matrix and Statistics
True
Prediction N P
N 2827 636
P 923 3114
Accuracy : 0.7921
95% CI : (0.7828, 0.8013)
No Information Rate : 0.5
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5843
Mcnemar's Test P-Value : 4.375e-13
Sensitivity : 0.8304
Specificity : 0.7539
Pos Pred Value : 0.7714
Neg Pred Value : 0.8163
Prevalence : 0.5000
Detection Rate : 0.4152
Detection Prevalence : 0.5383
Balanced Accuracy : 0.7921
'Positive' Class : P
# The SVM classifier below achieves about 83% accuracy.
t1=Sys.time()
library(kernlab)
ksvm_class=ksvm(y~.,data=train_F)
SVMPred=predict(ksvm_class,newdata=test_F)
t2=Sys.time()
t2-t1
Time difference of 3.289436 mins
confusionMatrix(SVMPred,test_F$y,positive="P",dnn=c("Prediction","True"))
Confusion Matrix and Statistics
True
Prediction N P
N 3054 568
P 696 3182
Accuracy : 0.8315
95% CI : (0.8228, 0.8399)
No Information Rate : 0.5
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.6629
Mcnemar's Test P-Value : 0.0003541
Sensitivity : 0.8485
Specificity : 0.8144
Pos Pred Value : 0.8205
Neg Pred Value : 0.8432
Prevalence : 0.5000
Detection Rate : 0.4243
Detection Prevalence : 0.5171
Balanced Accuracy : 0.8315
'Positive' Class : P
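If you want to push the accuracy further, kernlab makes it easy to cross-validate and to swap kernels; a minimal sketch (the linear kernel and the C value are illustrative, not tuned):
# 5-fold cross-validated SVM with a linear kernel and an explicit cost parameter
ksvm_lin=ksvm(y~.,data=train_F,kernel="vanilladot",C=1,cross=5)
cross(ksvm_lin) # cross-validation error estimate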