Import data and representation

First install and load packages needed for text mining.

install.packages(c('tm', 'SnowballC', 'wordcloud', 'topicmodels'))

library(tm)

## Loading required package: NLP

library(SnowballC)
library(wordcloud)

## Loading required package: RColorBrewer

Next we load the move review dataset

reviews = read.csv("movie_reviews.csv", stringsAsFactors = F, row.names = 1)

The review dataset has two variables: content of the review, and polarity of the review (0 or 1). To use the tm package we first transfrom the dataset to a corpus:

review_corpus = Corpus(VectorSource(reviews$content))

Next we normalize the texts in the reviews using a series of pre-processing steps: 1. Switch to lower case 2. Remove numbers 3. Remove punctuation marks and stopwords 4. Remove extra whitespaces

review_corpus = tm_map(review_corpus, content_transformer(tolower))
review_corpus = tm_map(review_corpus, removeNumbers)
review_corpus = tm_map(review_corpus, removePunctuation)
review_corpus = tm_map(review_corpus, removeWords, c("the", "and", stopwords("english")))
review_corpus =  tm_map(review_corpus, stripWhitespace)

After the above transformations the first review looks like

inspect(review_corpus[1])

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 2269

To analyze the textual data, we use a Document-Term Matrix (DTM) representation: documents as the rows, terms/words as the columns, frequency of the term in the document as the entries. Because the number of unique words in the corpus the dimension can be large.

review_dtm <- DocumentTermMatrix(review_corpus)
review_dtm

## <<DocumentTermMatrix (documents: 2000, terms: 46460)>>
## Non-/sparse entries: 538003/92381997
## Sparsity           : 99%
## Maximal term length: 61
## Weighting          : term frequency (tf)

inspect(review_dtm[500:505, 500:505])

## <<DocumentTermMatrix (documents: 6, terms: 6)>>
## Non-/sparse entries: 0/36
## Sparsity           : 100%
## Maximal term length: 11
## Weighting          : term frequency (tf)
## 
##      Terms
## Docs  adolescents adolf adolph adopt adopted adopter
##   500           0     0      0     0       0       0
##   501           0     0      0     0       0       0
##   502           0     0      0     0       0       0
##   503           0     0      0     0       0       0
##   504           0     0      0     0       0       0
##   505           0     0      0     0       0       0

To reduce the dimension of the DTM, we can emove the less frequent terms such that the sparsity is less than 0.95

review_dtm = removeSparseTerms(review_dtm, 0.99)
review_dtm

## <<DocumentTermMatrix (documents: 2000, terms: 4366)>>
## Non-/sparse entries: 400173/8331827
## Sparsity           : 95%
## Maximal term length: 17
## Weighting          : term frequency (tf)

The first review now looks like

inspect(review_dtm[1,1:20])

## <<DocumentTermMatrix (documents: 1, terms: 20)>>
## Non-/sparse entries: 1/19
## Sparsity           : 95%
## Maximal term length: 12
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs abandon abandoned abilities ability able aboard absence absent
##    1       0         0         0       0    0      0       0      0
##     Terms
## Docs absolute absolutely absurd abuse abyss academy accent accept
##    1        0          0      0     0     0       0      0      0
##     Terms
## Docs acceptable accepts accident accidentally
##    1          0       0        1            0

We can draw a simple word cloud

findFreqTerms(review_dtm, 1000)

##  [1] "action"     "also"       "another"    "back"       "bad"       
##  [6] "best"       "can"        "character"  "characters" "director"  
## [11] "doesnt"     "dont"       "end"        "even"       "film"      
## [16] "films"      "first"      "get"        "good"       "great"     
## [21] "hes"        "just"       "know"       "life"       "like"      
## [26] "little"     "love"       "made"       "make"       "man"       
## [31] "many"       "movie"      "movies"     "much"       "never"     
## [36] "new"        "one"        "people"     "plot"       "really"    
## [41] "scene"      "scenes"     "see"        "seems"      "something" 
## [46] "still"      "story"      "time"       "two"        "way"       
## [51] "well"       "will"       "work"

freq = data.frame(sort(colSums(as.matrix(review_dtm)), decreasing=TRUE))
wordcloud(rownames(freq), freq[,1], max.words=50, colors=brewer.pal(1, "Dark2"))

One may argue that in the wordcloud, words such as one, film, movie do not carry too much meaning in the setting, since we know that the entire corpus is about movies. Therefore sometimes it is necessary to use the tf–idf(term frequency–inverse document frequency) instead of the frequencies of the term as entries, tf-idf measures the relative importance of a word to a document.

review_dtm_tfidf <- DocumentTermMatrix(review_corpus, control = list(weighting = weightTfIdf))
review_dtm_tfidf = removeSparseTerms(review_dtm_tfidf, 0.95)
review_dtm_tfidf

## <<DocumentTermMatrix (documents: 2000, terms: 963)>>
## Non-/sparse entries: 254264/1671736
## Sparsity           : 87%
## Maximal term length: 14
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)

# The first document
inspect(review_dtm_tfidf[1,1:20])

## <<DocumentTermMatrix (documents: 1, terms: 20)>>
## Non-/sparse entries: 3/17
## Sparsity           : 85%
## Maximal term length: 10
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
## 
##     Terms
## Docs ability able absolutely across act acting action actor      actors
##    1       0    0          0      0   0      0      0     0 0.005821345
##     Terms
## Docs actress actual   actually add admit age agent        ago air alien
##    1       0      0 0.01088143   0     0   0     0 0.01054396   0     0
##     Terms
## Docs alive
##    1     0

Is the new word cloud more informative?

freq = data.frame(sort(colSums(as.matrix(review_dtm_tfidf)), decreasing=TRUE))
wordcloud(rownames(freq), freq[,1], max.words=100, colors=brewer.pal(1, "Dark2"))

Predictive modeling

To predict the polarity (sentiment) of a review, we can make use of a precompiled list of words with positive and negative meanings (Source

neg_words = read.table("negative-words.txt", header = F, stringsAsFactors = F)[, 1]
pos_words = read.table("positive-words.txt", header = F, stringsAsFactors = F)[, 1]

As simple indicators, we create two variables (neg, pos) that contain the number of positive and negative words in each document

reviews$neg = sapply(review_corpus, tm_term_score, neg_words)
reviews$pos = sapply(review_corpus, tm_term_score, pos_words)

Let’s remove the actual texual content for statistical model building

reviews$content = NULL

Now we can combine the tf-idf matrix with the sentiment polarity according to the sentiment lists.

reviews = cbind(reviews, as.matrix(review_dtm_tfidf))
reviews$polarity = as.factor(reviews$polarity)

Split to testing and training set

id_train <- sample(nrow(reviews),nrow(reviews)*0.80)
reviews.train = reviews[id_train,]
reviews.test = reviews[-id_train,]

The rest should be natural for you by this point. We can compare the performance of logistic regression, decision tree, SVM, and neural network models.

install.packages(c('rpart', 'rpart.plot', 'e1071', 'nnet'))

library(rpart)
library(rpart.plot)
library(e1071)
library(nnet)

Train models:

reviews.tree = rpart(polarity~.,  method = "class", data = reviews.train);  
prp(reviews.tree)

reviews.glm = glm(polarity~ ., family = "binomial", data =reviews.train, maxit = 100);  
reviews.svm = svm(polarity~., data = reviews.train);
reviews.nnet = nnet(polarity~., data=reviews.train, size=1, maxit=500)

## # weights:  968
## initial  value 1110.859994 
## iter  10 value 957.189873
## iter  20 value 945.434800
## iter  30 value 919.799736
## iter  40 value 533.447779
## iter  50 value 109.572367
## iter  60 value 36.610575
## iter  70 value 16.088250
## iter  80 value 11.392190
## iter  90 value 10.226104
## iter 100 value 2.211291
## iter 110 value 0.025293
## iter 120 value 0.004061
## iter 130 value 0.002048
## iter 140 value 0.000988
## iter 150 value 0.000665
## iter 160 value 0.000599
## iter 170 value 0.000203
## iter 180 value 0.000187
## final  value 0.000079 
## converged

Evaluate performance with the test set:

pred.tree = predict(reviews.tree, reviews.test,  type="class")
table(reviews.test$polarity,pred.tree,dnn=c("Obs","Pred"))

##    Pred
## Obs   0   1
##   0 136  63
##   1  64 137

mean(ifelse(reviews.test$polarity != pred.tree, 1, 0))

## [1] 0.3175

pred.glm = as.numeric(predict(reviews.glm, reviews.test, type="response") > 0.5)
table(reviews.test$polarity,pred.glm,dnn=c("Obs","Pred"))

##    Pred
## Obs   0   1
##   0 141  58
##   1  61 140

mean(ifelse(reviews.test$polarity != pred.glm, 1, 0))

## [1] 0.2975

pred.svm = predict(reviews.svm, reviews.test)
table(reviews.test$polarity,pred.svm,dnn=c("Obs","Pred"))

##    Pred
## Obs   0   1
##   0 170  29
##   1  41 160

mean(ifelse(reviews.test$polarity != pred.svm, 1, 0))

## [1] 0.175

prob.nnet= predict(reviews.nnet,reviews.test)
pred.nnet = as.numeric(prob.nnet > 0.5)
table(reviews.test$polarity, pred.nnet, dnn=c("Obs","Pred"))

##    Pred
## Obs   0   1
##   0 162  37
##   1  41 160

mean(ifelse(reviews.test$polarity != pred.nnet, 1, 0))

## [1] 0.195

Basic Text Mining with R

Import data and representation

Predictive modeling