library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringi)
library(tm)
## Warning: package 'tm' was built under R version 4.1.3
## Loading required package: NLP
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.1.3
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.1.3
## Loading required package: RColorBrewer
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(tidyr)
library(mgsub)
## Warning: package 'mgsub' was built under R version 4.1.3
library(scales) 
library(stringr)
library(textdata)
## Warning: package 'textdata' was built under R version 4.1.3
library(widyr)
## Warning: package 'widyr' was built under R version 4.1.3
library(SnowballC)
library(syuzhet)
## Warning: package 'syuzhet' was built under R version 4.1.3
## 
## Attaching package: 'syuzhet'
## The following object is masked from 'package:scales':
## 
##     rescale
# Load the first 10,000 lines of the review file as a character vector.
review <- readLines(con = "test.ft.txt.bz2", n = 10000)

# Wrap the lines in a tm corpus built from a vector source.
review_source <- VectorSource(review)
TextDoc <- Corpus(review_source)

# Clean the corpus before the term-document matrix is built.
# In the recorded run each tm_map() call on this corpus emitted
# "Warning in tm_map.SimpleCorpus(...): transformation drops documents".

# Transformer that replaces every match of a regular expression with a space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Remove the class marker ("__label__<digits>") as a whole token. Replacing
# only "__" left the word "label" behind, which then showed up as the most
# frequent term of the corpus (10061 occurrences in the recorded run).
# Assumes fastText-style "__label__<digits>" prefixes, as the input file name
# and the stray "label" term suggest -- TODO confirm against the raw data.
TextDoc <- tm_map(TextDoc, toSpace, "__label__[0-9]+")

# Replace the remaining separator symbols with spaces.
TextDoc <- tm_map(TextDoc, toSpace, "__")
TextDoc <- tm_map(TextDoc, toSpace, ":")
TextDoc <- tm_map(TextDoc, toSpace, "-")

# Convert text to lower case.
TextDoc <- tm_map(TextDoc, content_transformer(tolower))

# Remove numbers.
TextDoc <- tm_map(TextDoc, removeNumbers)

# Remove common English stop words. This runs before punctuation removal so
# that stop words written with an apostrophe are still seen in that form.
TextDoc <- tm_map(TextDoc, removeWords, stopwords("english"))

# Remove punctuation.
TextDoc <- tm_map(TextDoc, removePunctuation)

# Collapse runs of whitespace left behind by the removals above.
TextDoc <- tm_map(TextDoc, stripWhitespace)

# Text stemming - reduces words to their root form (e.g. "movi").
# NOTE(review): the sentiment step further down matches these stems against
# word lists loaded from file; if those lists are unstemmed, some words will
# no longer match -- confirm this is intended.
TextDoc <- tm_map(TextDoc, stemDocument)
# Term-document matrix of the cleaned corpus: one row per term, one column
# per document, weighted by raw term frequency.
TextDoc_tdm <- TextDoc %>% TermDocumentMatrix()

# Print the matrix summary together with a small sample of its cells.
inspect(TextDoc_tdm)
# Recorded output from an earlier run:
## <<TermDocumentMatrix (terms: 24783, documents: 10000)>>
## Non-/sparse entries: 344400/247485600
## Sparsity           : 100%
## Maximal term length: 74
## Weighting          : term frequency (tf)
## Sample             :
##        Docs
## Terms   1176 164 2187 3071 3625 3666 6029 6972 7834 9825
##   book     0   0    0    0    0    1    0    0    0    2
##   good     1   0    0    0    1    0    2    2    1    1
##   great    0   2    0    0    0    1    1    1    0    0
##   just     0   3    0    0    0    0    0    2    1    2
##   label    1   1    1    1    1    1    1    1    1    1
##   like     1   5    1    1    1    0    0    1    0    0
##   movi     0   0    0    1    2    0    2    0    2    0
##   one      1   1    1    0    0    0    1    2    1    1
##   read     0   0    0    0    0    0    0    0    0    1
##   time     2   0    1    0    0    0    0    2    0    0

# Ordinary matrix copy of the term-document matrix for the row sums below.
# NOTE(review): at the recorded size (24783 x 10000) this conversion is
# memory-hungry; consider a sparse row sum if memory becomes a problem.
tdm_m <- as.matrix(x = TextDoc_tdm)

# Word counts: total occurrences of each term across all documents, most
# frequent first. TRUE is spelled out because T is an ordinary variable that
# can be reassigned, whereas TRUE is a reserved word.
wordCounts <- sort(rowSums(tdm_m), decreasing = TRUE)
head(wordCounts)
# Recorded output from an earlier run:
## label  book   one  movi  like  read 
## 10061  7209  4103  3795  3402  3373

# Word cloud of at most 50 terms that occur at least twice; rot.per is the
# share of words drawn rotated, and colours come from the "Dark2" palette.
wordcloud(
  words = names(wordCounts),
  freq = wordCounts,
  min.freq = 2,
  max.words = 50,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

# Sentiment analysis: load the positive and negative word lists.
# NOTE(review): both paths are absolute and specific to one machine; the
# script will not run elsewhere unless they are adjusted.
pos <- "C:/Users/sharl/Desktop/USF/2021/Spring 2021/LIS 4761 Data-Text Mining/HW/Labs/week6_Lab/positive-words.txt"
neg <- "C:/Users/sharl/Desktop/USF/2021/Spring 2021/LIS 4761 Data-Text Mining/HW/Labs/week6_Lab/negative-words.txt"

# Read each file as a character vector with one element per line.
p <- scan(file = pos, what = character(0), sep = "\n")
n <- scan(file = neg, what = character(0), sep = "\n")

# Drop the first 34 elements of each vector (the file header, per the
# original comment), leaving only the word lists.
header_rows <- seq_len(34)
p <- p[-header_rows]
n <- n[-header_rows]

# Total number of word occurrences in the corpus.
totalWords <- sum(wordCounts)

# Every distinct term, in the same order as wordCounts.
words <- names(wordCounts)

# --- Positive words ---
# For each term, its position in the positive list p, or 0 when it is absent.
matched <- match(words, p, nomatch = 0)
head(matched, 10)
##  [1]    0    0    0    0 1083    0  827  852    0    0

# Spot check: the fifth term should be the same word as its match in p.
matched[5]
## [1] 1083
p[1083]
## [1] "like"
words[5]
## [1] "like"

# Counts of the terms found in the positive list.
is_positive <- matched > 0
mCounts <- wordCounts[is_positive]
length(mCounts)
## [1] 388
mWords <- names(mCounts)

# Total occurrences of positive terms.
nPos <- sum(mCounts)
nPos
## [1] 34188

# --- Negative words ---
# Same procedure against the negative list n.
n_matched <- match(words, n, nomatch = 0)
is_negative <- n_matched > 0
nCounts <- wordCounts[is_negative]
nWords <- names(nCounts)

# Total occurrences of negative terms.
nNeg <- sum(nCounts)
nNeg
## [1] 20510
length(nCounts)
## [1] 898
# Calculate the share of all word occurrences that are positive or negative.
#
# nPos and nNeg are occurrence counts, so the denominator has to be the total
# number of word occurrences, sum(wordCounts). The previous denominator,
# length(words), is the number of distinct terms; dividing by it produced
# values that are not proportions (1.379494 and 0.8275834 in the recorded
# run, i.e. a "percentage" above 100%).
total_Words <- sum(wordCounts)

# Fraction of word occurrences found in the positive list (x 100 for percent).
ratioPos <- nPos / total_Words
ratioPos

# Fraction of word occurrences found in the negative list (x 100 for percent).
ratioNeg <- nNeg / total_Words
ratioNeg

# Both ratios side by side in a one-row data frame.
totalRatio <- data.frame(ratioPos, ratioNeg)
totalRatio