library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringi)
library(tm)
## Loading required package: NLP
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(tidyr)
library(mgsub)
library(scales)
library(stringr)
library(textdata)
library(widyr)
library(SnowballC)
library(syuzhet)
##
## Attaching package: 'syuzhet'
## The following object is masked from 'package:scales':
##
## rescale
# Read the first 10,000 reviews from the compressed text file
review <- readLines("test.ft.txt.bz2", n = 10000)
TextDoc <- Corpus(VectorSource(review))
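Each line of test.ft.txt.bz2 appears to carry a fastText-style sentiment prefix (__label__1 or __label__2), which is why the stem "label" tops the term counts below once the "__" separators are replaced with spaces and the digits removed. If the prefix should stay out of the analysis entirely, a minimal sketch (not applied here, so the outputs below still contain "label"):

# Strip a leading fastText label marker, if present (hypothetical cleanup step)
review_clean <- sub("^__label__[12]\\s*", "", review)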
# Replace symbols that glue words together with spaces
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
TextDoc <- tm_map(TextDoc, toSpace, "__")
TextDoc <- tm_map(TextDoc, toSpace, ":")
TextDoc <- tm_map(TextDoc, toSpace, "-")
# Convert text to lower case
TextDoc <- tm_map(TextDoc, content_transformer(tolower))
# Remove numbers
TextDoc <- tm_map(TextDoc, removeNumbers)
# Remove common English stop words
TextDoc <- tm_map(TextDoc, removeWords, stopwords("english"))
# Remove punctuation
TextDoc <- tm_map(TextDoc, removePunctuation)
# Collapse extra white space
TextDoc <- tm_map(TextDoc, stripWhitespace)
# Stem words to their root form
TextDoc <- tm_map(TextDoc, stemDocument)
Each tm_map() call on a SimpleCorpus emits a "transformation drops documents" warning; this is a known quirk of SimpleCorpus and no documents are actually dropped.
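At this point it is worth eyeballing a processed document to confirm the transformations behaved as intended; a quick sketch using the corpus built above:

# Print the first cleaned, stemmed review as plain text
writeLines(as.character(TextDoc[[1]]))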
# Build a term-document matrix
TextDoc_tdm <- TermDocumentMatrix(TextDoc)
inspect(TextDoc_tdm)
## <<TermDocumentMatrix (terms: 24783, documents: 10000)>>
## Non-/sparse entries: 344400/247485600
## Sparsity : 100%
## Maximal term length: 74
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 1176 164 2187 3071 3625 3666 6029 6972 7834 9825
## book 0 0 0 0 0 1 0 0 0 2
## good 1 0 0 0 1 0 2 2 1 1
## great 0 2 0 0 0 1 1 1 0 0
## just 0 3 0 0 0 0 0 2 1 2
## label 1 1 1 1 1 1 1 1 1 1
## like 1 5 1 1 1 0 0 1 0 0
## movi 0 0 0 1 2 0 2 0 2 0
## one 1 1 1 0 0 0 1 2 1 1
## read 0 0 0 0 0 0 0 0 0 1
## time 2 0 1 0 0 0 0 2 0 0
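Converting a 24,783 x 10,000 term-document matrix to a dense matrix allocates roughly 248 million cells. If memory becomes a problem, tm can drop rarely used terms first; a sketch with an assumed sparsity threshold of 0.99:

# Keep only terms appearing in at least ~1% of documents (threshold is an assumption)
TextDoc_tdm_small <- removeSparseTerms(TextDoc_tdm, sparse = 0.99)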
tdm_m <- as.matrix(TextDoc_tdm)
# Total frequency of each term across all reviews, sorted in decreasing order
wordCounts <- rowSums(tdm_m)
wordCounts <- sort(wordCounts, decreasing = TRUE)
head(wordCounts)
## label book one movi like read
## 10061 7209 4103 3795 3402 3373
# Word cloud of the 50 most frequent stems
wordcloud(names(wordCounts), wordCounts, min.freq = 2,
          max.words = 50, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
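Since ggplot2 is already loaded, the same frequencies can also be shown as a bar chart, which is easier to read precisely than a cloud; a minimal sketch for the top ten stems (the object names here are illustrative):

# Bar chart of the ten most frequent stems
top_terms <- data.frame(term = names(wordCounts)[1:10],
                        count = as.numeric(wordCounts[1:10]))
ggplot(top_terms, aes(x = reorder(term, count), y = count)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Term frequency")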

# Sentiment analysis: score terms against positive/negative word lists
# (the Hu and Liu opinion lexicon)
pos <- "C:/Users/sharl/Desktop/USF/2021/Spring 2021/LIS 4761 Data-Text Mining/HW/Labs/week6_Lab/positive-words.txt"
neg <- "C:/Users/sharl/Desktop/USF/2021/Spring 2021/LIS 4761 Data-Text Mining/HW/Labs/week6_Lab/negative-words.txt"
p <- scan(pos, character(0), sep = "\n")
n <- scan(neg, character(0), sep = "\n")
# Drop the 34-line license/header block at the top of each lexicon file
p <- p[-1:-34]
n <- n[-1:-34]
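If the hard-coded file paths are a nuisance, equivalent word lists are available through tidytext (already loaded above); a sketch, assuming the Bing lexicon is an acceptable stand-in for the local files:

# Pull the Bing opinion lexicon directly, no local files needed
bing <- get_sentiments("bing")
p_alt <- bing$word[bing$sentiment == "positive"]
n_alt <- bing$word[bing$sentiment == "negative"]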
# Total number of word tokens across all reviews (used for proportions later)
totalWords <- sum(wordCounts)
# Vector of all unique terms, ordered by frequency
words <- names(wordCounts)
# match() gives each term's position in the positive lexicon (0 if absent)
matched <- match(words, p, nomatch = 0)
head(matched, 10)
## [1] 0 0 0 0 1083 0 827 852 0 0
# Sanity check: the fifth most frequent term lines up with lexicon entry 1083
matched[5]
## [1] 1083
p[1083]
## [1] "like"
words[5]
## [1] "like"
# Frequencies of the terms found in the positive lexicon
mCounts <- wordCounts[which(matched != 0)]
length(mCounts)
## [1] 388
mWords <- names(mCounts)
# Total count of positive word tokens
nPos <- sum(mCounts)
nPos
## [1] 34188
# Repeat the matching for the negative lexicon
n_matched <- match(words, n, nomatch = 0)
nCounts <- wordCounts[which(n_matched != 0)]
# Total count of negative word tokens
nNeg <- sum(nCounts)
nWords <- names(nCounts)
nNeg
## [1] 20510
length(nCounts)
## [1] 898
# Ratio of positive/negative token counts to the number of unique terms.
# Note: not a true percentage -- nPos and nNeg count tokens while
# total_Words counts unique terms, so the ratio can exceed 1.
total_Words <- length(words)
ratioPos <- nPos/total_Words
ratioPos
## [1] 1.379494
ratioNeg <- nNeg/total_Words
ratioNeg
## [1] 0.8275834
totalRatio <- data.frame(ratioPos, ratioNeg)
totalRatio
## ratioPos ratioNeg
## 1 1.379494 0.8275834
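For a score that is bounded between 0 and 1, divide the token totals by the overall token count computed earlier (totalWords); a minimal sketch:

# Share of all word tokens that are positive or negative
propPos <- nPos / totalWords
propNeg <- nNeg / totalWords
data.frame(propPos, propNeg)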