sentiment.R

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.1.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringi)
library(tm)

## Warning: package 'tm' was built under R version 4.1.3

## Loading required package: NLP

library(tidytext)

## Warning: package 'tidytext' was built under R version 4.1.3

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 4.1.3

## Loading required package: RColorBrewer

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library(tidyr)
library(mgsub)

## Warning: package 'mgsub' was built under R version 4.1.3

library(scales) 
library(stringr)
library(textdata)

## Warning: package 'textdata' was built under R version 4.1.3

library(widyr)

## Warning: package 'widyr' was built under R version 4.1.3

library(SnowballC)
library(syuzhet)

## Warning: package 'syuzhet' was built under R version 4.1.3

## 
## Attaching package: 'syuzhet'

## The following object is masked from 'package:scales':
## 
##     rescale

# Read the first 10,000 lines of the review file.
# Each line presumably follows the fastText layout "__label__<k> <text>"
# (the later "__" clean-up and the per-document "label" term point to it)
# -- TODO confirm against the data file.
review <- readLines("test.ft.txt.bz2", n = 10000)

# Strip the leading class tag before tokenising. Replacing "__" with a
# space only turns the tag into the bare token "label", which then shows up
# once in every document, tops the frequency table and inflates the total
# word count even though it is metadata rather than review text.
# Outputs printed before this change still list "label"; re-run to refresh.
review <- sub("^__label__[0-9]+[[:space:]]*", "", review)

# One corpus document per review line
TextDoc <- Corpus(VectorSource(review))

# Clean-up pipeline ----
# On this corpus every tm_map() step emits the same warning, nine in total:
#   ## Warning in tm_map.SimpleCorpus(...): transformation drops documents

# Helper transformation: swap every match of `pattern` for a single space
toSpace <- content_transformer(
  function(x, pattern) gsub(pattern = pattern, replacement = " ", x = x)
)

# The steps run top to bottom, in the same order as before
TextDoc <- TextDoc %>%
  tm_map(toSpace, "__") %>%                          # symbols -> space
  tm_map(toSpace, ":") %>%
  tm_map(toSpace, "-") %>%
  tm_map(content_transformer(tolower)) %>%           # lower case
  tm_map(removeNumbers) %>%                          # drop digits
  tm_map(removeWords, stopwords("english")) %>%      # drop English stop words
  tm_map(removePunctuation) %>%                      # drop punctuation
  tm_map(stripWhitespace) %>%                        # collapse extra spaces
  tm_map(stemDocument)                               # reduce words to stems

# Build a term-document matrix (rows are terms, columns are documents)
TextDoc_tdm <- TermDocumentMatrix(TextDoc)
inspect(TextDoc_tdm)

## <<TermDocumentMatrix (terms: 24783, documents: 10000)>>
## Non-/sparse entries: 344400/247485600
## Sparsity           : 100%
## Maximal term length: 74
## Weighting          : term frequency (tf)
## Sample             :
##        Docs
## Terms   1176 164 2187 3071 3625 3666 6029 6972 7834 9825
##   book     0   0    0    0    0    1    0    0    0    2
##   good     1   0    0    0    1    0    2    2    1    1
##   great    0   2    0    0    0    1    1    1    0    0
##   just     0   3    0    0    0    0    0    2    1    2
##   label    1   1    1    1    1    1    1    1    1    1
##   like     1   5    1    1    1    0    0    1    0    0
##   movi     0   0    0    1    2    0    2    0    2    0
##   one      1   1    1    0    0    0    1    2    1    1
##   read     0   0    0    0    0    0    0    0    0    1
##   time     2   0    1    0    0    0    0    2    0    0

# Dense copy of the sparse matrix.
# NOTE(review): 24783 terms x 10000 documents is about 248 million cells, so
# this step is memory hungry; consider summing the sparse matrix directly.
tdm_m <- as.matrix(TextDoc_tdm)

# Word counts: occurrences of each term summed over all documents, sorted
# from most to least frequent. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
wordCounts <- rowSums(tdm_m)
wordCounts <- sort(wordCounts, decreasing = TRUE)
head(wordCounts)

## label  book   one  movi  like  read 
## 10061  7209  4103  3795  3402  3373

# Word cloud of the term frequencies: at most 50 words, each seen at least
# twice, coloured with the 8-colour "Dark2" palette
wordcloud(
  words = names(wordCounts),
  freq = wordCounts,
  min.freq = 2,
  max.words = 50,
  rot.per = 0.35,
  colors = brewer.pal(n = 8, name = "Dark2")
)

# Sentiment Analysis
# Locate the positive/negative word lists. The directory defaults to the
# original author's folder, so existing runs behave as before; set the
# SENTIMENT_LEXICON_DIR environment variable to run the script elsewhere.
lexicon_dir <- Sys.getenv("SENTIMENT_LEXICON_DIR", unset = NA)
if (is.na(lexicon_dir) || !nzchar(lexicon_dir)) {
  lexicon_dir <- "C:/Users/sharl/Desktop/USF/2021/Spring 2021/LIS 4761 Data-Text Mining/HW/Labs/week6_Lab"
}
pos <- file.path(lexicon_dir, "positive-words.txt")
neg <- file.path(lexicon_dir, "negative-words.txt")

# Fail early with a readable message instead of a bare connection error
lexicon_files <- c(pos, neg)
missing_files <- lexicon_files[!file.exists(lexicon_files)]
if (length(missing_files) > 0) {
  stop(
    "Lexicon file(s) not found: ", paste(missing_files, collapse = ", "),
    call. = FALSE
  )
}

# Read each list as a character vector, split on newlines
p <- scan(pos, character(0), sep = "\n")
n <- scan(neg, character(0), sep = "\n")

# remove header: drop the first 34 entries of each list
# NOTE(review): 34 is taken on trust from the original script -- confirm it
# matches the header length of the lexicon files in use.
p <- p[-1:-34]
n <- n[-1:-34]

# Total number of word occurrences in the corpus
totalWords <- sum(wordCounts)

# Every distinct term, in the same order as wordCounts
words <- names(wordCounts)

# Position of each term inside the positive list; 0 marks "not listed".
# NOTE(review): the corpus terms were stemmed earlier (e.g. "movi"), while
# the word lists presumably hold unstemmed words, so some hits may be
# missed -- confirm whether the lists should be stemmed as well.
matched <- match(x = words, table = p, nomatch = 0)
head(matched, n = 10)

##  [1]    0    0    0    0 1083    0  827  852    0    0

# Spot check: the fifth term maps to entry 1083 of the positive list, and
# both sides read "like"
matched[5]

## [1] 1083

p[1083]

## [1] "like"

words[5]

## [1] "like"

# Frequencies of the terms found in the positive list
mCounts <- wordCounts[matched > 0]
length(mCounts)

## [1] 388

# Positive terms and their summed occurrences
mWords <- names(mCounts)
nPos <- sum(mCounts)
nPos

## [1] 34188

# Same procedure for the negative list
n_matched <- match(x = words, table = n, nomatch = 0)
nCounts <- wordCounts[n_matched > 0]
nWords <- names(nCounts)
nNeg <- sum(nCounts)
nNeg

## [1] 20510

length(nCounts)

## [1] 898

# Calculate the share of all word occurrences that are positive or negative.
# nPos and nNeg are sums of occurrences, so the denominator has to be the
# total number of occurrences as well. Dividing by length(words), i.e. the
# 24783 distinct terms, gave a positive "ratio" of 1.379494, which cannot be
# a proportion.
total_Words <- sum(wordCounts)
ratioPos <- nPos / total_Words
ratioPos

ratioNeg <- nNeg / total_Words
ratioNeg

# One-row summary holding both ratios
totalRatio <- data.frame(ratioPos, ratioNeg)
totalRatio

## Values printed earlier (ratioPos 1.379494, ratioNeg 0.8275834) came from
## the old denominator; re-run the script to print the corrected ratios.

sentiment.R

sharl

2022-05-26