comments_work3.Rmd

library(reshape2)
library(ggplot2)
library(tm)

Loading required package: NLP


Attaching package: 'NLP'

The following object is masked from 'package:ggplot2':

    annotate

library(topicmodels) 
library(RColorBrewer)
library(wordcloud)

Reading the file that contains comments and converting it from wide to long format

# Load the exported comments, keeping text columns as character vectors.
# The data frame is no longer called `c`, which shadowed base::c().
comments_wide <- read.csv(
  "/Users/analucic/Documents/comments_work3.csv",
  stringsAsFactors = FALSE
)
# Reshape from wide to long: `id` is kept as the identifier, the former
# column names go into `key`, and the cell contents into `value`.
# reshape2's argument is `value.name` (singular); the earlier spelling
# `value.names` is not a melt() parameter, so it had no effect and the
# column was only called "value" because that is the default.
d <- melt(
  comments_wide,
  id.vars = "id",
  variable.name = "key",
  value.name = "value",
  factorsAsStrings = FALSE
)
head(d)

Keeping only the column that contains the comments and pre-processing them: removing apostrophes and quotation marks, converting terms to lower case, removing punctuation, and stripping extra whitespace

# Keep only the comment text and drop missing entries.
text <- d$value
text <- text[!is.na(text)]
# Remove straight and curly apostrophes plus curly double quotes in a single
# pass. This replaces six sequential gsub() calls, two of which could never
# match anything: the "’re" pattern ran after every "’" had already been
# deleted, and the second "'" pass repeated the first.
text <- gsub("['’“”]", "", text)
# Build the corpus (one document per comment) and lower-case it.
# tolower() is a plain character function, so it is wrapped in
# content_transformer() as tm expects for tm_map().
m <- Corpus(VectorSource(text))
m <- tm_map(m, content_transformer(tolower))

Warning in tm_map.SimpleCorpus(m, tolower): transformation drops documents

# Apply tm's removePunctuation transformation to every document in the corpus.
m <- tm_map(m, removePunctuation)

Warning in tm_map.SimpleCorpus(m, removePunctuation): transformation drops
documents

# Apply tm's stripWhitespace transformation to every document in the corpus
# (per the tm documentation this collapses repeated whitespace).
m <- tm_map(m, stripWhitespace)

Warning in tm_map.SimpleCorpus(m, stripWhitespace): transformation drops
documents

Removing words that we don’t want in the data set, in addition to the default text mining library stopwords list

# Project-specific stop words: courtesy words, fandom character names and
# leftover fragments. Order and repeated entries are kept exactly as
# originally listed.
myStopWords <- c(
  "thank", "thanks", "use", "see", "used", "via", "amp",
  "kylo", "ben", "solo", "rey", "leia", "luke", "skywalker",
  "han", "solo", "chewie", "dean", "winchester", "sam", "castiel",
  "love", "heart", "yay", "omg", "captain", "america", "steve",
  "rogers", "bucky", "barnes", "winter", "soldier", "sam", "wilson",
  "tony", "stark", "ironman", "falcon", "black", "widow", "natasha",
  "clint", "s", "m", "3", "cas", "cant", "like", "much", "really",
  "cas", "reylo", "balthazar", "im", "d"
)
# Remove the standard English stop words together with the custom list.
stop_word_pool <- c(stopwords("english"), myStopWords)
m <- tm_map(m, removeWords, stop_word_pool)

Warning in tm_map.SimpleCorpus(m, removeWords, c(stopwords("english"),
myStopWords)): transformation drops documents

Creating a term document matrix: terms in rows and documents as columns

# Build the term-document matrix (terms in rows, documents in columns).
# wordLengths = c(1, Inf) admits tokens of any length from one character up.
tdm_control <- list(wordLengths = c(1, Inf))
# Prune very rare terms with a sparsity threshold of 0.999.
tdm <- removeSparseTerms(TermDocumentMatrix(m, control = tdm_control), 0.999)
tdm

<<TermDocumentMatrix (terms: 220, documents: 226320)>>
Non-/sparse entries: 118745/49671655
Sparsity           : 100%
Maximal term length: 12
Weighting          : term frequency (tf)

Finding terms associated with the terms of interest; the minimum correlation is set to 0.3

# Terms whose correlated neighbours we want to inspect.
# NOTE(review): "see" is also in myStopWords, so it has already been removed
# from the corpus and will presumably always come back empty here.
terms_of_interest <- c(
  "twitter", "tumblr", "ao3", "reading", "writing", "learning",
  "looking", "art", "fanart", "dresses", "multimedia", "meta",
  "fanon", "canon", "discord", "pillowfort", "dreamwidth", "kofi",
  "imagine", "see", "recipe", "links", "website", "wookiepedia",
  "explain", "describe", "description", "visualize", "hear", "smell",
  "taste", "understand", "know"
)
# Report, per term, the other terms correlated with it at 0.3 or above.
findAssocs(tdm, terms_of_interest, corlimit = 0.3)

$twitter
numeric(0)

$tumblr
numeric(0)

$ao3
numeric(0)

$reading
commenting      words       kind 
      0.60       0.31       0.30 

$writing
numeric(0)

$learning
numeric(0)

$looking
forward 
   0.67 

$art
numeric(0)

$fanart
numeric(0)

$dresses
numeric(0)

$multimedia
numeric(0)

$meta
numeric(0)

$fanon
numeric(0)

$canon
numeric(0)

$discord
numeric(0)

$pillowfort
numeric(0)

$dreamwidth
numeric(0)

$kofi
numeric(0)

$imagine
numeric(0)

$see
numeric(0)

$recipe
numeric(0)

$links
numeric(0)

$website
numeric(0)

$wookiepedia
numeric(0)

$explain
numeric(0)

$describe
numeric(0)

$description
numeric(0)

$visualize
numeric(0)

$hear
numeric(0)

$smell
numeric(0)

$taste
numeric(0)

$understand
numeric(0)

$know
numeric(0)

Establishing frequent terms. Taking the subset of terms that appear at least 1,000 times (more comments are available for this work than for the other two works)

# Terms that occur at least 1,000 times in the term-document matrix.
freq.terms <- findFreqTerms(tdm, lowfreq = 1000)
freq.terms

 [1] "one"     "also"    "oh"      "reading" "wait"    "always"  "chapter"
 [8] "can"     "dont"    "reys"    "even"    "know"    "next"    "will"   
[15] "just"    "well"    "think"   "going"   "now"     "time"    "good"   
[22] "way"     "read"    "story"   "update"  "get"     "please"  "fic"

one

also

oh

reading

wait

always

chapter

can

dont

reys

even

know

next

will

just

well

think

going

now

time

good

way

read

story

update

get

please

fic

# Total occurrences of every term across all documents. as.matrix() converts
# the term-document matrix to an ordinary dense matrix for rowSums().
term_totals <- rowSums(as.matrix(tdm))
# Keep only the terms counted at least 1,000 times (names are preserved).
term.freq <- term_totals[which(term_totals >= 1000)]

Converting frequent terms into a data frame format for easier plotting

# One row per frequent term, with its total count.
df <- data.frame(term = names(term.freq), freq = term.freq)
# Bar chart of the counts, bars ordered from most to least frequent and the
# x-axis labels tilted 45 degrees so they stay legible.
freq_plot <- ggplot(df, aes(x = reorder(term, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
freq_plot

Establishing word frequency. Specified minimum frequency for the word cloud plot is 1,000

# Dense copy of the term-document matrix, used to total each term's count.
wm <- as.matrix(tdm)
# Term totals, most frequent first. TRUE/FALSE are spelled out because the
# shorthands T and F are ordinary variables that can be reassigned.
word.freq <- sort(rowSums(wm), decreasing = TRUE)
# Use the five darkest shades of the 9-step "BuGn" palette (drop the first 4).
pal <- brewer.pal(9, "BuGn")[-(1:4)]
# Draw only terms with a total of at least 1,000; random.order = FALSE plots
# words in decreasing order of frequency.
wordcloud(
  words = names(word.freq),
  freq = word.freq,
  min.freq = 1000,
  random.order = FALSE,
  colors = pal
)

Transposing the term document matrix and converting it into a document term matrix (documents as rows and terms as columns)

# Convert the term-document matrix into a document-term matrix so that each
# comment is a row, which is the layout the later LDA() call is given.
dtm <- as.DocumentTermMatrix(tdm)

Summing term frequencies for each document (individual comment) and eliminating the documents that do not contain any terms

# Optional stricter term pruning, currently disabled:
# dtm <- removeSparseTerms(dtm, 0.99)
# dtm

# Number of retained terms in each document. rowSums() on the dense matrix
# replaces apply(dtm, 1, sum): same totals, computed in one vectorised call
# and consistent with how term totals are computed elsewhere in this file.
row_total <- rowSums(as.matrix(dtm))
# Documents with no remaining terms are excluded before topic modelling.
dtm.new <- dtm[row_total > 0, ]
dtm.new

<<DocumentTermMatrix (documents: 33261, terms: 220)>>
Non-/sparse entries: 118745/7198675
Sparsity           : 98%
Maximal term length: 12
Weighting          : term frequency (tf)

Running topic modeling (Latent Dirichlet Allocation) on the document term matrix and requesting 8 topics. Listing the 10 most likely terms for each topic.

# Fit an 8-topic LDA model on the non-empty documents. The fit starts from a
# random initialisation, so a fixed seed is passed through `control` to make
# the topic table reproducible from run to run.
lda <- LDA(dtm.new, k = 8, control = list(seed = 1234L))
# The 10 most likely terms for each topic, one column per topic.
term <- terms(lda, 10)
term

      Topic 1   Topic 2   Topic 3   Topic 4   Topic 5   Topic 6
 [1,] "will"    "just"    "chapter" "chapter" "will"    "one"  
 [2,] "just"    "fic"     "just"    "snoke"   "reading" "two"  
 [3,] "also"    "chapter" "also"    "reading" "chapter" "fic"  
 [4,] "oh"      "will"    "oh"      "will"    "next"    "way"  
 [5,] "going"   "back"    "think"   "going"   "one"     "just" 
 [6,] "next"    "writing" "youre"   "wait"    "need"    "will" 
 [7,] "writing" "also"    "wait"    "time"    "dont"    "read" 
 [8,] "things"  "now"     "well"    "well"    "oh"      "reys" 
 [9,] "reys"    "dont"    "story"   "youre"   "end"     "hope" 
[10,] "time"    "know"    "next"    "happy"   "get"     "still"
      Topic 7      Topic 8 
 [1,] "chapter"    "please"
 [2,] "reading"    "just"  
 [3,] "right"      "now"   
 [4,] "oh"         "story" 
 [5,] "think"      "time"  
 [6,] "well"       "hope"  
 [7,] "time"       "going" 
 [8,] "just"       "one"   
 [9,] "commenting" "update"
[10,] "soon"       "take"

will

just

also

oh

going

next

writing

things

reys

time

just

fic

chapter

will

back

writing

also

now

dont

know

chapter

just

also

oh

think

youre

wait

well

story

next

chapter

snoke

reading

will

going

wait

time

well

youre

happy

will

reading

chapter

next

one

need

dont

oh

end

get

one

two

fic

way

just

will

read

reys

hope

still

chapter

reading

right

oh

think

well

time

just

commenting

soon

please

just

now

story

time

hope

going

one

update

take

50 most frequent words and their frequencies

# Put every document in the same group so the counts are pooled over the
# whole collection, then take that single group's 50 most frequent terms.
one_group <- rep(1, nrow(dtm))
findMostFreqTerms(dtm, n = 50, INDEX = one_group)[[1]]

   chapter       just       will    reading       also         oh 
      3444       3101       2203       2026       1640       1590 
       one      going       wait        now      story        get 
      1564       1491       1438       1431       1426       1399 
      time       good       know     update        can       next 
      1361       1360       1321       1238       1231       1231 
      read      think       well       reys     please        fic 
      1221       1161       1136       1130       1089       1079 
       way       dont       even     always    amazing       hope 
      1057       1031       1018       1003        990        986 
     happy      loved      youre    writing      great       want 
       928        896        882        878        849        841 
      kind      snoke       feel      first        two       glad 
       832        829        815        806        804        782 
     words      still commenting       make      right       back 
       769        759        758        741        734        716 
     kylos  beautiful 
       700        695

LS0tCnRpdGxlOiAiY29tbWVudHNfd29yazMuUm1kIgphdXRob3I6ICJBbmEgTHVjaWMgJiBMYXVyZW4gUm91c2UiCmRhdGU6ICIxNCBNYXkgMjAxOSIKb3V0cHV0OiAiaHRtbF9ub3RlYm9vayIKLS0tCgpgYGB7cn0KbGlicmFyeShyZXNoYXBlMikKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KHRtKSAKbGlicmFyeSh0b3BpY21vZGVscykgCmxpYnJhcnkoUkNvbG9yQnJld2VyKQpsaWJyYXJ5KHdvcmRjbG91ZCkgCmBgYAoKIyMjIyMgUmVhZGluZyB0aGUgZmlsZSB0aGF0IGNvbnRhaW5zIGNvbW1lbnRzIGFuZCBjb252ZXJ0aW5nIGl0IGZyb20gd2lkZSB0byBsb25nIGZvcm1hdApgYGB7cn0KYyA8LSByZWFkLmNzdigiL1VzZXJzL2FuYWx1Y2ljL0RvY3VtZW50cy9jb21tZW50c193b3JrMy5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzPUZBTFNFKQpkIDwtIG1lbHQoYywgdmFyaWFibGUubmFtZSA9ICJrZXkiLCB2YWx1ZS5uYW1lcyA9ICJ2YWx1ZSIsIGlkLnZhcnMgPSBjKCJpZCIpLCBmYWN0b3JzQXNTdHJpbmdzPUYpCmhlYWQoZCkKYGBgCiMjIyMjIEtlZXBpbmcgdGhlIGNvbHVtbiB0aGF0IGNvbnRhaW5zIGNvbW1lbnRzIG9ubHkgYW5kIHByZS1wcm9jZXNzaW5nIGNvbW1lbnRzOmNvbnZlcnRpbmcgdGVybXMgdG8gbG93ZXIgY2FzZSwgcmVtb3ZpbmcgcHVuY3R1YXRpb24sIGFuZCB3aGl0ZXNwYWNlCmBgYHtyfQp0ZXh0IDwtIGQkdmFsdWUgCnRleHQ8LSB0ZXh0WyFpcy5uYSh0ZXh0KV0KdGV4dCA8LSBnc3ViKCInIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1Yigi4oCZIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1Yigi4oCcIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1Yigi4oCdIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1Yigi4oCZcmUiLCAnJywgdGV4dCkKdGV4dCA8LSBnc3ViKCInIiwgJycsIHRleHQpCm0gPC0gQ29ycHVzKFZlY3RvclNvdXJjZSh0ZXh0KSkgIAptIDwtIHRtX21hcChtLCB0b2xvd2VyKQptIDwtIHRtX21hcChtLCByZW1vdmVQdW5jdHVhdGlvbikgCm0gPC0gdG1fbWFwKG0sIHN0cmlwV2hpdGVzcGFjZSkKYGBgCgojIyMjIyBSZW1vdmluZyB3b3JkcyB0aGF0IHdlIGRvbid0IHdhbnQgaW4gdGhlIGRhdGEgc2V0LCBpbiBhZGRpdGlvbiB0byB0aGUgZGVmYXVsdCB0ZXh0IG1pbmluZyBsaWJyYXJ5IHN0b3B3b3JkcyBsaXN0CmBgYHtyfQpteVN0b3BXb3JkcyA8LSBjKCJ0aGFuayIsICJ0aGFua3MiLCAidXNlIiwgInNlZSIsICJ1c2VkIiwgInZpYSIsICJhbXAiLCAia3lsbyIsICJiZW4iLCAic29sbyIsICJyZXkiLCAibGVpYSIsICJsdWtlIiwgInNreXdhbGtlciIsICJoYW4iLCAic29sbyIsICJjaGV3aWUiLCAiZGVhbiIsICJ3aW5jaGVzdGVyIiwgInNhbSIsICJjYXN0aWVsIiwgImxvdmUiLCAiaGVhcnQiLCAieWF5IiwgIm9tZyIsICJjYXB0YWluIiwgImFtZXJpY2EiLCAic3RldmUiLCAicm9nZXJzIiwgImJ1Y2t5IiwgImJhcm5lcyIsICJ3aW50ZXIiLCAic29sZGllciIsICJzYW0iLCAid2lsc29uIiwgInRvbnkiLCAi
c3RhcmsiLCAiaXJvbm1hbiIsICJmYWxjb24iLCAiYmxhY2siLCAid2lkb3ciLCAibmF0YXNoYSIsICJjbGludCIsICJzIiwgIm0iLCAiMyIsICJjYXMiLCAiY2FudCIsICJsaWtlIiwgIm11Y2giLCAicmVhbGx5IiwgImNhcyIsICJyZXlsbyIsICJiYWx0aGF6YXIiLCAiaW0iLCAiZCIpCm0gPC0gdG1fbWFwKG0sIHJlbW92ZVdvcmRzLCBjKHN0b3B3b3JkcygiZW5nbGlzaCIpLCBteVN0b3BXb3JkcykpIApgYGAKIyMjIyMgQ3JlYXRpbmcgYSB0ZXJtIGRvY3VtZW50IG1hdHJpeDogdGVybXMgaW4gcm93cyBhbmQgZG9jdW1lbnRzIGFzIGNvbHVtbnMKYGBge3J9CnRkbSA8LSBUZXJtRG9jdW1lbnRNYXRyaXgobSwgY29udHJvbCA9IGxpc3Qod29yZExlbmd0aHMgPSBjKDEsIEluZikpKQp0ZG0gPSByZW1vdmVTcGFyc2VUZXJtcyh0ZG0sIDAuOTk5KQp0ZG0KYGBgCiMjIyMjIEZpbmRpbmcgYXNzb2NpYXRlZCB0ZXJtcyB3aXRoIHRoZSB0ZXJtcyBvZiBpbnRlcmVzdCwgbGV2ZWwgb2YgY29ycmVsYXRpb24gaXMgc3BlY2lmaWVkIGFzIDAuMwpgYGB7cn0KZmluZEFzc29jcyh0ZG0sIGMoInR3aXR0ZXIiLCAidHVtYmxyIiwgImFvMyIsICJyZWFkaW5nIiwgIndyaXRpbmciLCAibGVhcm5pbmciLCAibG9va2luZyIsICJhcnQiLCAiZmFuYXJ0IiwgImRyZXNzZXMiLCAibXVsdGltZWRpYSIsICJtZXRhIiwgImZhbm9uIiwgImNhbm9uIiwgImRpc2NvcmQiLCAicGlsbG93Zm9ydCIsICJkcmVhbXdpZHRoIiwgImtvZmkiLCAiaW1hZ2luZSIsICJzZWUiLCAicmVjaXBlIiwgImxpbmtzIiwgIndlYnNpdGUiLCAid29va2llcGVkaWEiLCAiZXhwbGFpbiIsICJkZXNjcmliZSIsICJkZXNjcmlwdGlvbiIsICJ2aXN1YWxpemUiLCAiaGVhciIsICJzbWVsbCIsICJ0YXN0ZSIsICJ1bmRlcnN0YW5kIiwgImtub3ciKSwgY29ybGltaXQ9MC4zKQpgYGAKIyMjIyMgRXN0YWJsaXNoaW5nIGZyZXF1ZW50IHRlcm1zLiBUYWtpbmcgYSBzdWJzZXQgb2YgdGhvc2UgdGhhdCBhcHBlYXIgbW9yZSB0aGFuIGFuZCBlcXVhbCB0byAxLDAwMCB0aW1lcyAobW9yZSBjb21tZW50cyBhdmFpbGFibGUgdGhhbiBmb3Igb3RoZXIgdHdvIHdvcmtzKQpgYGB7cn0KKGZyZXEudGVybXMgPC0gZmluZEZyZXFUZXJtcyh0ZG0sIGxvd2ZyZXEgPSAxMDAwKSkgCnRlcm0uZnJlcSA8LSByb3dTdW1zKGFzLm1hdHJpeCh0ZG0pKSAgCnRlcm0uZnJlcSA8LSBzdWJzZXQodGVybS5mcmVxLCB0ZXJtLmZyZXEgPj0gMTAwMCkgCmBgYAojIyMjIyBDb252ZXJ0aW5nIGZyZXF1ZW50IHRlcm1zIGludG8gYSBkYXRhIGZyYW1lIGZvcm1hdCBmb3IgZWFzaWVyIHBsb3R0aW5nCmBgYHtyfQpkZiA8LSBkYXRhLmZyYW1lKHRlcm0gPSBuYW1lcyh0ZXJtLmZyZXEpLCBmcmVxID0gdGVybS5mcmVxKSAKZ2dwbG90KGRmLCBhZXMoeCA9IHJlb3JkZXIodGVybSwgLWZyZXEpLCB5ID0gZnJlcSkpICsgZ2VvbV9iYXIoc3RhdCA9ICJpZGVudGl0eSIpICsgIHRoZW1lKGF4aXMudGV4dC54PWVsZW1lbnRfdGV4dChh
bmdsZT00NSwgaGp1c3Q9MSkpCmBgYAojIyMjIEVzdGFibGlzaGluZyB3b3JkIGZyZXF1ZW5jeS4gU3BlY2lmaWVkIG1pbmltdW0gZnJlcXVlbmN5IGZvciB0aGUgd29yZCBjbG91ZCBwbG90IGlzIDEsMDAwCmBgYHtyfQp3bSA8LSBhcy5tYXRyaXgodGRtKSAKd29yZC5mcmVxIDwtIHNvcnQocm93U3Vtcyh3bSksIGRlY3JlYXNpbmcgPSBUKQpwYWwgPC0gYnJld2VyLnBhbCg5LCAiQnVHbiIpWy0oMTo0KV0Kd29yZGNsb3VkKHdvcmRzID0gbmFtZXMod29yZC5mcmVxKSwgZnJlcSA9IHdvcmQuZnJlcSwgbWluLmZyZXEgPSAxMDAwLCByYW5kb20ub3JkZXIgPSBGLCBjb2xvcnMgPSBwYWwpIApgYGAKIyMjIyMgVHJhbnNwb3NpbmcgdGhlIHRlcm0gZG9jdW1lbnQgbWF0cml4IGFuZCBjb252ZXJ0aW5nIGl0IGludG8gYSBkb2N1bWVudCB0ZXJtIG1hdHJpeCAoZG9jdW1lbnRzIGFzIHJvd3MgYW5kIGNvbHVtcyBhcyB0ZXJtcykKYGBge3J9CmR0bSA8LSBhcy5Eb2N1bWVudFRlcm1NYXRyaXgodGRtKSAKYGBgCiMjIyMjIHN1bW1pbmcgZnJlcXVlbmNpZXMgZm9yIGRvY3VtZW50cy9pbmRpdmlkdWFsIGNvbW1lbnRzIGFuZCBlbGltaW5hdGluZyB0aG9zZSB0aGF0IGRvIG5vdCBoYXZlIGFueSB0ZXJtcwpgYGB7cn0KI2R0bSA9IHJlbW92ZVNwYXJzZVRlcm1zKGR0bSwgMC45OSkKI2R0bQpyb3dfdG90YWwgPSBhcHBseShkdG0sIDEsIHN1bSkgCmR0bS5uZXcgPSBkdG1bcm93X3RvdGFsPjAsXSAKZHRtLm5ldwoKYGBgCiMjIyMjIFJ1bm5pbmcgdG9waWMgbW9kZWxpbmcgKExhdGVudCBEaXJpY2hsZXQgQWxsb2NhdGlvbikgb24gdGhlIGRvY3VtZW50IHRlcm0gbWF0cml4IGFuZCByZXF1ZXRpbmcgOCB0b3BpY3MuIFNwZWNpZnlpbmcgdGhhdCBlYWNoIHRvcGljIG1vZGVsIHNob3VsZCBjb25zaXN0IG9mIDEwIHdvcmRzLiAKYGBge3J9CmxkYSA8LSBMREEoZHRtLm5ldywgayA9IDgpIAp0ZXJtIDwtIHRlcm1zKGxkYSwgMTApIAp0ZXJtCmBgYAojIyMjIyA1MCBtb3N0IGZyZXF1ZW50IHdvcmRzIGFuZCB0aGVpciBmcmVxdWVuY2llcwpgYGB7cn0KZmluZE1vc3RGcmVxVGVybXMoZHRtLCBuID0gNTAsIElOREVYID0gcmVwKDEsIGR0bSRucm93KSlbWzFdXQpgYGAK