library(reshape2)
library(ggplot2)
library(tm)
Loading required package: NLP
Attaching package: 'NLP'
The following object is masked from 'package:ggplot2':
annotate
library(topicmodels)
library(RColorBrewer)
library(wordcloud)
Removing words that we don’t want in the data set, in addition to the default text mining library stopwords list
# Custom stop words removed in addition to tm's English list: fandom
# character names, generic filler words and stray one-character tokens.
# Words that appeared twice in the original list ("solo", "sam", "cas") are
# listed once; duplicates have no effect on which words get removed.
myStopWords <- c(
  "thank", "thanks", "use", "see", "used", "via", "amp",
  "kylo", "ben", "solo", "rey", "leia", "luke", "skywalker", "han", "chewie",
  "dean", "winchester", "sam", "castiel",
  "love", "heart", "yay", "omg",
  "captain", "america", "steve", "rogers", "bucky", "barnes", "winter",
  "soldier", "wilson", "tony", "stark", "ironman", "falcon", "black", "widow",
  "natasha", "clint",
  "s", "m", "3", "cas", "cant", "like", "much", "really", "reylo",
  "balthazar", "im", "d"
)
# Strip both stop-word lists from every document in the corpus `m`.
m <- tm_map(m, removeWords, c(stopwords("english"), myStopWords))
Warning in tm_map.SimpleCorpus(m, removeWords, c(stopwords("english"),
myStopWords)): transformation drops documents
Creating a term document matrix: terms in rows and documents as columns
# Build the term-document matrix from the corpus `m`.
# wordLengths = c(1, Inf) keeps tokens of every length rather than applying
# tm's default minimum word length.
tdm <- TermDocumentMatrix(m, control = list(wordLengths = c(1, Inf)))
# Drop terms whose sparsity exceeds 0.999 (terms found in very few documents).
tdm <- removeSparseTerms(tdm, 0.999)
# Print a summary of the reduced matrix.
tdm
<<TermDocumentMatrix (terms: 220, documents: 226320)>>
Non-/sparse entries: 118745/49671655
Sparsity : 100%
Maximal term length: 12
Weighting : term frequency (tf)
Finding terms associated with the terms of interest; the minimum correlation is set to 0.3
# For each query term, list the terms in `tdm` whose correlation with it is
# at least 0.3. numeric(0) means nothing reached the limit.
# NOTE(review): "see" is also in myStopWords, so it is removed from the corpus
# before `tdm` is built and should always come back empty here.
findAssocs(
  tdm,
  c(
    "twitter", "tumblr", "ao3", "reading", "writing", "learning", "looking",
    "art", "fanart", "dresses", "multimedia", "meta", "fanon", "canon",
    "discord", "pillowfort", "dreamwidth", "kofi", "imagine", "see",
    "recipe", "links", "website", "wookiepedia", "explain", "describe",
    "description", "visualize", "hear", "smell", "taste", "understand",
    "know"
  ),
  corlimit = 0.3
)
$twitter
numeric(0)
$tumblr
numeric(0)
$ao3
numeric(0)
$reading
commenting words kind
0.60 0.31 0.30
$writing
numeric(0)
$learning
numeric(0)
$looking
forward
0.67
$art
numeric(0)
$fanart
numeric(0)
$dresses
numeric(0)
$multimedia
numeric(0)
$meta
numeric(0)
$fanon
numeric(0)
$canon
numeric(0)
$discord
numeric(0)
$pillowfort
numeric(0)
$dreamwidth
numeric(0)
$kofi
numeric(0)
$imagine
numeric(0)
$see
numeric(0)
$recipe
numeric(0)
$links
numeric(0)
$website
numeric(0)
$wookiepedia
numeric(0)
$explain
numeric(0)
$describe
numeric(0)
$description
numeric(0)
$visualize
numeric(0)
$hear
numeric(0)
$smell
numeric(0)
$taste
numeric(0)
$understand
numeric(0)
$know
numeric(0)
Establishing word frequency. Specified minimum frequency for the word cloud plot is 1,000
# Total frequency of each term across all documents, most frequent first.
# NOTE(review): as.matrix() expands the sparse term-document matrix into a
# dense terms x documents matrix (about 50 million cells for the matrix
# printed above); consider a sparse row sum if memory becomes a problem.
wm <- as.matrix(tdm)
word.freq <- sort(rowSums(wm), decreasing = TRUE)
# Take the 9-colour "BuGn" palette and drop its first four colours.
pal <- brewer.pal(9, "BuGn")[-(1:4)]
# Draw only terms that occur at least 1,000 times; random.order = FALSE
# places words in decreasing order of frequency.
wordcloud(
  words = names(word.freq),
  freq = word.freq,
  min.freq = 1000,
  random.order = FALSE,
  colors = pal
)

Transposing the term document matrix and converting it into a document term matrix (documents as rows and terms as columns)
# Re-express `tdm` as a document-term matrix: documents in rows, terms in
# columns.
dtm <- as.DocumentTermMatrix(tdm)
Running topic modeling (Latent Dirichlet Allocation) on the document term matrix and requesting 8 topics. Then listing the 10 most probable terms for each topic.
# Fit an LDA topic model with 8 topics.
# NOTE(review): `dtm.new` is not defined in the visible part of this file;
# presumably it is `dtm` with empty documents removed. Confirm it exists
# before this step.
lda <- LDA(dtm.new, k = 8)
# Extract the top 10 terms of each topic and print them as a matrix with one
# column per topic.
term <- terms(lda, 10)
term
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6
[1,] "will" "just" "chapter" "chapter" "will" "one"
[2,] "just" "fic" "just" "snoke" "reading" "two"
[3,] "also" "chapter" "also" "reading" "chapter" "fic"
[4,] "oh" "will" "oh" "will" "next" "way"
[5,] "going" "back" "think" "going" "one" "just"
[6,] "next" "writing" "youre" "wait" "need" "will"
[7,] "writing" "also" "wait" "time" "dont" "read"
[8,] "things" "now" "well" "well" "oh" "reys"
[9,] "reys" "dont" "story" "youre" "end" "hope"
[10,] "time" "know" "next" "happy" "get" "still"
Topic 7 Topic 8
[1,] "chapter" "please"
[2,] "reading" "just"
[3,] "right" "now"
[4,] "oh" "story"
[5,] "think" "time"
[6,] "well" "hope"
[7,] "time" "going"
[8,] "just" "one"
[9,] "commenting" "update"
[10,] "soon" "take"
will
just
also
oh
going
next
writing
things
reys
time
just
fic
chapter
will
back
writing
also
now
dont
know
chapter
just
also
oh
think
youre
wait
well
story
next
chapter
snoke
reading
will
going
wait
time
well
youre
happy
will
reading
chapter
next
one
need
dont
oh
end
get
one
two
fic
way
just
will
read
reys
hope
still
chapter
reading
right
oh
think
well
time
just
commenting
soon
please
just
now
story
time
hope
going
one
update
take
50 most frequent words and their frequencies
# Pool every document into one group (INDEX is 1 for each row of `dtm`) and
# return the 50 most frequent terms of that group with their counts.
findMostFreqTerms(
  dtm,
  n = 50,
  INDEX = rep(1, nrow(dtm))
)[[1]]
chapter just will reading also oh
3444 3101 2203 2026 1640 1590
one going wait now story get
1564 1491 1438 1431 1426 1399
time good know update can next
1361 1360 1321 1238 1231 1231
read think well reys please fic
1221 1161 1136 1130 1089 1079
way dont even always amazing hope
1057 1031 1018 1003 990 986
happy loved youre writing great want
928 896 882 878 849 841
kind snoke feel first two glad
832 829 815 806 804 782
words still commenting make right back
769 759 758 741 734 716
kylos beautiful
700 695
LS0tCnRpdGxlOiAiY29tbWVudHNfd29yazMuUm1kIgphdXRob3I6ICJBbmEgTHVjaWMgJiBMYXVyZW4gUm91c2UiCmRhdGU6ICIxNCBNYXkgMjAxOSIKb3V0cHV0OiAiaHRtbF9ub3RlYm9vayIKLS0tCgpgYGB7cn0KbGlicmFyeShyZXNoYXBlMikKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KHRtKSAKbGlicmFyeSh0b3BpY21vZGVscykgCmxpYnJhcnkoUkNvbG9yQnJld2VyKQpsaWJyYXJ5KHdvcmRjbG91ZCkgCmBgYAoKIyMjIyMgUmVhZGluZyB0aGUgZmlsZSB0aGF0IGNvbnRhaW5zIGNvbW1lbnRzIGFuZCBjb252ZXJ0aW5nIGl0IGZyb20gd2lkZSB0byBsb25nIGZvcm1hdApgYGB7cn0KYyA8LSByZWFkLmNzdigiL1VzZXJzL2FuYWx1Y2ljL0RvY3VtZW50cy9jb21tZW50c193b3JrMy5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzPUZBTFNFKQpkIDwtIG1lbHQoYywgdmFyaWFibGUubmFtZSA9ICJrZXkiLCB2YWx1ZS5uYW1lcyA9ICJ2YWx1ZSIsIGlkLnZhcnMgPSBjKCJpZCIpLCBmYWN0b3JzQXNTdHJpbmdzPUYpCmhlYWQoZCkKYGBgCiMjIyMjIEtlZXBpbmcgdGhlIGNvbHVtbiB0aGF0IGNvbnRhaW5zIGNvbW1lbnRzIG9ubHkgYW5kIHByZS1wcm9jZXNzaW5nIGNvbW1lbnRzOmNvbnZlcnRpbmcgdGVybXMgdG8gbG93ZXIgY2FzZSwgcmVtb3ZpbmcgcHVuY3R1YXRpb24sIGFuZCB3aGl0ZXNwYWNlCmBgYHtyfQp0ZXh0IDwtIGQkdmFsdWUgCnRleHQ8LSB0ZXh0WyFpcy5uYSh0ZXh0KV0KdGV4dCA8LSBnc3ViKCInIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1Yigi4oCZIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1Yigi4oCcIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1Yigi4oCdIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1Yigi4oCZcmUiLCAnJywgdGV4dCkKdGV4dCA8LSBnc3ViKCInIiwgJycsIHRleHQpCm0gPC0gQ29ycHVzKFZlY3RvclNvdXJjZSh0ZXh0KSkgIAptIDwtIHRtX21hcChtLCB0b2xvd2VyKQptIDwtIHRtX21hcChtLCByZW1vdmVQdW5jdHVhdGlvbikgCm0gPC0gdG1fbWFwKG0sIHN0cmlwV2hpdGVzcGFjZSkKYGBgCgojIyMjIyBSZW1vdmluZyB3b3JkcyB0aGF0IHdlIGRvbid0IHdhbnQgaW4gdGhlIGRhdGEgc2V0LCBpbiBhZGRpdGlvbiB0byB0aGUgZGVmYXVsdCB0ZXh0IG1pbmluZyBsaWJyYXJ5IHN0b3B3b3JkcyBsaXN0CmBgYHtyfQpteVN0b3BXb3JkcyA8LSBjKCJ0aGFuayIsICJ0aGFua3MiLCAidXNlIiwgInNlZSIsICJ1c2VkIiwgInZpYSIsICJhbXAiLCAia3lsbyIsICJiZW4iLCAic29sbyIsICJyZXkiLCAibGVpYSIsICJsdWtlIiwgInNreXdhbGtlciIsICJoYW4iLCAic29sbyIsICJjaGV3aWUiLCAiZGVhbiIsICJ3aW5jaGVzdGVyIiwgInNhbSIsICJjYXN0aWVsIiwgImxvdmUiLCAiaGVhcnQiLCAieWF5IiwgIm9tZyIsICJjYXB0YWluIiwgImFtZXJpY2EiLCAic3RldmUiLCAicm9nZXJzIiwgImJ1Y2t5IiwgImJhcm5lcyIsICJ3aW50ZXIiLCAic29sZGllciIsICJzYW0iLCAid2lsc29uIiwgInRvbnkiLCAi
c3RhcmsiLCAiaXJvbm1hbiIsICJmYWxjb24iLCAiYmxhY2siLCAid2lkb3ciLCAibmF0YXNoYSIsICJjbGludCIsICJzIiwgIm0iLCAiMyIsICJjYXMiLCAiY2FudCIsICJsaWtlIiwgIm11Y2giLCAicmVhbGx5IiwgImNhcyIsICJyZXlsbyIsICJiYWx0aGF6YXIiLCAiaW0iLCAiZCIpCm0gPC0gdG1fbWFwKG0sIHJlbW92ZVdvcmRzLCBjKHN0b3B3b3JkcygiZW5nbGlzaCIpLCBteVN0b3BXb3JkcykpIApgYGAKIyMjIyMgQ3JlYXRpbmcgYSB0ZXJtIGRvY3VtZW50IG1hdHJpeDogdGVybXMgaW4gcm93cyBhbmQgZG9jdW1lbnRzIGFzIGNvbHVtbnMKYGBge3J9CnRkbSA8LSBUZXJtRG9jdW1lbnRNYXRyaXgobSwgY29udHJvbCA9IGxpc3Qod29yZExlbmd0aHMgPSBjKDEsIEluZikpKQp0ZG0gPSByZW1vdmVTcGFyc2VUZXJtcyh0ZG0sIDAuOTk5KQp0ZG0KYGBgCiMjIyMjIEZpbmRpbmcgYXNzb2NpYXRlZCB0ZXJtcyB3aXRoIHRoZSB0ZXJtcyBvZiBpbnRlcmVzdCwgbGV2ZWwgb2YgY29ycmVsYXRpb24gaXMgc3BlY2lmaWVkIGFzIDAuMwpgYGB7cn0KZmluZEFzc29jcyh0ZG0sIGMoInR3aXR0ZXIiLCAidHVtYmxyIiwgImFvMyIsICJyZWFkaW5nIiwgIndyaXRpbmciLCAibGVhcm5pbmciLCAibG9va2luZyIsICJhcnQiLCAiZmFuYXJ0IiwgImRyZXNzZXMiLCAibXVsdGltZWRpYSIsICJtZXRhIiwgImZhbm9uIiwgImNhbm9uIiwgImRpc2NvcmQiLCAicGlsbG93Zm9ydCIsICJkcmVhbXdpZHRoIiwgImtvZmkiLCAiaW1hZ2luZSIsICJzZWUiLCAicmVjaXBlIiwgImxpbmtzIiwgIndlYnNpdGUiLCAid29va2llcGVkaWEiLCAiZXhwbGFpbiIsICJkZXNjcmliZSIsICJkZXNjcmlwdGlvbiIsICJ2aXN1YWxpemUiLCAiaGVhciIsICJzbWVsbCIsICJ0YXN0ZSIsICJ1bmRlcnN0YW5kIiwgImtub3ciKSwgY29ybGltaXQ9MC4zKQpgYGAKIyMjIyMgRXN0YWJsaXNoaW5nIGZyZXF1ZW50IHRlcm1zLiBUYWtpbmcgYSBzdWJzZXQgb2YgdGhvc2UgdGhhdCBhcHBlYXIgbW9yZSB0aGFuIGFuZCBlcXVhbCB0byAxLDAwMCB0aW1lcyAobW9yZSBjb21tZW50cyBhdmFpbGFibGUgdGhhbiBmb3Igb3RoZXIgdHdvIHdvcmtzKQpgYGB7cn0KKGZyZXEudGVybXMgPC0gZmluZEZyZXFUZXJtcyh0ZG0sIGxvd2ZyZXEgPSAxMDAwKSkgCnRlcm0uZnJlcSA8LSByb3dTdW1zKGFzLm1hdHJpeCh0ZG0pKSAgCnRlcm0uZnJlcSA8LSBzdWJzZXQodGVybS5mcmVxLCB0ZXJtLmZyZXEgPj0gMTAwMCkgCmBgYAojIyMjIyBDb252ZXJ0aW5nIGZyZXF1ZW50IHRlcm1zIGludG8gYSBkYXRhIGZyYW1lIGZvcm1hdCBmb3IgZWFzaWVyIHBsb3R0aW5nCmBgYHtyfQpkZiA8LSBkYXRhLmZyYW1lKHRlcm0gPSBuYW1lcyh0ZXJtLmZyZXEpLCBmcmVxID0gdGVybS5mcmVxKSAKZ2dwbG90KGRmLCBhZXMoeCA9IHJlb3JkZXIodGVybSwgLWZyZXEpLCB5ID0gZnJlcSkpICsgZ2VvbV9iYXIoc3RhdCA9ICJpZGVudGl0eSIpICsgIHRoZW1lKGF4aXMudGV4dC54PWVsZW1lbnRfdGV4dChh
bmdsZT00NSwgaGp1c3Q9MSkpCmBgYAojIyMjIEVzdGFibGlzaGluZyB3b3JkIGZyZXF1ZW5jeS4gU3BlY2lmaWVkIG1pbmltdW0gZnJlcXVlbmN5IGZvciB0aGUgd29yZCBjbG91ZCBwbG90IGlzIDEsMDAwCmBgYHtyfQp3bSA8LSBhcy5tYXRyaXgodGRtKSAKd29yZC5mcmVxIDwtIHNvcnQocm93U3Vtcyh3bSksIGRlY3JlYXNpbmcgPSBUKQpwYWwgPC0gYnJld2VyLnBhbCg5LCAiQnVHbiIpWy0oMTo0KV0Kd29yZGNsb3VkKHdvcmRzID0gbmFtZXMod29yZC5mcmVxKSwgZnJlcSA9IHdvcmQuZnJlcSwgbWluLmZyZXEgPSAxMDAwLCByYW5kb20ub3JkZXIgPSBGLCBjb2xvcnMgPSBwYWwpIApgYGAKIyMjIyMgVHJhbnNwb3NpbmcgdGhlIHRlcm0gZG9jdW1lbnQgbWF0cml4IGFuZCBjb252ZXJ0aW5nIGl0IGludG8gYSBkb2N1bWVudCB0ZXJtIG1hdHJpeCAoZG9jdW1lbnRzIGFzIHJvd3MgYW5kIGNvbHVtcyBhcyB0ZXJtcykKYGBge3J9CmR0bSA8LSBhcy5Eb2N1bWVudFRlcm1NYXRyaXgodGRtKSAKYGBgCiMjIyMjIHN1bW1pbmcgZnJlcXVlbmNpZXMgZm9yIGRvY3VtZW50cy9pbmRpdmlkdWFsIGNvbW1lbnRzIGFuZCBlbGltaW5hdGluZyB0aG9zZSB0aGF0IGRvIG5vdCBoYXZlIGFueSB0ZXJtcwpgYGB7cn0KI2R0bSA9IHJlbW92ZVNwYXJzZVRlcm1zKGR0bSwgMC45OSkKI2R0bQpyb3dfdG90YWwgPSBhcHBseShkdG0sIDEsIHN1bSkgCmR0bS5uZXcgPSBkdG1bcm93X3RvdGFsPjAsXSAKZHRtLm5ldwoKYGBgCiMjIyMjIFJ1bm5pbmcgdG9waWMgbW9kZWxpbmcgKExhdGVudCBEaXJpY2hsZXQgQWxsb2NhdGlvbikgb24gdGhlIGRvY3VtZW50IHRlcm0gbWF0cml4IGFuZCByZXF1ZXRpbmcgOCB0b3BpY3MuIFNwZWNpZnlpbmcgdGhhdCBlYWNoIHRvcGljIG1vZGVsIHNob3VsZCBjb25zaXN0IG9mIDEwIHdvcmRzLiAKYGBge3J9CmxkYSA8LSBMREEoZHRtLm5ldywgayA9IDgpIAp0ZXJtIDwtIHRlcm1zKGxkYSwgMTApIAp0ZXJtCmBgYAojIyMjIyA1MCBtb3N0IGZyZXF1ZW50IHdvcmRzIGFuZCB0aGVpciBmcmVxdWVuY2llcwpgYGB7cn0KZmluZE1vc3RGcmVxVGVybXMoZHRtLCBuID0gNTAsIElOREVYID0gcmVwKDEsIGR0bSRucm93KSlbWzFdXQpgYGAK