library(reshape2)
library(ggplot2)
library(tm)
Loading required package: NLP
Attaching package: 'NLP'
The following object is masked from 'package:ggplot2':
annotate
library(topicmodels)
library(RColorBrewer)
library(wordcloud)
Removing certain words that we don’t want in the data set, in addition to the default text mining library stopwords list
myStopWords <- c("thank", "thanks", "use", "see", "used", "via", "amp", "kylo", "ben", "solo", "rey", "leia", "luke", "skywalker", "han", "solo", "chewie", "dean", "winchester", "sam", "castiel", "love", "heart", "yay", "omg", "captain", "america", "steve", "rogers", "bucky", "barnes", "winter", "soldier", "sam", "wilson", "tony", "stark", "ironman", "falcon", "black", "widow", "natasha", "clint", "s", "m", "3", "cas", "cant", "like", "much", "really", "cas", "reylo", "balthazar", "im", "d")
m <- tm_map(m, removeWords, c(stopwords("english"), myStopWords))
Warning in tm_map.SimpleCorpus(m, removeWords, c(stopwords("english"),
myStopWords)): transformation drops documents
Creating a term document matrix: terms in rows and documents as columns
tdm <- TermDocumentMatrix(m, control = list(wordLengths = c(1, Inf)))
tdm
<<TermDocumentMatrix (terms: 3913, documents: 9128)>>
Non-/sparse entries: 16213/35701651
Sparsity : 100%
Maximal term length: 32
Weighting : term frequency (tf)
Finding associated terms with the terms of interest, level of correlation is specified as 0.3
findAssocs(tdm, c("twitter", "tumblr", "ao3", "reading", "writing", "learning", "looking", "art", "fanart", "dresses", "multimedia", "meta", "fanon", "canon", "discord", "pillowfort", "dreamwidth", "kofi", "imagine", "see", "recipe", "links", "website", "wookiepedia", "explain", "describe", "description", "visualize", "hear", "smell", "taste", "understand", "know"), corlimit=0.3)
$twitter
numeric(0)
$tumblr
noise pop pitched
0.50 0.50 0.35
$ao3
dumb
0.32
$reading
numeric(0)
$writing
alisa mwah ❤️ appreciate
0.44 0.43 0.35 0.35
$learning
numeric(0)
$looking
forward alisa mwah appreciate ❤️
0.83 0.64 0.63 0.54 0.52
$art
mentioning softest mastered boyswonder origamii
0.35 0.35 0.35 0.35 0.35
$fanart
numeric(0)
$dresses
numeric(0)
$multimedia
numeric(0)
$meta
numeric(0)
$fanon
numeric(0)
$canon
clapping besides delight
0.71 0.50 0.32
$discord
numeric(0)
$pillowfort
numeric(0)
$dreamwidth
numeric(0)
$kofi
numeric(0)
$imagine
relief drawn sparkly particularly pink
0.71 0.71 0.71 0.50 0.50
worried
0.35
$see
numeric(0)
$recipe
chicken soup
0.35 0.35
$links
numeric(0)
$website
numeric(0)
$wookiepedia
numeric(0)
$explain
caught guess check hopefully
0.45 0.35 0.33 0.32
$describe
fuzzies filled incredible words
0.50 0.35 0.32 0.31
$description
exact
0.71
$visualize
numeric(0)
$hear
numeric(0)
$smell
numeric(0)
$taste
numeric(0)
$understand
concern hits
0.50 0.35
$know
dont
0.34
Establishing frequent terms. Taking a subset of those that appear more than and equal to 50 times
(freq.terms <- findFreqTerms(tdm, lowfreq = 50))
[1] "loved" "now" "oh" "chapter" "good"
[6] "read" "man" "sweet" "just" "well"
[11] "aw" "better" "glad" "youre" "fic"
[16] "best" "one" "happy" "right" "christmas"
[21] "hope" "new" "story" "buckys" "amazing"
[26] "writing" "way" "forward" "looking" "will"
[31] "little" "make" "get" "life" "reading"
[36] "also" "dont" "even" "enjoying" "gonna"
[41] "going" "next" "time" "know" "can"
[46] "got" "♥" "think" "made" "ive"
[51] "still"
loved
now
oh
chapter
good
read
man
sweet
just
well
aw
better
glad
youre
fic
best
one
happy
right
christmas
hope
new
story
buckys
amazing
writing
way
forward
looking
will
little
make
get
life
reading
also
dont
even
enjoying
gonna
going
next
time
know
can
got
♥
think
made
ive
still
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq >= 50)
Establishing word frequency. Specified minimum frequency for the word cloud plot is 50
wm <- as.matrix(tdm)
word.freq <- sort(rowSums(wm), decreasing = T)
pal <- brewer.pal(9, "BuGn")[-(1:4)]
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 50, random.order = F, colors = pal)

Transposing the term document matrix and converting it into a document term matrix (documents as rows and colums as terms)
dtm <- as.DocumentTermMatrix(tdm)
Running topic modeling (Latent Dirichlet Allocation) on the document term matrix and requeting 8 topics. Specifying that each topic model should consist of 10 words.
lda <- LDA(dtm.new, k = 8)
term <- terms(lda, 10)
term
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6
[1,] "buckys" "just" "story" "chapter" "oh" "glad"
[2,] "make" "loved" "aw" "know" "happy" "youre"
[3,] "ive" "made" "now" "dont" "christmas" "writing"
[4,] "yes" "can" "right" "next" "well" "little"
[5,] "thing" "hes" "man" "time" "sweet" "forward"
[6,] "back" "say" "going" "even" "hope" "enjoying"
[7,] "go" "two" "♥" "think" "best" "looking"
[8,] "god" "chapters" "life" "every" "new" "still"
[9,] "ever" "getting" "great" "wait" "hard" "gonna"
[10,] "kudos" "makes" "theyre" "stella" "wonderful" "appreciate"
Topic 7 Topic 8
[1,] "fic" "reading"
[2,] "read" "one"
[3,] "will" "good"
[4,] "better" "get"
[5,] "also" "enjoyed"
[6,] "amazing" "got"
[7,] "perfect" "things"
[8,] "feel" "boys"
[9,] "cute" "write"
[10,] "nice" "advent"
buckys
make
ive
yes
thing
back
go
god
ever
kudos
just
loved
made
can
hes
say
two
chapters
getting
makes
story
aw
now
right
man
going
♥
life
great
theyre
chapter
know
dont
next
time
even
think
every
wait
stella
oh
happy
christmas
well
sweet
hope
best
new
hard
wonderful
glad
youre
writing
little
forward
enjoying
looking
still
gonna
appreciate
fic
read
will
better
also
amazing
perfect
feel
cute
nice
reading
one
good
get
enjoyed
got
things
boys
write
advent
50 most frequent words and their frequencies
findMostFreqTerms(dtm, n = 50, INDEX = rep(1, dtm$nrow))[[1]]
reading glad just fic chapter oh story
183 172 164 151 148 146 118
youre happy good aw writing one christmas
110 110 106 102 98 94 93
now know get well loved read will
89 89 84 82 81 81 81
dont little right sweet better hope next
76 75 74 73 72 70 70
buckys man going made also time way
66 64 64 64 63 62 61
best think can make even got forward
59 59 56 55 54 54 53
life enjoying ive looking still new amazing
53 53 52 51 51 50 50
gonna
50
LS0tCnRpdGxlOiAiY29tbWVudHNfd29yazIuUm1kIgphdXRob3I6ICJBbmEgTHVjaWMgJiBMYXVyZW4gUm91c2UiCmRhdGU6ICIxNCBNYXkgMjAxOSIKb3V0cHV0OiAiaHRtbF9ub3RlYm9vayIKLS0tCmBgYHtyfQpsaWJyYXJ5KHJlc2hhcGUyKQpsaWJyYXJ5KGdncGxvdDIpCmxpYnJhcnkodG0pIApsaWJyYXJ5KHRvcGljbW9kZWxzKSAKbGlicmFyeShSQ29sb3JCcmV3ZXIpCmxpYnJhcnkod29yZGNsb3VkKSAKYGBgCgojIyMjIyBSZWFkaW5nIHRoZSBmaWxlIHRoYXQgY29udGFpbnMgc2NyYXBlZCBjb21tZW50cyBhbmQgY29udmVydGluZyBpdCBmcm9tIHdpZGUgdG8gbG9uZyBmb3JtYXQKYGBge3J9CmMgPC0gcmVhZC5jc3YoIi9Vc2Vycy9hbmFsdWNpYy9Eb2N1bWVudHMvY29tbWVudHNfd29yazIuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycz1GQUxTRSkKZCA8LSBtZWx0KGMsIHZhcmlhYmxlLm5hbWUgPSAia2V5IiwgdmFsdWUubmFtZXMgPSAidmFsdWUiLCBpZC52YXJzID0gYygiaWQiKSwgZmFjdG9yc0FzU3RyaW5ncz1GKQpoZWFkKGQpCmBgYAojIyMjIyBLZWVwaW5nIHRoZSBjb2x1bW4gdGhhdCBjb250YWlucyBjb21tZW50cyBvbmx5IGFuZCBwcmUtcHJvY2Vzc2luZyBjb21tZW50czpjb252ZXJ0aW5nIHRlcm1zIHRvIGxvd2VyIGNhc2UsIHJlbW92aW5nIHB1bmN0dWF0aW9uLCBhbmQgd2hpdGVzcGFjZQpgYGB7cn0KdGV4dCA8LSBkJHZhbHVlIAp0ZXh0PC0gdGV4dFshaXMubmEodGV4dCldCnRleHQgPC0gZ3N1YigiJyIsICcnLCB0ZXh0KQp0ZXh0IDwtIGdzdWIoIuKAmSIsICcnLCB0ZXh0KQp0ZXh0IDwtIGdzdWIoIuKAnCIsICcnLCB0ZXh0KQp0ZXh0IDwtIGdzdWIoIuKAnSIsICcnLCB0ZXh0KQp0ZXh0IDwtIGdzdWIoIuKAmXJlIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1YigiJyIsICcnLCB0ZXh0KQptIDwtIENvcnB1cyhWZWN0b3JTb3VyY2UodGV4dCkpICAKbSA8LSB0bV9tYXAobSwgdG9sb3dlcikKbSA8LSB0bV9tYXAobSwgcmVtb3ZlUHVuY3R1YXRpb24pIAptIDwtIHRtX21hcChtLCBzdHJpcFdoaXRlc3BhY2UpCmBgYAoKIyMjIyMgUmVtb3ZpbmcgY2VydGFpbiB3b3JkcyB0aGF0IHdlIGRvbid0IHdhbnQgaW4gdGhlIGRhdGEgc2V0LCBpbiBhZGRpdGlvbiB0byB0aGUgZGVmYXVsdCB0ZXh0IG1pbmluZyBsaWJyYXJ5IHN0b3B3b3JkcyBsaXN0CmBgYHtyfQpteVN0b3BXb3JkcyA8LSBjKCJ0aGFuayIsICJ0aGFua3MiLCAidXNlIiwgInNlZSIsICJ1c2VkIiwgInZpYSIsICJhbXAiLCAia3lsbyIsICJiZW4iLCAic29sbyIsICJyZXkiLCAibGVpYSIsICJsdWtlIiwgInNreXdhbGtlciIsICJoYW4iLCAic29sbyIsICJjaGV3aWUiLCAiZGVhbiIsICJ3aW5jaGVzdGVyIiwgInNhbSIsICJjYXN0aWVsIiwgImxvdmUiLCAiaGVhcnQiLCAieWF5IiwgIm9tZyIsICJjYXB0YWluIiwgImFtZXJpY2EiLCAic3RldmUiLCAicm9nZXJzIiwgImJ1Y2t5IiwgImJhcm5lcyIsICJ3aW50ZXIiLCAic29sZGllciIsICJzYW0iLCAid2lsc29uIiwgInRvbnkiLCAic3RhcmsiLCAiaXJvbm1hbiIsICJmYWxjb24iLCAiYmxhY2siLCAid2lkb3ciLCAibmF0YXNoYSIsICJjbGludCIsICJzIiwgIm0iLCAiMyIsICJjYXMiLCAiY2FudCIsICJsaWtlIiwgIm11Y2giLCAicmVhbGx5IiwgImNhcyIsICJyZXlsbyIsICJiYWx0aGF6YXIiLCAiaW0iLCAiZCIpCm0gPC0gdG1fbWFwKG0sIHJlbW92ZVdvcmRzLCBjKHN0b3B3b3JkcygiZW5nbGlzaCIpLCBteVN0b3BXb3JkcykpIApgYGAKIyMjIyMgQ3JlYXRpbmcgYSB0ZXJtIGRvY3VtZW50IG1hdHJpeDogdGVybXMgaW4gcm93cyBhbmQgZG9jdW1lbnRzIGFzIGNvbHVtbnMKYGBge3J9CnRkbSA8LSBUZXJtRG9jdW1lbnRNYXRyaXgobSwgY29udHJvbCA9IGxpc3Qod29yZExlbmd0aHMgPSBjKDEsIEluZikpKQp0ZG0KYGBgCiMjIyMjIEZpbmRpbmcgYXNzb2NpYXRlZCB0ZXJtcyB3aXRoIHRoZSB0ZXJtcyBvZiBpbnRlcmVzdCwgbGV2ZWwgb2YgY29ycmVsYXRpb24gaXMgc3BlY2lmaWVkIGFzIDAuMwpgYGB7cn0KZmluZEFzc29jcyh0ZG0sIGMoInR3aXR0ZXIiLCAidHVtYmxyIiwgImFvMyIsICJyZWFkaW5nIiwgIndyaXRpbmciLCAibGVhcm5pbmciLCAibG9va2luZyIsICJhcnQiLCAiZmFuYXJ0IiwgImRyZXNzZXMiLCAibXVsdGltZWRpYSIsICJtZXRhIiwgImZhbm9uIiwgImNhbm9uIiwgImRpc2NvcmQiLCAicGlsbG93Zm9ydCIsICJkcmVhbXdpZHRoIiwgImtvZmkiLCAiaW1hZ2luZSIsICJzZWUiLCAicmVjaXBlIiwgImxpbmtzIiwgIndlYnNpdGUiLCAid29va2llcGVkaWEiLCAiZXhwbGFpbiIsICJkZXNjcmliZSIsICJkZXNjcmlwdGlvbiIsICJ2aXN1YWxpemUiLCAiaGVhciIsICJzbWVsbCIsICJ0YXN0ZSIsICJ1bmRlcnN0YW5kIiwgImtub3ciKSwgY29ybGltaXQ9MC4zKQpgYGAKIyMjIyMgRXN0YWJsaXNoaW5nIGZyZXF1ZW50IHRlcm1zLiBUYWtpbmcgYSBzdWJzZXQgb2YgdGhvc2UgdGhhdCBhcHBlYXIgbW9yZSB0aGFuIGFuZCBlcXVhbCB0byA1MCB0aW1lcwpgYGB7cn0KKGZyZXEudGVybXMgPC0gZmluZEZyZXFUZXJtcyh0ZG0sIGxvd2ZyZXEgPSA1MCkpIAp0ZXJtLmZyZXEgPC0gcm93U3Vtcyhhcy5tYXRyaXgodGRtKSkgIAp0ZXJtLmZyZXEgPC0gc3Vic2V0KHRlcm0uZnJlcSwgdGVybS5mcmVxID49IDUwKSAKYGBgCiMjIyMjIENvbnZlcnRpbmcgZnJlcXVlbnQgdGVybXMgaW50byBhIGRhdGEgZnJhbWUgZm9ybWF0IGZvciBlYXNpZXIgcGxvdHRpbmcKYGBge3J9CmRmIDwtIGRhdGEuZnJhbWUodGVybSA9IG5hbWVzKHRlcm0uZnJlcSksIGZyZXEgPSB0ZXJtLmZyZXEpIApnZ3Bsb3QoZGYsIGFlcyh4ID0gcmVvcmRlcih0ZXJtLCAtZnJlcSksIHkgPSBmcmVxKSkgKyBnZW9tX2JhcihzdGF0ID0gImlkZW50aXR5IikgKyAgdGhlbWUoYXhpcy50ZXh0Lng9ZWxlbWVudF90ZXh0KGFuZ2xlPTQ1LCBoanVzdD0xKSkKYGBgCiMjIyMgRXN0YWJsaXNoaW5nIHdvcmQgZnJlcXVlbmN5LiBTcGVjaWZpZWQgbWluaW11bSBmcmVxdWVuY3kgZm9yIHRoZSB3b3JkIGNsb3VkIHBsb3QgaXMgNTAKYGBge3J9CndtIDwtIGFzLm1hdHJpeCh0ZG0pIAp3b3JkLmZyZXEgPC0gc29ydChyb3dTdW1zKHdtKSwgZGVjcmVhc2luZyA9IFQpCnBhbCA8LSBicmV3ZXIucGFsKDksICJCdUduIilbLSgxOjQpXQp3b3JkY2xvdWQod29yZHMgPSBuYW1lcyh3b3JkLmZyZXEpLCBmcmVxID0gd29yZC5mcmVxLCBtaW4uZnJlcSA9IDUwLCByYW5kb20ub3JkZXIgPSBGLCBjb2xvcnMgPSBwYWwpIApgYGAKIyMjIyMgVHJhbnNwb3NpbmcgdGhlIHRlcm0gZG9jdW1lbnQgbWF0cml4IGFuZCBjb252ZXJ0aW5nIGl0IGludG8gYSBkb2N1bWVudCB0ZXJtIG1hdHJpeCAoZG9jdW1lbnRzIGFzIHJvd3MgYW5kIGNvbHVtcyBhcyB0ZXJtcykKYGBge3J9CmR0bSA8LSBhcy5Eb2N1bWVudFRlcm1NYXRyaXgodGRtKSAKYGBgCiMjIyMjIHN1bW1pbmcgZnJlcXVlbmNpZXMgZm9yIGRvY3VtZW50cy9pbmRpdmlkdWFsIGNvbW1lbnRzIGFuZCBlbGltaW5hdGluZyB0aG9zZSB0aGF0IGRvIG5vdCBoYXZlIGFueSB0ZXJtcwpgYGB7cn0KI2R0bSA9IHJlbW92ZVNwYXJzZVRlcm1zKGR0bSwgMC45OSkKI2R0bQpyb3dfdG90YWwgPSBhcHBseShkdG0sIDEsIHN1bSkgCmR0bS5uZXcgPSBkdG1bcm93X3RvdGFsPjAsXSAKZHRtLm5ldwoKYGBgCiMjIyMjIFJ1bm5pbmcgdG9waWMgbW9kZWxpbmcgKExhdGVudCBEaXJpY2hsZXQgQWxsb2NhdGlvbikgb24gdGhlIGRvY3VtZW50IHRlcm0gbWF0cml4IGFuZCByZXF1ZXRpbmcgOCB0b3BpY3MuIFNwZWNpZnlpbmcgdGhhdCBlYWNoIHRvcGljIG1vZGVsIHNob3VsZCBjb25zaXN0IG9mIDEwIHdvcmRzLgpgYGB7cn0KbGRhIDwtIExEQShkdG0ubmV3LCBrID0gOCkgCnRlcm0gPC0gdGVybXMobGRhLCAxMCkgCnRlcm0KYGBgCiMjIyMjIDUwIG1vc3QgZnJlcXVlbnQgd29yZHMgYW5kIHRoZWlyIGZyZXF1ZW5jaWVzCmBgYHtyfQpmaW5kTW9zdEZyZXFUZXJtcyhkdG0sIG4gPSA1MCwgSU5ERVggPSByZXAoMSwgZHRtJG5yb3cpKVtbMV1dCmBgYAoK