library(reshape2)
library(ggplot2)
library(tm)
Loading required package: NLP
Attaching package: 'NLP'
The following object is masked from 'package:ggplot2':
annotate
library(topicmodels)
library(RColorBrewer)
library(wordcloud)
Removing certain words that we don’t want in the data set, in addition to the default text mining library stopwords list
myStopWords <- c("thank", "thanks", "use", "see", "used", "via", "amp", "kylo", "ben", "solo", "rey", "leia", "luke", "skywalker", "han", "solo", "chewie", "dean", "winchester", "sam", "castiel", "love", "heart", "yay", "omg", "captain", "america", "steve", "rogers", "bucky", "barnes", "winter", "soldier", "sam", "wilson", "tony", "stark", "ironman", "falcon", "black", "widow", "natasha", "clint", "'", "s", "m", "3", "cas", "cant", "dont", "like", "much", "really", "cas", "reylo", "balthazar", "im", "d", "make", "makes", "get")
m <- tm_map(m, removeWords, c(stopwords("english"), myStopWords))
Warning in tm_map.SimpleCorpus(m, removeWords, c(stopwords("english"),
myStopWords)): transformation drops documents
Creating a term document matrix: terms in rows and documents as columns
tdm <- TermDocumentMatrix(m, control = list(wordLengths = c(1, Inf)))
tdm
<<TermDocumentMatrix (terms: 2108, documents: 1830)>>
Non-/sparse entries: 4701/3852939
Sparsity : 100%
Maximal term length: 36
Weighting : term frequency (tf)
Finding associated terms with the terms of interest, level of correlation is specified as 0.3
findAssocs(tdm, c("twitter", "tumblr", "ao3", "reading", "writing", "learning", "looking", "art", "fanart", "dresses", "multimedia", "meta", "fanon", "canon", "discord", "pillowfort", "dreamwidth", "kofi", "imagine", "see", "recipe", "links", "website", "wookiepedia", "explain", "describe", "description", "visualize", "hear", "smell", "taste", "understand", "know"), corlimit=0.3)
$twitter
numeric(0)
$tumblr
numeric(0)
$ao3
numeric(0)
$reading
numeric(0)
$writing
domesticity exercise
0.41 0.41
$learning
numeric(0)
$looking
forward spun sugar
0.53 0.50 0.50
glass pattern saw
0.41 0.35 0.35
streetdoccas ending folks
0.35 0.35 0.35
woot delighted shadowrunsupernatural
0.35 0.35 0.35
made
0.32
$art
lucky pimmys amazing ageing amiss amount gotten indecent
0.43 0.43 0.38 0.30 0.30 0.30 0.30 0.30
mécène spent casting map— spell flames trinkets
0.30 0.30 0.30 0.30 0.30 0.30 0.30
$fanart
numeric(0)
$dresses
numeric(0)
$multimedia
numeric(0)
$meta
numeric(0)
$fanon
accepted grumpy meat planet absolute morning throwing person
1.00 1.00 1.00 1.00 0.71 0.58 0.58 0.50
least
0.38
$canon
moose squirrel recognize
0.41 0.41 0.33
$discord
ago conversation ho quite stopped
1.00 1.00 1.00 0.71 0.71
watching hot broom morning boy
0.71 0.71 0.71 0.58 0.58
spn damn closet ive part
0.58 0.58 0.58 0.50 0.35
long time
0.32 0.32
$pillowfort
numeric(0)
$dreamwidth
numeric(0)
$kofi
numeric(0)
$imagine
dealing streets within environment interactions
1.00 1.00 1.00 0.71 0.71
brilliant giving perfectly us everyone
0.58 0.58 0.45 0.38 0.35
moving cool
0.32 0.30
$see
numeric(0)
$recipe
numeric(0)
$links
numeric(0)
$website
numeric(0)
$wookiepedia
numeric(0)
$explain
numeric(0)
$describe
numeric(0)
$description
warm
0.33
$visualize
numeric(0)
$hear
numeric(0)
$smell
numeric(0)
$taste
alcohol chasing mmmmmmm spiced woody edge
1.00 1.00 1.00 1.00 1.00 0.58
$understand
history
0.58
$know
itll tomorrow
0.31 0.31
Establishing frequent terms. Taking a subset of terms that appear more than and equal to 15 times
(freq.terms <- findFreqTerms(tdm, lowfreq = 15))
[1] "bit" "world" "glad" "people" "way"
[6] "just" "didnt" "know" "still" "said"
[11] "chapter" "well" "youre" "back" "one"
[16] "reading" "amazing" "made" "poor" "thats"
[21] "story" "done" "things" "good" "even"
[26] "now" "little" "oh" "beautiful" "lot"
[31] "think" "great" "man"
bit
world
glad
people
way
just
didnt
know
still
said
chapter
well
youre
back
one
reading
amazing
made
poor
thats
story
done
things
good
even
now
little
oh
beautiful
lot
think
great
man
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq >= 15)
Establishing word frequency. Minimum term frequency for the word cloud plot is 20
wm <- as.matrix(tdm)
word.freq <- sort(rowSums(wm), decreasing = T)
pal <- brewer.pal(9, "BuGn")[-(1:4)]
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 20, random.order = F, colors = pal)

Transposing the term document matrix and converting it into a document term matrix (documents as rows and colums as terms)
dtm <- as.DocumentTermMatrix(tdm)
Running topic modeling (Latent Dirichlet Allocation) on the document term matrix and requesting 8 topics. Specifying that each topic model should consist of 10 words.
lda <- LDA(dtm.new, k = 8)
term <- terms(lda, 10)
term
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6
[1,] "still" "good" "made" "glad" "know" "said"
[2,] "lot" "well" "things" "people" "read" "wanted"
[3,] "little" "beautiful" "enough" "worked" "didnt" "work"
[4,] "even" "thats" "think" "bit" "interesting" "kids"
[5,] "just" "worldbuilding" "felt" "many" "world" "bad"
[6,] "story" "looked" "looking" "write" "deans" "writing"
[7,] "one" "descriptions" "scene" "thing" "voice" "course"
[8,] "youre" "chapter" "need" "nurse" "brain" "around"
[9,] "banter" "whole" "line" "creepy" "djinn" "eyes"
[10,] "wasnt" "just" "go" "tension" "yes" "loved"
Topic 7 Topic 8
[1,] "oh" "one"
[2,] "reading" "amazing"
[3,] "now" "art"
[4,] "great" "back"
[5,] "got" "imagery"
[6,] "man" "come"
[7,] "chapter" "fun"
[8,] "yeah" "cool"
[9,] "okay" "poor"
[10,] "feels" "take"
still
lot
little
even
just
story
one
youre
banter
wasnt
good
well
beautiful
thats
worldbuilding
looked
descriptions
chapter
whole
just
made
things
enough
think
felt
looking
scene
need
line
go
glad
people
worked
bit
many
write
thing
nurse
creepy
tension
know
read
didnt
interesting
world
deans
voice
brain
djinn
yes
said
wanted
work
kids
bad
writing
course
around
eyes
loved
oh
reading
now
great
got
man
chapter
yeah
okay
feels
one
amazing
art
back
imagery
come
fun
cool
poor
take
50 most frequent words and their frequencies
findMostFreqTerms(dtm, n = 50, INDEX = rep(1, dtm$nrow))[[1]]
just one oh good well glad
38 38 38 33 30 29
world lot people chapter thats way
27 23 22 22 22 20
know still things bit little said
20 20 20 19 19 17
reading story man youre back poor
17 17 17 16 16 16
done now didnt amazing made even
16 16 15 15 15 15
beautiful think great scene need going
15 15 15 14 14 14
read work can worked description everything
14 14 13 13 12 12
feel line best got voice fun
12 12 12 12 12 12
whole loved
11 11
LS0tCnRpdGxlOiAiY29tbWVudHNfd29yazEuUm1kIgphdXRob3I6ICJBbmEgTHVjaWMgJiBMYXVyZW4gUm91c2UiCmRhdGU6ICI5IE1heSAyMDE5IgpvdXRwdXQ6ICJodG1sX25vdGVib29rIgotLS0KYGBge3J9CmxpYnJhcnkocmVzaGFwZTIpCmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeSh0bSkgCmxpYnJhcnkodG9waWNtb2RlbHMpIApsaWJyYXJ5KFJDb2xvckJyZXdlcikKbGlicmFyeSh3b3JkY2xvdWQpIApgYGAKCiMjIyMjIFJlYWRpbmcgdGhlIGZpbGUgdGhhdCBjb250YWlucyBzY3JhcGVkIGNvbW1lbnRzIGZyb20gdGhlIHdlYiBzaXRlIGFuZCBjb252ZXJ0aW5nIHRoZSBmaWxlIGZyb20gd2lkZSB0byBsb25nIGZvcm1hdApgYGB7cn0KYyA8LSByZWFkLmNzdigiL1VzZXJzL2FuYWx1Y2ljL0RvY3VtZW50cy9jb21tZW50c193b3JrMS5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzPUZBTFNFKQpkIDwtIG1lbHQoYywgdmFyaWFibGUubmFtZSA9ICJrZXkiLCB2YWx1ZS5uYW1lcyA9ICJ2YWx1ZSIsIGlkLnZhcnMgPSBjKCJpZCIpLCBmYWN0b3JzQXNTdHJpbmdzPUYpCmhlYWQoZCkKYGBgCiMjIyMjIEtlZXBpbmcgdGhlIGNvbHVtbiB0aGF0IGNvbnRhaW5zIGNvbW1lbnRzIG9ubHkgYW5kIHByZS1wcm9jZXNzaW5nIGNvbW1lbnRzOmNvbnZlcnRpbmcgdGVybXMgdG8gbG93ZXIgY2FzZSwgcmVtb3ZpbmcgcHVuY3R1YXRpb24sIGFuZCB3aGl0ZXNwYWNlCmBgYHtyfQp0ZXh0IDwtIGQkdmFsdWUgCnRleHQ8LSB0ZXh0WyFpcy5uYSh0ZXh0KV0KdGV4dCA8LSBnc3ViKCLigJkiLCAnJywgdGV4dCkKdGV4dCA8LSBnc3ViKCInIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1Yigi4oCcIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1Yigi4oCdIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1Yigi4oCZcmUiLCAnJywgdGV4dCkKdGV4dCA8LSBnc3ViKCInIiwgJycsIHRleHQpCm0gPC0gQ29ycHVzKFZlY3RvclNvdXJjZSh0ZXh0KSkgIAptIDwtIHRtX21hcChtLCB0b2xvd2VyKQptIDwtIHRtX21hcChtLCByZW1vdmVQdW5jdHVhdGlvbikgCm0gPC0gdG1fbWFwKG0sIHN0cmlwV2hpdGVzcGFjZSkKYGBgCgojIyMjIyBSZW1vdmluZyBjZXJ0YWluIHdvcmRzIHRoYXQgd2UgZG9uJ3Qgd2FudCBpbiB0aGUgZGF0YSBzZXQsIGluIGFkZGl0aW9uIHRvIHRoZSBkZWZhdWx0IHRleHQgbWluaW5nIGxpYnJhcnkgc3RvcHdvcmRzIGxpc3QKYGBge3J9Cm15U3RvcFdvcmRzIDwtIGMoInRoYW5rIiwgInRoYW5rcyIsICJ1c2UiLCAic2VlIiwgInVzZWQiLCAidmlhIiwgImFtcCIsICJreWxvIiwgImJlbiIsICJzb2xvIiwgInJleSIsICJsZWlhIiwgImx1a2UiLCAic2t5d2Fsa2VyIiwgImhhbiIsICJzb2xvIiwgImNoZXdpZSIsICJkZWFuIiwgIndpbmNoZXN0ZXIiLCAic2FtIiwgImNhc3RpZWwiLCAibG92ZSIsICJoZWFydCIsICJ5YXkiLCAib21nIiwgImNhcHRhaW4iLCAiYW1lcmljYSIsICJzdGV2ZSIsICJyb2dlcnMiLCAiYnVja3kiLCAiYmFybmVzIiwgIndpbnRlciIsICJzb2xkaWVyIiwgInNhbSIsICJ3aWxzb24iLCAidG9ueSIsICJzdGFyayIsICJpcm9ubWFuIiwgImZhbGNvbiIsICJibGFjayIsICJ3aWRvdyIsICJuYXRhc2hhIiwgImNsaW50IiwgIiciLCAicyIsICJtIiwgIjMiLCAiY2FzIiwgImNhbnQiLCAiZG9udCIsICJsaWtlIiwgIm11Y2giLCAicmVhbGx5IiwgImNhcyIsICJyZXlsbyIsICJiYWx0aGF6YXIiLCAiaW0iLCAiZCIsICJtYWtlIiwgIm1ha2VzIiwgImdldCIpCm0gPC0gdG1fbWFwKG0sIHJlbW92ZVdvcmRzLCBjKHN0b3B3b3JkcygiZW5nbGlzaCIpLCBteVN0b3BXb3JkcykpIApgYGAKIyMjIyMgQ3JlYXRpbmcgYSB0ZXJtIGRvY3VtZW50IG1hdHJpeDogdGVybXMgaW4gcm93cyBhbmQgZG9jdW1lbnRzIGFzIGNvbHVtbnMKYGBge3J9CnRkbSA8LSBUZXJtRG9jdW1lbnRNYXRyaXgobSwgY29udHJvbCA9IGxpc3Qod29yZExlbmd0aHMgPSBjKDEsIEluZikpKQp0ZG0KYGBgCiMjIyMjIEZpbmRpbmcgYXNzb2NpYXRlZCB0ZXJtcyB3aXRoIHRoZSB0ZXJtcyBvZiBpbnRlcmVzdCwgbGV2ZWwgb2YgY29ycmVsYXRpb24gaXMgc3BlY2lmaWVkIGFzIDAuMwpgYGB7cn0KZmluZEFzc29jcyh0ZG0sIGMoInR3aXR0ZXIiLCAidHVtYmxyIiwgImFvMyIsICJyZWFkaW5nIiwgIndyaXRpbmciLCAibGVhcm5pbmciLCAibG9va2luZyIsICJhcnQiLCAiZmFuYXJ0IiwgImRyZXNzZXMiLCAibXVsdGltZWRpYSIsICJtZXRhIiwgImZhbm9uIiwgImNhbm9uIiwgImRpc2NvcmQiLCAicGlsbG93Zm9ydCIsICJkcmVhbXdpZHRoIiwgImtvZmkiLCAiaW1hZ2luZSIsICJzZWUiLCAicmVjaXBlIiwgImxpbmtzIiwgIndlYnNpdGUiLCAid29va2llcGVkaWEiLCAiZXhwbGFpbiIsICJkZXNjcmliZSIsICJkZXNjcmlwdGlvbiIsICJ2aXN1YWxpemUiLCAiaGVhciIsICJzbWVsbCIsICJ0YXN0ZSIsICJ1bmRlcnN0YW5kIiwgImtub3ciKSwgY29ybGltaXQ9MC4zKQpgYGAKIyMjIyMgRXN0YWJsaXNoaW5nIGZyZXF1ZW50IHRlcm1zLiBUYWtpbmcgYSBzdWJzZXQgb2YgdGVybXMgdGhhdCBhcHBlYXIgbW9yZSB0aGFuIGFuZCBlcXVhbCB0byAxNSB0aW1lcwpgYGB7cn0KKGZyZXEudGVybXMgPC0gZmluZEZyZXFUZXJtcyh0ZG0sIGxvd2ZyZXEgPSAxNSkpIAp0ZXJtLmZyZXEgPC0gcm93U3Vtcyhhcy5tYXRyaXgodGRtKSkgIAp0ZXJtLmZyZXEgPC0gc3Vic2V0KHRlcm0uZnJlcSwgdGVybS5mcmVxID49IDE1KSAKYGBgCiMjIyMjIENvbnZlcnRpbmcgZnJlcXVlbnQgdGVybXMgaW50byBhIGRhdGEgZnJhbWUgZm9ybWF0IGZvciBlYXNpZXIgcGxvdHRpbmcKYGBge3J9CmRmIDwtIGRhdGEuZnJhbWUodGVybSA9IG5hbWVzKHRlcm0uZnJlcSksIGZyZXEgPSB0ZXJtLmZyZXEpIApnZ3Bsb3QoZGYsIGFlcyh4ID0gcmVvcmRlcih0ZXJtLCAtZnJlcSksIHkgPSBmcmVxKSkgKyBnZW9tX2JhcihzdGF0ID0gImlkZW50aXR5IikgKyAgdGhlbWUoYXhpcy50ZXh0Lng9ZWxlbWVudF90ZXh0KGFuZ2xlPTQ1LCBoanVzdD0xKSkKYGBgCiMjIyMgRXN0YWJsaXNoaW5nIHdvcmQgZnJlcXVlbmN5LiBNaW5pbXVtIHRlcm0gZnJlcXVlbmN5IGZvciB0aGUgd29yZCBjbG91ZCBwbG90IGlzIDIwCmBgYHtyfQp3bSA8LSBhcy5tYXRyaXgodGRtKSAKd29yZC5mcmVxIDwtIHNvcnQocm93U3Vtcyh3bSksIGRlY3JlYXNpbmcgPSBUKQpwYWwgPC0gYnJld2VyLnBhbCg5LCAiQnVHbiIpWy0oMTo0KV0Kd29yZGNsb3VkKHdvcmRzID0gbmFtZXMod29yZC5mcmVxKSwgZnJlcSA9IHdvcmQuZnJlcSwgbWluLmZyZXEgPSAyMCwgcmFuZG9tLm9yZGVyID0gRiwgY29sb3JzID0gcGFsKSAKYGBgCiMjIyMjIFRyYW5zcG9zaW5nIHRoZSB0ZXJtIGRvY3VtZW50IG1hdHJpeCBhbmQgY29udmVydGluZyBpdCBpbnRvIGEgZG9jdW1lbnQgdGVybSBtYXRyaXggKGRvY3VtZW50cyBhcyByb3dzIGFuZCBjb2x1bXMgYXMgdGVybXMpCmBgYHtyfQpkdG0gPC0gYXMuRG9jdW1lbnRUZXJtTWF0cml4KHRkbSkgCmBgYAojIyMjIyBTdW1taW5nIGZyZXF1ZW5jaWVzIGZvciBkb2N1bWVudHMvaW5kaXZpZHVhbCBjb21tZW50cyBhbmQgZWxpbWluYXRpbmcgdGhvc2UgdGhhdCBkbyBub3QgaGF2ZSBhbnkgdGVybXMKYGBge3J9CnJvd190b3RhbCA9IGFwcGx5KGR0bSwgMSwgc3VtKSAKZHRtLm5ldyA9IGR0bVtyb3dfdG90YWw+MCxdIApgYGAKIyMjIyMgUnVubmluZyB0b3BpYyBtb2RlbGluZyAoTGF0ZW50IERpcmljaGxldCBBbGxvY2F0aW9uKSBvbiB0aGUgZG9jdW1lbnQgdGVybSBtYXRyaXggYW5kIHJlcXVlc3RpbmcgOCB0b3BpY3MuIFNwZWNpZnlpbmcgdGhhdCBlYWNoIHRvcGljIG1vZGVsIHNob3VsZCBjb25zaXN0IG9mIDEwIHdvcmRzLgpgYGB7cn0KbGRhIDwtIExEQShkdG0ubmV3LCBrID0gOCkgCnRlcm0gPC0gdGVybXMobGRhLCAxMCkgCnRlcm0KYGBgCiMjIyMjIyA1MCBtb3N0IGZyZXF1ZW50IHdvcmRzIGFuZCB0aGVpciBmcmVxdWVuY2llcwpgYGB7cn0KZmluZE1vc3RGcmVxVGVybXMoZHRtLCBuID0gNTAsIElOREVYID0gcmVwKDEsIGR0bSRucm93KSlbWzFdXQpgYGAK