library(reshape2)
library(ggplot2)
library(tm) 
Loading required package: NLP

Attaching package: 'NLP'
The following object is masked from 'package:ggplot2':

    annotate
library(topicmodels) 
library(RColorBrewer)
library(wordcloud) 
Reading the file that contains scraped comments and converting it from wide to long format
c <- read.csv("/Users/analucic/Documents/comments_work2.csv", stringsAsFactors=FALSE)
d <- melt(c, variable.name = "key", value.name = "value", id.vars = c("id"), factorsAsStrings = FALSE)
head(d)
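The same wide-to-long reshape can also be expressed with tidyr; a minimal, untested sketch (not used in this notebook, assuming the same "id" column) would be:
library(tidyr)
d2 <- pivot_longer(c, cols = -id, names_to = "key", values_to = "value")   # tidyverse equivalent of melt()
head(d2)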
Keeping only the column that contains comments and pre-processing them: converting terms to lower case, removing punctuation, and stripping whitespace
text <- d$value 
text<- text[!is.na(text)]
text <- gsub("'", '', text)
text <- gsub("’", '', text)
text <- gsub("“", '', text)
text <- gsub("”", '', text)
text <- gsub("’re", '', text)
text <- gsub("'", '', text)
m <- Corpus(VectorSource(text))  
m <- tm_map(m, tolower)
Warning in tm_map.SimpleCorpus(m, tolower): transformation drops documents
m <- tm_map(m, removePunctuation) 
Warning in tm_map.SimpleCorpus(m, removePunctuation): transformation drops
documents
m <- tm_map(m, stripWhitespace)
Warning in tm_map.SimpleCorpus(m, stripWhitespace): transformation drops
documents
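The "transformation drops documents" warnings are emitted by tm_map() on a SimpleCorpus and are harmless here; one way to avoid them, sketched as an untested alternative, is to build a VCorpus and wrap base R functions in content_transformer():
m2 <- VCorpus(VectorSource(text))                 # VCorpus keeps per-document metadata
m2 <- tm_map(m2, content_transformer(tolower))    # wrap plain functions so tm_map keeps the corpus structure
m2 <- tm_map(m2, removePunctuation)
m2 <- tm_map(m2, stripWhitespace)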
Removing certain words that we don't want in the data set, in addition to the default English stopwords list from the tm package
# custom stopwords: fandom and character names plus common filler terms (duplicates removed)
myStopWords <- c("thank", "thanks", "use", "see", "used", "via", "amp", "kylo", "ben", "solo", "rey", "leia", "luke", "skywalker", "han", "chewie", "dean", "winchester", "sam", "castiel", "love", "heart", "yay", "omg", "captain", "america", "steve", "rogers", "bucky", "barnes", "winter", "soldier", "wilson", "tony", "stark", "ironman", "falcon", "black", "widow", "natasha", "clint", "s", "m", "3", "cas", "cant", "like", "much", "really", "reylo", "balthazar", "im", "d")
m <- tm_map(m, removeWords, c(stopwords("english"), myStopWords)) 
Warning in tm_map.SimpleCorpus(m, removeWords, c(stopwords("english"),
myStopWords)): transformation drops documents
Creating a term document matrix: terms in rows and documents as columns
tdm <- TermDocumentMatrix(m, control = list(wordLengths = c(1, Inf)))
tdm
<<TermDocumentMatrix (terms: 3913, documents: 9128)>>
Non-/sparse entries: 16213/35701651
Sparsity           : 100%
Maximal term length: 32
Weighting          : term frequency (tf)
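To sanity-check the matrix, a small slice can be printed with inspect(); for example:
inspect(tdm[1:5, 1:5])   # first five terms against the first five comments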
Finding terms associated with the terms of interest; the correlation threshold is specified as 0.3
findAssocs(tdm, c("twitter", "tumblr", "ao3", "reading", "writing", "learning", "looking", "art", "fanart", "dresses", "multimedia", "meta", "fanon", "canon", "discord", "pillowfort", "dreamwidth", "kofi", "imagine", "see", "recipe", "links", "website", "wookiepedia", "explain", "describe", "description", "visualize", "hear", "smell", "taste", "understand", "know"), corlimit=0.3)
$twitter
numeric(0)

$tumblr
  noise     pop pitched 
   0.50    0.50    0.35 

$ao3
dumb 
0.32 

$reading
numeric(0)

$writing
     alisa       mwah          ❤️ appreciate 
      0.44       0.43       0.35       0.35 

$learning
numeric(0)

$looking
   forward      alisa       mwah appreciate          ❤️ 
      0.83       0.64       0.63       0.54       0.52 

$art
mentioning    softest   mastered boyswonder   origamii 
      0.35       0.35       0.35       0.35       0.35 

$fanart
numeric(0)

$dresses
numeric(0)

$multimedia
numeric(0)

$meta
numeric(0)

$fanon
numeric(0)

$canon
clapping  besides  delight 
    0.71     0.50     0.32 

$discord
numeric(0)

$pillowfort
numeric(0)

$dreamwidth
numeric(0)

$kofi
numeric(0)

$imagine
      relief        drawn      sparkly particularly         pink 
        0.71         0.71         0.71         0.50         0.50 
     worried 
        0.35 

$see
numeric(0)

$recipe
chicken    soup 
   0.35    0.35 

$links
numeric(0)

$website
numeric(0)

$wookiepedia
numeric(0)

$explain
   caught     guess     check hopefully 
     0.45      0.35      0.33      0.32 

$describe
   fuzzies     filled incredible      words 
      0.50       0.35       0.32       0.31 

$description
exact 
 0.71 

$visualize
numeric(0)

$hear
numeric(0)

$smell
numeric(0)

$taste
numeric(0)

$understand
concern    hits 
   0.50    0.35 

$know
dont 
0.34 
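The output of findAssocs() is a named list of named numeric vectors; if a tabular view is more convenient, one term's associations can be flattened into a data frame, for example:
canon.assoc <- findAssocs(tdm, "canon", corlimit = 0.3)$canon
data.frame(term = names(canon.assoc), correlation = as.numeric(canon.assoc))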
Establishing frequent terms: taking the subset of terms that appear at least 50 times
(freq.terms <- findFreqTerms(tdm, lowfreq = 50)) 
 [1] "loved"     "now"       "oh"        "chapter"   "good"     
 [6] "read"      "man"       "sweet"     "just"      "well"     
[11] "aw"        "better"    "glad"      "youre"     "fic"      
[16] "best"      "one"       "happy"     "right"     "christmas"
[21] "hope"      "new"       "story"     "buckys"    "amazing"  
[26] "writing"   "way"       "forward"   "looking"   "will"     
[31] "little"    "make"      "get"       "life"      "reading"  
[36] "also"      "dont"      "even"      "enjoying"  "gonna"    
[41] "going"     "next"      "time"      "know"      "can"      
[46] "got"       "♥"         "think"     "made"      "ive"      
[51] "still"    
term.freq <- rowSums(as.matrix(tdm))  
term.freq <- subset(term.freq, term.freq >= 50) 
Converting frequent terms into a data frame format for easier plotting
df <- data.frame(term = names(term.freq), freq = term.freq) 
ggplot(df, aes(x = reorder(term, -freq), y = freq)) + geom_bar(stat = "identity") +  theme(axis.text.x=element_text(angle=45, hjust=1))

Establishing word frequencies. The specified minimum frequency for the word cloud plot is 50

wm <- as.matrix(tdm) 
word.freq <- sort(rowSums(wm), decreasing = T)
pal <- brewer.pal(9, "BuGn")[-(1:4)]
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 50, random.order = F, colors = pal) 
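Because word placement in the cloud is randomized, setting a seed before the call makes the layout reproducible; a sketch with an arbitrary seed:
set.seed(42)   # arbitrary seed so the cloud layout is the same across runs
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 50, random.order = F, colors = pal)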

Transposing the term document matrix and converting it into a document term matrix (documents as rows and columns as terms)
dtm <- as.DocumentTermMatrix(tdm) 
Summing term frequencies for each document/individual comment and eliminating comments that do not contain any terms
#dtm = removeSparseTerms(dtm, 0.99)
#dtm
row_total <- apply(dtm, 1, sum)   # total term count per comment
dtm.new <- dtm[row_total > 0, ]   # keep only comments with at least one remaining term
dtm.new
<<DocumentTermMatrix (documents: 3240, terms: 3913)>>
Non-/sparse entries: 16213/12661907
Sparsity           : 100%
Maximal term length: 32
Weighting          : term frequency (tf)
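Comparing the two summaries shows that 3240 of the original 9128 comments retain at least one term after cleaning; the number of dropped, now-empty comments can be checked directly:
sum(row_total == 0)   # comments left with no terms after stopword removal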
Running topic modeling (Latent Dirichlet Allocation) on the document term matrix and requesting 8 topics. Extracting the 10 most probable terms for each topic.
lda <- LDA(dtm.new, k = 8) 
term <- terms(lda, 10) 
term
      Topic 1  Topic 2    Topic 3  Topic 4   Topic 5     Topic 6     
 [1,] "buckys" "just"     "story"  "chapter" "oh"        "glad"      
 [2,] "make"   "loved"    "aw"     "know"    "happy"     "youre"     
 [3,] "ive"    "made"     "now"    "dont"    "christmas" "writing"   
 [4,] "yes"    "can"      "right"  "next"    "well"      "little"    
 [5,] "thing"  "hes"      "man"    "time"    "sweet"     "forward"   
 [6,] "back"   "say"      "going"  "even"    "hope"      "enjoying"  
 [7,] "go"     "two"      "♥"      "think"   "best"      "looking"   
 [8,] "god"    "chapters" "life"   "every"   "new"       "still"     
 [9,] "ever"   "getting"  "great"  "wait"    "hard"      "gonna"     
[10,] "kudos"  "makes"    "theyre" "stella"  "wonderful" "appreciate"
      Topic 7   Topic 8  
 [1,] "fic"     "reading"
 [2,] "read"    "one"    
 [3,] "will"    "good"   
 [4,] "better"  "get"    
 [5,] "also"    "enjoyed"
 [6,] "amazing" "got"    
 [7,] "perfect" "things" 
 [8,] "feel"    "boys"   
 [9,] "cute"    "write"  
[10,] "nice"    "advent" 
50 most frequent words and their frequencies
findMostFreqTerms(dtm, n = 50, INDEX = rep(1, dtm$nrow))[[1]]
  reading      glad      just       fic   chapter        oh     story 
      183       172       164       151       148       146       118 
    youre     happy      good        aw   writing       one christmas 
      110       110       106       102        98        94        93 
      now      know       get      well     loved      read      will 
       89        89        84        82        81        81        81 
     dont    little     right     sweet    better      hope      next 
       76        75        74        73        72        70        70 
   buckys       man     going      made      also      time       way 
       66        64        64        64        63        62        61 
     best     think       can      make      even       got   forward 
       59        59        56        55        54        54        53 
     life  enjoying       ive   looking     still       new   amazing 
       53        53        52        51        51        50        50 
    gonna 
       50 