comments_work1.Rmd

library(reshape2)
library(ggplot2)
library(tm)

Loading required package: NLP


Attaching package: 'NLP'

The following object is masked from 'package:ggplot2':

    annotate

library(topicmodels) 
library(RColorBrewer)
library(wordcloud)

Reading the file that contains scraped comments from the web site and converting the file from wide to long format

c <- read.csv("/Users/analucic/Documents/comments_work1.csv", stringsAsFactors=FALSE)
d <- melt(c, variable.name = "key", value.names = "value", id.vars = c("id"), factorsAsStrings=F)
head(d)

Keeping the column that contains comments only and pre-processing comments:converting terms to lower case, removing punctuation, and whitespace

text <- d$value 
text<- text[!is.na(text)]
text <- gsub("’", '', text)
text <- gsub("'", '', text)
text <- gsub("“", '', text)
text <- gsub("”", '', text)
text <- gsub("’re", '', text)
text <- gsub("'", '', text)
m <- Corpus(VectorSource(text))  
m <- tm_map(m, tolower)

Warning in tm_map.SimpleCorpus(m, tolower): transformation drops documents

m <- tm_map(m, removePunctuation)

Warning in tm_map.SimpleCorpus(m, removePunctuation): transformation drops
documents

m <- tm_map(m, stripWhitespace)

Warning in tm_map.SimpleCorpus(m, stripWhitespace): transformation drops
documents

Removing certain words that we don’t want in the data set, in addition to the default text mining library stopwords list

myStopWords <- c("thank", "thanks", "use", "see", "used", "via", "amp", "kylo", "ben", "solo", "rey", "leia", "luke", "skywalker", "han", "solo", "chewie", "dean", "winchester", "sam", "castiel", "love", "heart", "yay", "omg", "captain", "america", "steve", "rogers", "bucky", "barnes", "winter", "soldier", "sam", "wilson", "tony", "stark", "ironman", "falcon", "black", "widow", "natasha", "clint", "'", "s", "m", "3", "cas", "cant", "dont", "like", "much", "really", "cas", "reylo", "balthazar", "im", "d", "make", "makes", "get")
m <- tm_map(m, removeWords, c(stopwords("english"), myStopWords))

Warning in tm_map.SimpleCorpus(m, removeWords, c(stopwords("english"),
myStopWords)): transformation drops documents

Creating a term document matrix: terms in rows and documents as columns

tdm <- TermDocumentMatrix(m, control = list(wordLengths = c(1, Inf)))
tdm

<<TermDocumentMatrix (terms: 2108, documents: 1830)>>
Non-/sparse entries: 4701/3852939
Sparsity           : 100%
Maximal term length: 36
Weighting          : term frequency (tf)

Finding associated terms with the terms of interest, level of correlation is specified as 0.3

findAssocs(tdm, c("twitter", "tumblr", "ao3", "reading", "writing", "learning", "looking", "art", "fanart", "dresses", "multimedia", "meta", "fanon", "canon", "discord", "pillowfort", "dreamwidth", "kofi", "imagine", "see", "recipe", "links", "website", "wookiepedia", "explain", "describe", "description", "visualize", "hear", "smell", "taste", "understand", "know"), corlimit=0.3)

$twitter
numeric(0)

$tumblr
numeric(0)

$ao3
numeric(0)

$reading
numeric(0)

$writing
domesticity    exercise 
       0.41        0.41 

$learning
numeric(0)

$looking
              forward                  spun                 sugar 
                 0.53                  0.50                  0.50 
                glass               pattern                   saw 
                 0.41                  0.35                  0.35 
         streetdoccas                ending                 folks 
                 0.35                  0.35                  0.35 
                 woot             delighted shadowrunsupernatural 
                 0.35                  0.35                  0.35 
                 made 
                 0.32 

$art
   lucky   pimmys  amazing   ageing    amiss   amount   gotten indecent 
    0.43     0.43     0.38     0.30     0.30     0.30     0.30     0.30 
  mécène    spent  casting     map—    spell   flames trinkets 
    0.30     0.30     0.30     0.30     0.30     0.30     0.30 

$fanart
numeric(0)

$dresses
numeric(0)

$multimedia
numeric(0)

$meta
numeric(0)

$fanon
accepted   grumpy     meat   planet absolute  morning throwing   person 
    1.00     1.00     1.00     1.00     0.71     0.58     0.58     0.50 
   least 
    0.38 

$canon
    moose  squirrel recognize 
     0.41      0.41      0.33 

$discord
         ago conversation           ho        quite      stopped 
        1.00         1.00         1.00         0.71         0.71 
    watching          hot        broom      morning          boy 
        0.71         0.71         0.71         0.58         0.58 
         spn         damn       closet          ive         part 
        0.58         0.58         0.58         0.50         0.35 
        long         time 
        0.32         0.32 

$pillowfort
numeric(0)

$dreamwidth
numeric(0)

$kofi
numeric(0)

$imagine
     dealing      streets       within  environment interactions 
        1.00         1.00         1.00         0.71         0.71 
   brilliant       giving    perfectly           us     everyone 
        0.58         0.58         0.45         0.38         0.35 
      moving         cool 
        0.32         0.30 

$see
numeric(0)

$recipe
numeric(0)

$links
numeric(0)

$website
numeric(0)

$wookiepedia
numeric(0)

$explain
numeric(0)

$describe
numeric(0)

$description
warm 
0.33 

$visualize
numeric(0)

$hear
numeric(0)

$smell
numeric(0)

$taste
alcohol chasing mmmmmmm  spiced   woody    edge 
   1.00    1.00    1.00    1.00    1.00    0.58 

$understand
history 
   0.58 

$know
    itll tomorrow 
    0.31     0.31

Establishing frequent terms. Taking a subset of terms that appear more than and equal to 15 times

(freq.terms <- findFreqTerms(tdm, lowfreq = 15))

 [1] "bit"       "world"     "glad"      "people"    "way"      
 [6] "just"      "didnt"     "know"      "still"     "said"     
[11] "chapter"   "well"      "youre"     "back"      "one"      
[16] "reading"   "amazing"   "made"      "poor"      "thats"    
[21] "story"     "done"      "things"    "good"      "even"     
[26] "now"       "little"    "oh"        "beautiful" "lot"      
[31] "think"     "great"     "man"

bit

world

glad

people

way

just

didnt

know

still

said

chapter

well

youre

back

one

reading

amazing

made

poor

thats

story

done

things

good

even

now

little

oh

beautiful

lot

think

great

man

term.freq <- rowSums(as.matrix(tdm))  
term.freq <- subset(term.freq, term.freq >= 15)

Converting frequent terms into a data frame format for easier plotting

df <- data.frame(term = names(term.freq), freq = term.freq) 
ggplot(df, aes(x = reorder(term, -freq), y = freq)) + geom_bar(stat = "identity") +  theme(axis.text.x=element_text(angle=45, hjust=1))

Establishing word frequency. Minimum term frequency for the word cloud plot is 20

wm <- as.matrix(tdm) 
word.freq <- sort(rowSums(wm), decreasing = T)
pal <- brewer.pal(9, "BuGn")[-(1:4)]
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 20, random.order = F, colors = pal)

Transposing the term document matrix and converting it into a document term matrix (documents as rows and colums as terms)

dtm <- as.DocumentTermMatrix(tdm)

Summing frequencies for documents/individual comments and eliminating those that do not have any terms

row_total = apply(dtm, 1, sum) 
dtm.new = dtm[row_total>0,]

Running topic modeling (Latent Dirichlet Allocation) on the document term matrix and requesting 8 topics. Specifying that each topic model should consist of 10 words.

lda <- LDA(dtm.new, k = 8) 
term <- terms(lda, 10) 
term

      Topic 1  Topic 2         Topic 3   Topic 4   Topic 5       Topic 6  
 [1,] "still"  "good"          "made"    "glad"    "know"        "said"   
 [2,] "lot"    "well"          "things"  "people"  "read"        "wanted" 
 [3,] "little" "beautiful"     "enough"  "worked"  "didnt"       "work"   
 [4,] "even"   "thats"         "think"   "bit"     "interesting" "kids"   
 [5,] "just"   "worldbuilding" "felt"    "many"    "world"       "bad"    
 [6,] "story"  "looked"        "looking" "write"   "deans"       "writing"
 [7,] "one"    "descriptions"  "scene"   "thing"   "voice"       "course" 
 [8,] "youre"  "chapter"       "need"    "nurse"   "brain"       "around" 
 [9,] "banter" "whole"         "line"    "creepy"  "djinn"       "eyes"   
[10,] "wasnt"  "just"          "go"      "tension" "yes"         "loved"  
      Topic 7   Topic 8  
 [1,] "oh"      "one"    
 [2,] "reading" "amazing"
 [3,] "now"     "art"    
 [4,] "great"   "back"   
 [5,] "got"     "imagery"
 [6,] "man"     "come"   
 [7,] "chapter" "fun"    
 [8,] "yeah"    "cool"   
 [9,] "okay"    "poor"   
[10,] "feels"   "take"

still

lot

little

even

just

story

one

youre

banter

wasnt

good

well

beautiful

thats

worldbuilding

looked

descriptions

chapter

whole

just

made

things

enough

think

felt

looking

scene

need

line

go

glad

people

worked

bit

many

write

thing

nurse

creepy

tension

know

read

didnt

interesting

world

deans

voice

brain

djinn

yes

said

wanted

work

kids

bad

writing

course

around

eyes

loved

oh

reading

now

great

got

man

chapter

yeah

okay

feels

one

amazing

art

back

imagery

come

fun

cool

poor

take

50 most frequent words and their frequencies

findMostFreqTerms(dtm, n = 50, INDEX = rep(1, dtm$nrow))[[1]]

       just         one          oh        good        well        glad 
         38          38          38          33          30          29 
      world         lot      people     chapter       thats         way 
         27          23          22          22          22          20 
       know       still      things         bit      little        said 
         20          20          20          19          19          17 
    reading       story         man       youre        back        poor 
         17          17          17          16          16          16 
       done         now       didnt     amazing        made        even 
         16          16          15          15          15          15 
  beautiful       think       great       scene        need       going 
         15          15          15          14          14          14 
       read        work         can      worked description  everything 
         14          14          13          13          12          12 
       feel        line        best         got       voice         fun 
         12          12          12          12          12          12 
      whole       loved 
         11          11

LS0tCnRpdGxlOiAiY29tbWVudHNfd29yazEuUm1kIgphdXRob3I6ICJBbmEgTHVjaWMgJiBMYXVyZW4gUm91c2UiCmRhdGU6ICI5IE1heSAyMDE5IgpvdXRwdXQ6ICJodG1sX25vdGVib29rIgotLS0KYGBge3J9CmxpYnJhcnkocmVzaGFwZTIpCmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeSh0bSkgCmxpYnJhcnkodG9waWNtb2RlbHMpIApsaWJyYXJ5KFJDb2xvckJyZXdlcikKbGlicmFyeSh3b3JkY2xvdWQpIApgYGAKCiMjIyMjIFJlYWRpbmcgdGhlIGZpbGUgdGhhdCBjb250YWlucyBzY3JhcGVkIGNvbW1lbnRzIGZyb20gdGhlIHdlYiBzaXRlIGFuZCBjb252ZXJ0aW5nIHRoZSBmaWxlIGZyb20gd2lkZSB0byBsb25nIGZvcm1hdApgYGB7cn0KYyA8LSByZWFkLmNzdigiL1VzZXJzL2FuYWx1Y2ljL0RvY3VtZW50cy9jb21tZW50c193b3JrMS5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzPUZBTFNFKQpkIDwtIG1lbHQoYywgdmFyaWFibGUubmFtZSA9ICJrZXkiLCB2YWx1ZS5uYW1lcyA9ICJ2YWx1ZSIsIGlkLnZhcnMgPSBjKCJpZCIpLCBmYWN0b3JzQXNTdHJpbmdzPUYpCmhlYWQoZCkKYGBgCiMjIyMjIEtlZXBpbmcgdGhlIGNvbHVtbiB0aGF0IGNvbnRhaW5zIGNvbW1lbnRzIG9ubHkgYW5kIHByZS1wcm9jZXNzaW5nIGNvbW1lbnRzOmNvbnZlcnRpbmcgdGVybXMgdG8gbG93ZXIgY2FzZSwgcmVtb3ZpbmcgcHVuY3R1YXRpb24sIGFuZCB3aGl0ZXNwYWNlCmBgYHtyfQp0ZXh0IDwtIGQkdmFsdWUgCnRleHQ8LSB0ZXh0WyFpcy5uYSh0ZXh0KV0KdGV4dCA8LSBnc3ViKCLigJkiLCAnJywgdGV4dCkKdGV4dCA8LSBnc3ViKCInIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1Yigi4oCcIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1Yigi4oCdIiwgJycsIHRleHQpCnRleHQgPC0gZ3N1Yigi4oCZcmUiLCAnJywgdGV4dCkKdGV4dCA8LSBnc3ViKCInIiwgJycsIHRleHQpCm0gPC0gQ29ycHVzKFZlY3RvclNvdXJjZSh0ZXh0KSkgIAptIDwtIHRtX21hcChtLCB0b2xvd2VyKQptIDwtIHRtX21hcChtLCByZW1vdmVQdW5jdHVhdGlvbikgCm0gPC0gdG1fbWFwKG0sIHN0cmlwV2hpdGVzcGFjZSkKYGBgCgojIyMjIyBSZW1vdmluZyBjZXJ0YWluIHdvcmRzIHRoYXQgd2UgZG9uJ3Qgd2FudCBpbiB0aGUgZGF0YSBzZXQsIGluIGFkZGl0aW9uIHRvIHRoZSBkZWZhdWx0IHRleHQgbWluaW5nIGxpYnJhcnkgc3RvcHdvcmRzIGxpc3QKYGBge3J9Cm15U3RvcFdvcmRzIDwtIGMoInRoYW5rIiwgInRoYW5rcyIsICJ1c2UiLCAic2VlIiwgInVzZWQiLCAidmlhIiwgImFtcCIsICJreWxvIiwgImJlbiIsICJzb2xvIiwgInJleSIsICJsZWlhIiwgImx1a2UiLCAic2t5d2Fsa2VyIiwgImhhbiIsICJzb2xvIiwgImNoZXdpZSIsICJkZWFuIiwgIndpbmNoZXN0ZXIiLCAic2FtIiwgImNhc3RpZWwiLCAibG92ZSIsICJoZWFydCIsICJ5YXkiLCAib21nIiwgImNhcHRhaW4iLCAiYW1lcmljYSIsICJzdGV2ZSIsICJyb2dlcnMiLCAiYnVja3kiLCAiYmFybmVzIiwgIndpbnRlciIsICJzb2xkaWVyIiwgInNhbSIsICJ3aWxzb24iLCAidG9ueSIsICJzdGFyayIsICJpcm9ubWFuIiwgImZhbGNvbiIsICJibGFjayIsICJ3aWRvdyIsICJuYXRhc2hhIiwgImNsaW50IiwgIiciLCAicyIsICJtIiwgIjMiLCAiY2FzIiwgImNhbnQiLCAiZG9udCIsICJsaWtlIiwgIm11Y2giLCAicmVhbGx5IiwgImNhcyIsICJyZXlsbyIsICJiYWx0aGF6YXIiLCAiaW0iLCAiZCIsICJtYWtlIiwgIm1ha2VzIiwgImdldCIpCm0gPC0gdG1fbWFwKG0sIHJlbW92ZVdvcmRzLCBjKHN0b3B3b3JkcygiZW5nbGlzaCIpLCBteVN0b3BXb3JkcykpIApgYGAKIyMjIyMgQ3JlYXRpbmcgYSB0ZXJtIGRvY3VtZW50IG1hdHJpeDogdGVybXMgaW4gcm93cyBhbmQgZG9jdW1lbnRzIGFzIGNvbHVtbnMKYGBge3J9CnRkbSA8LSBUZXJtRG9jdW1lbnRNYXRyaXgobSwgY29udHJvbCA9IGxpc3Qod29yZExlbmd0aHMgPSBjKDEsIEluZikpKQp0ZG0KYGBgCiMjIyMjIEZpbmRpbmcgYXNzb2NpYXRlZCB0ZXJtcyB3aXRoIHRoZSB0ZXJtcyBvZiBpbnRlcmVzdCwgbGV2ZWwgb2YgY29ycmVsYXRpb24gaXMgc3BlY2lmaWVkIGFzIDAuMwpgYGB7cn0KZmluZEFzc29jcyh0ZG0sIGMoInR3aXR0ZXIiLCAidHVtYmxyIiwgImFvMyIsICJyZWFkaW5nIiwgIndyaXRpbmciLCAibGVhcm5pbmciLCAibG9va2luZyIsICJhcnQiLCAiZmFuYXJ0IiwgImRyZXNzZXMiLCAibXVsdGltZWRpYSIsICJtZXRhIiwgImZhbm9uIiwgImNhbm9uIiwgImRpc2NvcmQiLCAicGlsbG93Zm9ydCIsICJkcmVhbXdpZHRoIiwgImtvZmkiLCAiaW1hZ2luZSIsICJzZWUiLCAicmVjaXBlIiwgImxpbmtzIiwgIndlYnNpdGUiLCAid29va2llcGVkaWEiLCAiZXhwbGFpbiIsICJkZXNjcmliZSIsICJkZXNjcmlwdGlvbiIsICJ2aXN1YWxpemUiLCAiaGVhciIsICJzbWVsbCIsICJ0YXN0ZSIsICJ1bmRlcnN0YW5kIiwgImtub3ciKSwgY29ybGltaXQ9MC4zKQpgYGAKIyMjIyMgRXN0YWJsaXNoaW5nIGZyZXF1ZW50IHRlcm1zLiBUYWtpbmcgYSBzdWJzZXQgb2YgdGVybXMgdGhhdCBhcHBlYXIgbW9yZSB0aGFuIGFuZCBlcXVhbCB0byAxNSB0aW1lcwpgYGB7cn0KKGZyZXEudGVybXMgPC0gZmluZEZyZXFUZXJtcyh0ZG0sIGxvd2ZyZXEgPSAxNSkpIAp0ZXJtLmZyZXEgPC0gcm93U3Vtcyhhcy5tYXRyaXgodGRtKSkgIAp0ZXJtLmZyZXEgPC0gc3Vic2V0KHRlcm0uZnJlcSwgdGVybS5mcmVxID49IDE1KSAKYGBgCiMjIyMjIENvbnZlcnRpbmcgZnJlcXVlbnQgdGVybXMgaW50byBhIGRhdGEgZnJhbWUgZm9ybWF0IGZvciBlYXNpZXIgcGxvdHRpbmcKYGBge3J9CmRmIDwtIGRhdGEuZnJhbWUodGVybSA9IG5hbWVzKHRlcm0uZnJlcSksIGZyZXEgPSB0ZXJtLmZyZXEpIApnZ3Bsb3QoZGYsIGFlcyh4ID0gcmVvcmRlcih0ZXJtLCAtZnJlcSksIHkgPSBmcmVxKSkgKyBnZW9tX2JhcihzdGF0ID0gImlkZW50aXR5IikgKyAgdGhlbWUoYXhpcy50ZXh0Lng9ZWxlbWVudF90ZXh0KGFuZ2xlPTQ1LCBoanVzdD0xKSkKYGBgCiMjIyMgRXN0YWJsaXNoaW5nIHdvcmQgZnJlcXVlbmN5LiBNaW5pbXVtIHRlcm0gZnJlcXVlbmN5IGZvciB0aGUgd29yZCBjbG91ZCBwbG90IGlzIDIwCmBgYHtyfQp3bSA8LSBhcy5tYXRyaXgodGRtKSAKd29yZC5mcmVxIDwtIHNvcnQocm93U3Vtcyh3bSksIGRlY3JlYXNpbmcgPSBUKQpwYWwgPC0gYnJld2VyLnBhbCg5LCAiQnVHbiIpWy0oMTo0KV0Kd29yZGNsb3VkKHdvcmRzID0gbmFtZXMod29yZC5mcmVxKSwgZnJlcSA9IHdvcmQuZnJlcSwgbWluLmZyZXEgPSAyMCwgcmFuZG9tLm9yZGVyID0gRiwgY29sb3JzID0gcGFsKSAKYGBgCiMjIyMjIFRyYW5zcG9zaW5nIHRoZSB0ZXJtIGRvY3VtZW50IG1hdHJpeCBhbmQgY29udmVydGluZyBpdCBpbnRvIGEgZG9jdW1lbnQgdGVybSBtYXRyaXggKGRvY3VtZW50cyBhcyByb3dzIGFuZCBjb2x1bXMgYXMgdGVybXMpCmBgYHtyfQpkdG0gPC0gYXMuRG9jdW1lbnRUZXJtTWF0cml4KHRkbSkgCmBgYAojIyMjIyBTdW1taW5nIGZyZXF1ZW5jaWVzIGZvciBkb2N1bWVudHMvaW5kaXZpZHVhbCBjb21tZW50cyBhbmQgZWxpbWluYXRpbmcgdGhvc2UgdGhhdCBkbyBub3QgaGF2ZSBhbnkgdGVybXMKYGBge3J9CnJvd190b3RhbCA9IGFwcGx5KGR0bSwgMSwgc3VtKSAKZHRtLm5ldyA9IGR0bVtyb3dfdG90YWw+MCxdIApgYGAKIyMjIyMgUnVubmluZyB0b3BpYyBtb2RlbGluZyAoTGF0ZW50IERpcmljaGxldCBBbGxvY2F0aW9uKSBvbiB0aGUgZG9jdW1lbnQgdGVybSBtYXRyaXggYW5kIHJlcXVlc3RpbmcgOCB0b3BpY3MuIFNwZWNpZnlpbmcgdGhhdCBlYWNoIHRvcGljIG1vZGVsIHNob3VsZCBjb25zaXN0IG9mIDEwIHdvcmRzLgpgYGB7cn0KbGRhIDwtIExEQShkdG0ubmV3LCBrID0gOCkgCnRlcm0gPC0gdGVybXMobGRhLCAxMCkgCnRlcm0KYGBgCiMjIyMjIyA1MCBtb3N0IGZyZXF1ZW50IHdvcmRzIGFuZCB0aGVpciBmcmVxdWVuY2llcwpgYGB7cn0KZmluZE1vc3RGcmVxVGVybXMoZHRtLCBuID0gNTAsIElOREVYID0gcmVwKDEsIGR0bSRucm93KSlbWzFdXQpgYGAK