# Load the exported comments. The file is tab-delimited, and the URL, nickname
# and comment-text columns are read as plain character vectors.
ikea <- read.delim(
  file = "ikea_comments.csv",
  colClasses = c(
    Video.URL = "character",
    Nickname = "character",
    Text = "character"
  )
)
library(ggplot2)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(NLP)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(wordcloud)
## Loading required package: RColorBrewer
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:igraph':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(dplyr)
# Normalise the comment text in place: lower-case it first, then replace every
# punctuation character with a space. head() is called after each step to
# eyeball the result.
ikea$Text <- tolower(ikea$Text)
head(ikea$Text)
## [1] "i love this so much it relaxed me a lot"
## [2] "nani"
## [3] "im going to sleep listening to ikea commercial.... nice"
## [4] "what pillow does a stomach sleeper use-"
## [5] "basic knowledge: ads can only be up to 30 seconds long-\n\nikea: hold my not put together table set-"
## [6] "why am i about to sub to ikea usa for one, 25 minute ad?"
# NOTE(review): apostrophes are punctuation too, so contractions get split
# ("don't" becomes "don t", see item [10] below). The fragment "don" is not an
# exact match for the stop word "don't" and therefore survives stop-word
# removal later in the script. Consider handling apostrophes separately.
ikea$Text <- gsub("[[:punct:]]", " ", ikea$Text)
head(ikea$Text, 10)
## [1] "i love this so much it relaxed me a lot"
## [2] "nani"
## [3] "im going to sleep listening to ikea commercial nice"
## [4] "what pillow does a stomach sleeper use "
## [5] "basic knowledge ads can only be up to 30 seconds long \n\nikea hold my not put together table set "
## [6] "why am i about to sub to ikea usa for one 25 minute ad "
## [7] "is this what pewdiepie falls asleep to"
## [8] "whispery voice to not wake up dorm mate "
## [9] " laying on an ikea pillow right now "
## [10] "imagine chick fil a whisper to you don t eat cows"
# ====== matrix =============
# Wrap the cleaned comments in a tm corpus (one document per comment) and
# remove English stop words.
corpus1 <- ikea$Text %>%
  VectorSource() %>%
  Corpus()
corpus2 <- tm_map(corpus1, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus1, removeWords, stopwords("english")):
## transformation drops documents
# Count terms per document, keeping only terms that are 2 to 100 characters
# long.
tdm <- TermDocumentMatrix(
  corpus2,
  control = list(wordLengths = c(2, 100))
)
tdm
## <<TermDocumentMatrix (terms: 5418, documents: 5548)>>
## Non-/sparse entries: 33819/30025245
## Sparsity : 100%
## Maximal term length: 48
## Weighting : term frequency (tf)
# Print the summary again together with a small sample of the matrix.
inspect(tdm)
## <<TermDocumentMatrix (terms: 5418, documents: 5548)>>
## Non-/sparse entries: 33819/30025245
## Sparsity : 100%
## Maximal term length: 48
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 1145 1735 2629 2725 300 4396 5164 5298 547 834
## ad 0 0 0 0 0 0 0 0 0 0
## asmr 1 0 1 0 0 4 3 2 0 0
## can 3 0 0 0 0 0 0 0 0 0
## good 0 0 0 1 0 0 0 0 0 0
## ikea 0 4 1 1 0 0 0 1 0 0
## just 0 2 0 0 0 0 4 0 0 0
## like 2 2 0 0 0 0 1 0 0 0
## love 0 0 0 0 0 0 0 0 0 0
## now 0 1 0 3 0 0 0 0 0 0
## video 0 0 0 0 0 0 0 1 0 0
# Convert the term-document matrix to an ordinary dense matrix
# (rows = terms, columns = documents).
tdm.mx <- as.matrix(tdm)
dim(tdm.mx)
## [1] 5418 5548
# Total count of each term across all documents, most frequent first.
# `comments` here is a named numeric vector of term counts, not comment text.
comments <- tdm.mx %>%
  rowSums() %>%
  sort(decreasing = TRUE)
# Frequency table: one row per term, ordered like `comments`.
df <- data.frame(
  word = names(comments),
  freq = comments
)
head(df, n = 20)
## word freq
## ikea ikea 1800
## asmr asmr 1138
## like like 510
## video video 442
## just just 360
## love love 352
## ad ad 341
## now now 314
## good good 302
## can can 260
## make make 253
## one one 230
## really really 225
## want want 224
## actually actually 221
## well well 203
## voice voice 193
## get get 185
## bed bed 173
## please please 172
# Word cloud of the frequency table: at most 200 terms, each occurring at
# least twice, laid out from most to least frequent, with about 35% of the
# words rotated, coloured with the 8-colour "Dark2" Brewer palette.
with(
  df,
  wordcloud(
    words = word,
    freq = freq,
    scale = c(3.5, 0.25),
    min.freq = 2,
    max.words = 200,
    random.order = FALSE,
    rot.per = 0.35,
    colors = brewer.pal(8, "Dark2")
  )
)
# Term co-occurrence network.
# Drop terms that are absent from 98% or more of the documents, then convert
# what is left to a dense matrix (rows = terms, columns = documents).
tdm2 <- removeSparseTerms(tdm, 0.98)
tdm2.mx <- as.matrix(tdm2)
dim(tdm2.mx)
## [1] 43 5548
# Term-by-term matrix: entry (i, j) is the sum over documents of the product
# of the counts of term i and term j. The diagonal therefore holds each
# term's sum of squared counts.
termMatrix <- tdm2.mx %*% t(tdm2.mx)
termMatrix[5:10, 5:10]
## Terms
## Terms ad one voice now don like
## ad 393 20 6 26 21 37
## one 20 264 10 19 15 46
## voice 6 10 223 5 3 34
## now 26 19 5 336 8 22
## don 21 15 3 8 140 39
## like 37 46 34 22 39 618
# Build an undirected graph whose edge weights are the co-occurrence values.
# graph_from_adjacency_matrix() is the current name of the deprecated
# graph.adjacency().
g <- graph_from_adjacency_matrix(
  termMatrix,
  weighted = TRUE,
  mode = "undirected"
)
# The non-zero diagonal produces self-loops; simplify() removes them.
g <- simplify(g)
V(g)$degree <- degree(g, mode = "all")
# Vertex parameters must carry the `vertex.` prefix when passed to plot();
# a bare `size` argument is not a vertex parameter, so it would not scale the
# vertices by degree.
# NOTE(review): degree can reach the number of terms minus one, which may give
# large, overlapping vertices - rescale if the plot is too crowded.
plot(
  g,
  vertex.size = V(g)$degree,
  edge.arrow.size = 0.05,
  edge.width = 0.07
)