Set your working directory
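
A minimal sketch (the path below is a placeholder; point it at the folder that holds your data):

# hypothetical path: replace with your own project folder
setwd("~/projects/ikea-asmr")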

Read your file

ikea <- read.csv(file="ikea_comments.csv", sep="\t", colClasses=c("Video.URL"="character", "Nickname"="character", "Text"="character"))
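
A quick sanity check that the columns came in as expected (a sketch):

str(ikea)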

Import the required packages

library(ggplot2)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(NLP)
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(wordcloud)
## Loading required package: RColorBrewer
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:igraph':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

For now, let's extract two variables, Video.URL (which will serve as the comment's "id") and Text

comments <- ikea %>% select(Video.URL, Text)
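
If you would rather work with the generic names "id" and "text", select() can rename while selecting (a sketch; comments2 is a hypothetical name):

comments2 <- ikea %>% select(id=Video.URL, text=Text)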

Text data is messy. It needs a lot of prep.

First, let's convert the text to lower case using base R's tolower() function

ikea$Text <- tolower(ikea$Text)
head(ikea$Text)
## [1] "i love this so much it relaxed me a lot"                                                             
## [2] "nani"                                                                                                
## [3] "im going to sleep listening to ikea commercial.... nice"                                             
## [4] "what pillow does a stomach sleeper use-"                                                             
## [5] "basic knowledge: ads can only be up to 30 seconds long-\n\nikea: hold my not put together table set-"
## [6] "why am i about to sub to ikea usa for one, 25 minute ad?"

Remove punctuation marks (the gsub() call below replaces them with spaces)

ikea$Text <- gsub("[[:punct:]]", " ", ikea$Text)
head(ikea$Text, 10)
##  [1] "i love this so much it relaxed me a lot"                                                             
##  [2] "nani"                                                                                                
##  [3] "im going to sleep listening to ikea commercial     nice"                                             
##  [4] "what pillow does a stomach sleeper use "                                                             
##  [5] "basic knowledge  ads can only be up to 30 seconds long \n\nikea  hold my not put together table set "
##  [6] "why am i about to sub to ikea usa for one  25 minute ad "                                            
##  [7] "is this what pewdiepie falls asleep to"                                                              
##  [8] "whispery voice to not wake up dorm mate "                                                            
##  [9] " laying on an ikea pillow right now   "                                                              
## [10] "imagine chick fil a whisper to you don t eat cows"

There are other ways to remove special characters.

For example, the tm package ships transformations such as removePunctuation() and stripWhitespace(); see the sketch after the corpus is created below.

A "corpus" is a collection of text documents.

The Corpus() function, together with VectorSource(), turns each comment into its own document within the corpus.

corpus1 <- Corpus(VectorSource(ikea$Text))
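
As mentioned above, tm's built-in transformations offer an alternative cleanup route on a corpus (a sketch; corpus.clean is a hypothetical name, and we continue with corpus1 below):

corpus.clean <- tm_map(corpus1, removePunctuation)    # drop punctuation
corpus.clean <- tm_map(corpus.clean, stripWhitespace) # collapse repeated spaces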

Let's remove stopwords using tm's built-in English stopword list. (The larger list from the SMART information retrieval system is also available via stopwords("SMART").)

See https://countwordsfree.com/stopwords for example stopword lists.

corpus2 <- tm_map(corpus1, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus1, removeWords, stopwords("english")):
## transformation drops documents

If you want, you can make your own stopword list and use it. Note that the call below starts again from corpus1, so combine both lists (see the sketch that follows) if you also want the built-in stopwords removed:

my.stops <- c("us", "our", "your")
corpus2 <- tm_map(corpus1, removeWords, my.stops)
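
A sketch that combines the built-in English list with your own custom list in one pass:

corpus2 <- tm_map(corpus1, removeWords, c(stopwords("english"), my.stops))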

Next, we build a term-document matrix from the corpus; the wordLengths control below keeps terms between 2 and 100 characters long.

Inspecting the term-document matrix displays a summary and a sample.

tdm <- TermDocumentMatrix(corpus2, control=list(wordLengths=c(2, 100)))
tdm
## <<TermDocumentMatrix (terms: 5418, documents: 5548)>>
## Non-/sparse entries: 33819/30025245
## Sparsity           : 100%
## Maximal term length: 48
## Weighting          : term frequency (tf)
inspect(tdm)
## <<TermDocumentMatrix (terms: 5418, documents: 5548)>>
## Non-/sparse entries: 33819/30025245
## Sparsity           : 100%
## Maximal term length: 48
## Weighting          : term frequency (tf)
## Sample             :
##        Docs
## Terms   1145 1735 2629 2725 300 4396 5164 5298 547 834
##   ad       0    0    0    0   0    0    0    0   0   0
##   asmr     1    0    1    0   0    4    3    2   0   0
##   can      3    0    0    0   0    0    0    0   0   0
##   good     0    0    0    1   0    0    0    0   0   0
##   ikea     0    4    1    1   0    0    0    1   0   0
##   just     0    2    0    0   0    0    4    0   0   0
##   like     2    2    0    0   0    0    1    0   0   0
##   love     0    0    0    0   0    0    0    0   0   0
##   now      0    1    0    3   0    0    0    0   0   0
##   video    0    0    0    0   0    0    0    1   0   0
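
If you just want every term above some frequency threshold, tm's findFreqTerms() pulls them straight from the term-document matrix (a sketch; the cutoff of 100 is arbitrary):

findFreqTerms(tdm, lowfreq=100)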

Now we need to convert the term-document matrix into a regular R matrix so we can compute word frequencies

tdm.mx <- as.matrix(tdm)
dim(tdm.mx)
## [1] 5418 5548
comments <- sort(rowSums(tdm.mx), decreasing=TRUE)  # note: overwrites the comments data frame from earlier
df <- data.frame(word=names(comments), freq=comments)
head(df, 20)
##              word freq
## ikea         ikea 1800
## asmr         asmr 1138
## like         like  510
## video       video  442
## just         just  360
## love         love  352
## ad             ad  341
## now           now  314
## good         good  302
## can           can  260
## make         make  253
## one           one  230
## really     really  225
## want         want  224
## actually actually  221
## well         well  203
## voice       voice  193
## get           get  185
## bed           bed  173
## please     please  172
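
Since ggplot2 is already loaded, a quick bar chart of the top 20 terms makes a nice companion to the table above (a sketch):

ggplot(head(df, 20), aes(x=reorder(word, freq), y=freq)) +
  geom_col() +
  coord_flip() +
  labs(x=NULL, y="frequency")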

Plot the word cloud

wordcloud(words=df$word, freq=df$freq, min.freq=2, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"), scale=c(3.5, 0.25))
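
Word placement in the cloud is randomized, so fix the seed first if you want a reproducible figure (a sketch):

set.seed(123)  # any fixed seed works; then call wordcloud() as above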

Optional!

If you want to reduce your network size, you can remove words that appear very rarely.

But you need to go back to the term-document matrix (the step before converting to a regular matrix).

If you have decided to go back, removeSparseTerms() will do this job for you.

If you set sparse=0.95, terms that are absent from more than 95% of documents are removed; in other words, only terms appearing in at least 5% of documents are kept.

For example, a term that appears in just 3 of 600 documents has a document frequency of 3/600 = 0.005, i.e., 0.5%, well under the 5% cutoff, so it would be dropped.

Below we use sparse=0.98, which keeps terms appearing in at least 2% of documents.

tdm2 <- removeSparseTerms(tdm, 0.98)

Yes, we again need to convert this new, reduced term-document matrix into a regular matrix

tdm2.mx <- as.matrix(tdm2)
dim(tdm2.mx)
## [1]   43 5548
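
To list which 43 terms survived the sparsity filter (a sketch):

sort(rownames(tdm2.mx))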

As you can see, the current matrix is still in two-mode (term-by-document) format.

Let's convert it into a one-mode, word-by-word matrix by multiplying it with its transpose; entry (i, j) then measures how often words i and j co-occur across comments.

termMatrix <- tdm2.mx %*% t(tdm2.mx)
termMatrix[5:10, 5:10]
##        Terms
## Terms    ad one voice now don like
##   ad    393  20     6  26  21   37
##   one    20 264    10  19  15   46
##   voice   6  10   223   5   3   34
##   now    26  19     5 336   8   22
##   don    21  15     3   8 140   39
##   like   37  46    34  22  39  618
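
To see why multiplying the matrix by its transpose gives co-occurrence counts, here is a toy check with a hypothetical two-term, three-document matrix:

toy <- matrix(c(1, 0, 2,
                1, 1, 0),
              nrow=2, byrow=TRUE,
              dimnames=list(c("ikea", "asmr"), NULL))
toy %*% t(toy)
##      ikea asmr
## ikea    5    1
## asmr    1    2

Each off-diagonal entry sums the product of the two terms' counts across documents; the diagonal holds each term's summed squared counts.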

Now you know the drill!

Let's convert the matrix into an igraph object using the graph.adjacency() function (newer igraph versions call it graph_from_adjacency_matrix()); simplify() then removes the self-loops that come from the matrix diagonal, along with any duplicate edges

g <- graph.adjacency(termMatrix, weighted=TRUE, mode="undirected")
g <- simplify(g)

Size the nodes based on degree (the number of connections each word has)

V(g)$degree <- degree(g, mode="all")

Plot the network

plot(g, vertex.size=V(g)$degree, edge.arrow.size=0.05, edge.width=0.07)  # plot.igraph sets node size via vertex.size, not size
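
If the default layout looks cluttered, a Fruchterman-Reingold layout with scaled-down nodes often reads better (a sketch; the seed and the /2 scaling are arbitrary choices):

set.seed(123)
plot(g, vertex.size=V(g)$degree/2, vertex.label.cex=0.8, layout=layout_with_fr(g), edge.width=0.07)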