1 Install and Load R Packages
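
The package-loading chunk is not echoed in this report; below is a minimal sketch of the libraries implied by the functions called later (inferred from usage, assuming the packages are already installed):

library(readr)        # read_csv()
library(tm)           # Corpus(), tm_map(), TermDocumentMatrix(), findAssocs()
library(stringi)      # stri_trans_tolower()
library(ggplot2)      # term-frequency charts
library(gridExtra)    # grid.arrange()
library(DT)           # datatable()
library(wordcloud)    # wordcloud(); attaches RColorBrewer for brewer.pal()
library(topicmodels)  # LDA(), terms(), topics()
library(tidytext)     # tidy() methods for LDA objects
library(dplyr)        # group_by(), top_n(), arrange(), %>%
library(twitteR)      # attached earlier in the session (masked by rtweet below)
library(purrr)        # likewise masked by rtweet below
library(syuzhet)      # get_nrc_sentiment()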

tweets <- read_csv("C:/Users/malvika mathur/Desktop/GreatLakes/web analytics/assignment/file_hashtag_justdoit_tweets.csv")
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   text = col_character(),
##   favorited = col_logical(),
##   favoriteCount = col_integer(),
##   replyToSN = col_character(),
##   created = col_datetime(format = ""),
##   truncated = col_logical(),
##   replyToSID = col_double(),
##   id = col_double(),
##   replyToUID = col_double(),
##   statusSource = col_character(),
##   screenName = col_character(),
##   retweetCount = col_integer(),
##   isRetweet = col_logical(),
##   retweeted = col_logical(),
##   longitude = col_double(),
##   latitude = col_double()
## )
#tweets<-tweets[1:1000,]
dim(tweets)
## [1] 2000   17
tweets$created <- as.Date(tweets$created, format = "%Y-%m-%d")  # %Y for a four-digit year ("%y" would expect a two-digit year)
tweets$text <- as.character(tweets$text)
str(tweets)
## Classes 'tbl_df', 'tbl' and 'data.frame':    2000 obs. of  17 variables:
##  $ X1           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ text         : chr  "RT @Nike: Don’t ask if your dreams are crazy. Ask if they’re crazy enough. #justdoit https://t.co/Wd5L42egV8" "RT @Statecraft_Huff: Paint Kap as a person for all the people even when he demonizes good cops and try’s to sel"| __truncated__ "RT @JessieJaneDuff: Believe in Jesus Christ. \r\n\r\nEven if it means sacrificing everything. #JustDoIt https:/"| __truncated__ "RT @Nike: Don’t ask if your dreams are crazy. Ask if they’re crazy enough. #justdoit https://t.co/Wd5L42egV8" ...
##  $ favorited    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ favoriteCount: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ replyToSN    : chr  NA NA NA NA ...
##  $ created      : Date, format: "2018-09-08" "2018-09-08" ...
##  $ truncated    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ replyToSID   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ id           : num  1.04e+18 1.04e+18 1.04e+18 1.04e+18 1.04e+18 ...
##  $ replyToUID   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ statusSource : chr  "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" ...
##  $ screenName   : chr  "canyourel8now" "Schoony76" "bk_nlg" "dingwallellie" ...
##  $ retweetCount : int  199724 1 1092 199724 199724 610 199724 199724 199724 199724 ...
##  $ isRetweet    : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ retweeted    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ longitude    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ latitude     : num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 17
##   .. ..$ X1           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ text         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ favorited    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_logical" "collector"
##   .. ..$ favoriteCount: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ replyToSN    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ created      :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
##   .. ..$ truncated    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_logical" "collector"
##   .. ..$ replyToSID   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ id           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ replyToUID   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ statusSource : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ screenName   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ retweetCount : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ isRetweet    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_logical" "collector"
##   .. ..$ retweeted    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_logical" "collector"
##   .. ..$ longitude    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ latitude     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
summary(tweets)
##        X1             text           favorited       favoriteCount   
##  Min.   :   1.0   Length:2000        Mode :logical   Min.   : 0.000  
##  1st Qu.: 500.8   Class :character   FALSE:2000      1st Qu.: 0.000  
##  Median :1000.5   Mode  :character                   Median : 0.000  
##  Mean   :1000.5                                      Mean   : 0.062  
##  3rd Qu.:1500.2                                      3rd Qu.: 0.000  
##  Max.   :2000.0                                      Max.   :50.000  
##                                                                      
##   replyToSN            created           truncated      
##  Length:2000        Min.   :2018-09-08   Mode :logical  
##  Class :character   1st Qu.:2018-09-08   FALSE:1919     
##  Mode  :character   Median :2018-09-08   TRUE :81       
##                     Mean   :2018-09-08                  
##                     3rd Qu.:2018-09-08                  
##                     Max.   :2018-09-08                  
##                                                         
##    replyToSID              id              replyToUID       
##  Min.   :1.037e+18   Min.   :1.038e+18   Min.   :1.918e+06  
##  1st Qu.:1.038e+18   1st Qu.:1.038e+18   1st Qu.:2.507e+07  
##  Median :1.038e+18   Median :1.038e+18   Median :2.340e+08  
##  Mean   :1.038e+18   Mean   :1.038e+18   Mean   :8.759e+16  
##  3rd Qu.:1.038e+18   3rd Qu.:1.038e+18   3rd Qu.:7.105e+08  
##  Max.   :1.038e+18   Max.   :1.038e+18   Max.   :1.003e+18  
##  NA's   :1965                            NA's   :1957       
##  statusSource        screenName         retweetCount    isRetweet      
##  Length:2000        Length:2000        Min.   :     0   Mode :logical  
##  Class :character   Class :character   1st Qu.:   195   FALSE:184      
##  Mode  :character   Mode  :character   Median :  2143   TRUE :1816     
##                                        Mean   : 90490                  
##                                        3rd Qu.:199728                  
##                                        Max.   :365972                  
##                                                                        
##  retweeted         longitude         latitude    
##  Mode :logical   Min.   :-95.41   Min.   :27.97  
##  FALSE:2000      1st Qu.:-85.93   1st Qu.:29.36  
##                  Median :-78.95   Median :34.91  
##                  Mean   :-81.07   Mean   :35.28  
##                  3rd Qu.:-74.10   3rd Qu.:40.82  
##                  Max.   :-70.97   Max.   :43.30  
##                  NA's   :1996     NA's   :1996

2 Create a corpus from the imported file

A corpus is a collection of texts, usually by a single author or on a single subject. R text-mining packages use the term for any set of texts that you consider related.

#Data Preprocessing
set.seed(100)

# sample() applied to a data frame permutes its columns (here all 17 of them)
sample = sample(tweets, (length(tweets)))
sample
## # A tibble: 2,000 x 17
##    created    replyToSN      id    X1 truncated latitude retweeted
##    <date>     <chr>       <dbl> <int> <lgl>        <dbl> <lgl>    
##  1 2018-09-08 <NA>      1.04e18     1 FALSE           NA FALSE    
##  2 2018-09-08 <NA>      1.04e18     2 FALSE           NA FALSE    
##  3 2018-09-08 <NA>      1.04e18     3 FALSE           NA FALSE    
##  4 2018-09-08 <NA>      1.04e18     4 FALSE           NA FALSE    
##  5 2018-09-08 <NA>      1.04e18     5 FALSE           NA FALSE    
##  6 2018-09-08 <NA>      1.04e18     6 FALSE           NA FALSE    
##  7 2018-09-08 <NA>      1.04e18     7 FALSE           NA FALSE    
##  8 2018-09-08 <NA>      1.04e18     8 FALSE           NA FALSE    
##  9 2018-09-08 <NA>      1.04e18     9 FALSE           NA FALSE    
## 10 2018-09-08 <NA>      1.04e18    10 FALSE           NA FALSE    
## # ... with 1,990 more rows, and 10 more variables: favoriteCount <int>,
## #   longitude <dbl>, text <chr>, statusSource <chr>, screenName <chr>,
## #   replyToSID <dbl>, retweetCount <int>, favorited <lgl>,
## #   replyToUID <dbl>, isRetweet <lgl>
(n.tweet <- length(tweets))  # length() counts a data frame's columns; nrow(tweets) gives the number of tweets (2000)
## [1] 17
# Create document corpus with tweet text
corpus<- Corpus(VectorSource(tweets$text)) 
writeLines(strwrap(corpus[[750]]$content,60))
## RT @Nike: Don’t ask if your dreams are crazy. Ask if
## they’re crazy enough. #justdoit https://t.co/Wd5L42egV8

3 Data Preprocessing and Cleaning

Preprocessing the text can dramatically improve the performance of the bag-of-words method (or, for that matter, any method). The first step is creating a corpus, which, in simple terms, is nothing but a collection of text documents. Once the corpus is created, we are ready for preprocessing.

First, let us remove punctuation. The basic approach is to remove everything that isn't a standard number or letter. Bear in mind that punctuation can sometimes carry real information, as in web addresses, where the punctuation defines the URL, so its removal should be tailored to the specific problem. In our case we remove all punctuation, which is why the URLs are stripped first (section 3.1.1 below).
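
To see why that ordering matters, here is a quick illustration on a made-up string (hypothetical tweet text, not from the data):

x <- "Check this out! https://t.co/abc123 #justdoit"
gsub("[^[:alpha:][:space:]]*", "", x)  # punctuation first: URL debris "httpstcoabc" survives
gsub("http[^[:space:]]*", "", x)       # URL first (as in 3.1.1): the link is removed cleanly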

Next, we convert the text to lowercase so that the same word is not counted as two different terms because of case.

Another preprocessing task is removing unhelpful terms. Many words occur very frequently but carry meaning only inside a sentence; these are called stop words. Examples are ‘the’, ‘is’, ‘at’, and ‘which’. These words are unlikely to improve our ability to understand sentiment, so we remove them to reduce the size of the data.

3.1 Text preprocessing and cleaning

Transformations are done via the tm_map() function, which applies (maps) a function to all elements of the corpus. All transformations work on single text documents; tm_map() simply applies them to every document in the corpus.

3.1.1 Remove URLs

library(rtweet)
## 
## Attaching package: 'rtweet'
## The following object is masked from 'package:syuzhet':
## 
##     get_tokens
## The following object is masked from 'package:purrr':
## 
##     flatten
## The following object is masked from 'package:twitteR':
## 
##     lookup_statuses
#remove URLs
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)  # strip "http..." up to the next whitespace
corpus <- tm_map(corpus, content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(removeURL)):
## transformation drops documents

3.1.2 Remove Punctuation

#removing punctuation
remove_punct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)  # keep only letters and whitespace (digits go too)
corpus <- tm_map(corpus, content_transformer(remove_punct))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(remove_punct)):
## transformation drops documents

3.1.3 Strip whitespace

#triming extra spaces
corpus = tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation
## drops documents

3.1.4 Remove the @ (usernames)

removeUsername <- function(x) gsub("@[^[:space:]]*", "", x)
# note: the punctuation step above already stripped the "@", so handle names survive
# here as plain words; run this before remove_punct to drop the handles themselves
corpus <- tm_map(corpus, content_transformer(removeUsername))
## Warning in tm_map.SimpleCorpus(corpus,
## content_transformer(removeUsername)): transformation drops documents
writeLines(strwrap(corpus[[750]]$content,60))
## RT Nike Dont ask if your dreams are crazy Ask if theyre
## crazy enough justdoit

3.2 Text normalization

The process of normalization broadly refers to the transformation of words into a more uniform form.
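
Lowercasing (next) is the only normalization applied in this report; stemming is another common normalization step, shown here only as a sketch and deliberately left commented out, since it is not applied to the corpus below:

# stemDocument() (tm's Porter stemmer) cuts words to their roots, e.g. "crazy" -> "crazi"
# corpus <- tm_map(corpus, stemDocument)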

3.2.1 Convert text to lowercase

#converting text to lower case
corpus <- tm_map(corpus, content_transformer(stri_trans_tolower)) 
## Warning in tm_map.SimpleCorpus(corpus,
## content_transformer(stri_trans_tolower)): transformation drops documents
writeLines(strwrap(corpus[[750]]$content,60))
## rt nike dont ask if your dreams are crazy ask if theyre
## crazy enough justdoit
# myCorpus = tm_map(myCorpus, content_transformer(tolower))

3.2.2 Remove stopwords

Stop words are common words that we are usually not interested in. Looking at the result of stopwords("english") shows exactly what is being removed.

Because stop words are so common in a language, their information value is near zero, so removing them before further analysis is useful.
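
A quick way to inspect what will be removed (output omitted):

length(stopwords('english'))    # size of tm's built-in English stop word list
head(stopwords('english'), 10)  # the first entries ("i", "me", "my", ...)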

#stopword treatment
corpus = tm_map(corpus, removeWords, stopwords('english'))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
writeLines(strwrap(corpus[[750]]$content,60))
## rt nike dont ask dreams crazy ask theyre crazy enough
## justdoit
# specify your stopwords as a character vector
corpus <- tm_map(corpus, removeWords, c("keep", "check", "can","just","isnt","hey","ask","theyr","dont","theyre","cmon","htt","everything","even","enough","rt")) 
## Warning in tm_map.SimpleCorpus(corpus, removeWords, c("keep", "check",
## "can", : transformation drops documents

3.2.3 Remove single-letter words

removeSingle <- function(x) gsub(" . ", " ", x)  # drop a single character flanked by spaces ("." matches any one character)
corpus <- tm_map(corpus, content_transformer(removeSingle))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(removeSingle)):
## transformation drops documents
writeLines(strwrap(corpus[[750]]$content,60))
## nike dreams crazy crazy justdoit
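
The space-delimited pattern above misses single letters at the start or end of a string and every second one in a run of singles (each match consumes its trailing space); a word-boundary alternative, as a sketch:

removeSingle2 <- function(x) gsub("\\b[[:alpha:]]\\b", "", x)  # any standalone letter
removeSingle2("a b and c")  # "  and " -- follow with stripWhitespace to collapse the gaps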

4 Build a term-document matrix and Explore frequent terms and their associations

Once we have preprocessed our data, we are ready to extract the word frequencies used in the analysis. The tm package provides a function called TermDocumentMatrix() that generates a matrix whose rows correspond to terms and whose columns correspond to documents, in our case tweets; its counterpart DocumentTermMatrix() produces the transpose, with documents as rows and terms as columns. The values in the matrix are counts of how many times each term appears in each tweet. The function TermDocumentMatrix() from the text mining package can be used as follows:

dtm <- TermDocumentMatrix(corpus)   # despite the name, terms are rows and tweets are columns
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)   # row sums = corpus-wide term frequencies
d <- data.frame(word = names(v), freq = v)
datatable(head(d, 10))
# keep words of every length (tm drops words under 3 characters by default)
tdm <- TermDocumentMatrix(corpus, control = list(wordLengths = c(1, Inf)))
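
A quick check of the orientation described above (terms as rows, tweets as columns), as a sketch:

dim(tdm)          # c(distinct terms, number of tweets)
head(Terms(tdm))  # row labels are words
head(Docs(tdm))   # column labels are tweet ids ("1", "2", ...)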

4.1 Find the terms used most frequently

dtf <- DocumentTermMatrix(corpus)  # document-term orientation (not used below)
freq.terms <- findFreqTerms(tdm, lowfreq = 25)
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq > 25)
df <- data.frame(term = names(term.freq), freq = term.freq)
ggplot(df, aes(reorder(term, freq), freq)) + theme_bw() + geom_bar(stat = "identity") + coord_flip() +
  labs(title = "Term Frequency Chart", x = "Terms", y = "Term Counts")

4.2 Frequency analysis

(freq.terms <- findFreqTerms(tdm, lowfreq = 10))
##   [1] "crazy"           "dreams"          "justdoit"       
##   [4] "nike"            "good"            "people"         
##   [7] "believe"         "christ"          "jessiejaneduff" 
##  [10] "jesus"           "means"           "sacrificing"    
##  [13] "catfish"         "nevschulman"     "seriously"      
##  [16] "constitution"    "defend"          "fought"         
##  [19] "protect"         "standing"        "states"         
##  [22] "thank"           "united"          "votevets"       
##  [25] "ad"              "colin"           "kaepernick"     
##  [28] "larryelder"      "campaign"        "captainslogo"   
##  [31] "company"         "data"            "jumped"         
##  [34] "nikes"           "online"          "sales"          
##  [37] "show"            "takeaknee"       "unveiled"       
##  [40] "unpatriotic"     "take"            "great"          
##  [43] "something"       "going"           "leave"          
##  [46] "right"           "soim"            "themaddyallen"  
##  [49] "look"            "brunoamato"      "co"             
##  [52] "control"         "final"           "go"             
##  [55] "held"            "history"         "houses"         
##  [58] "never"           "republicans"     "theyll"         
##  [61] "year"            "getting"         "pair"           
##  [64] "shoes"           "hilarious"       "nfl"            
##  [67] "removed"         "saturday"        "adambaldwin"    
##  [70] "dems"            "hard"            "platform"       
##  [73] "please"          "run"             "socialism"      
##  [76] "alexmorgan"      "family"          "part"           
##  [79] "proud"           "like"            "thats"          
##  [82] "yeah"            "amp"             "im"             
##  [85] "made"            "one"             "social"         
##  [88] "racist"          "back"            "ever"           
##  [91] "new"             "week"            "think"          
##  [94] "yall"            "vote"            "stand"          
##  [97] "america"         "buy"             "said"           
## [100] "lets"            "make"            "best"           
## [103] "nmupdates"       "dream"           "serenawilliams" 
## [106] "hating"          "forget"          "georgebaptiste" 
## [109] "boycottnike"     "get"             "lvnancy"        
## [112] "nothing"         "always"          "money"          
## [115] "know"            "trump"           "problem"        
## [118] "today"           "realdonaldtrump" "barely"         
## [121] "coditions"       "donating"        "esoundofsilence"
## [124] "homeless"        "many"            "tgirlmiller"    
## [127] "veterans"        "worn"            "barmore"        
## [130] "boys"            "dad"             "grab"           
## [133] "live"            "mom"             "need"           
## [136] "quit"            "soy"             "still"          
## [139] "worrying"        "yourse"          "youre"          
## [142] "asked"           "clerk"           "ii"             
## [145] "overheard"       "refund"          "returned"       
## [148] "wrong"           "yesterday"       "love"           
## [151] "side"            "sneakers"        "someone"        
## [154] "cop"
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq > 10)
df1 <- data.frame(term = names(term.freq), freq= term.freq)

(freq.terms <- findFreqTerms(tdm, lowfreq = 55))
##  [1] "crazy"          "dreams"         "justdoit"       "nike"          
##  [5] "people"         "believe"        "christ"         "jessiejaneduff"
##  [9] "jesus"          "means"          "sacrificing"    "catfish"       
## [13] "nevschulman"    "seriously"      "constitution"   "defend"        
## [17] "fought"         "protect"        "standing"       "states"        
## [21] "thank"          "united"         "votevets"       "kaepernick"    
## [25] "campaign"       "nikes"          "something"
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq > 55)
df2 <- data.frame(term = names(term.freq), freq= term.freq)

(freq.terms <- findFreqTerms(tdm, lowfreq = 85))
##  [1] "crazy"          "dreams"         "justdoit"       "nike"          
##  [5] "people"         "believe"        "christ"         "jessiejaneduff"
##  [9] "jesus"          "means"          "sacrificing"    "kaepernick"    
## [13] "something"
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq > 85)
df3 <- data.frame(term = names(term.freq), freq= term.freq)

4.3 Plotting the graphs of frequent terms

p1 <- ggplot(df1, aes(reorder(term, freq), freq)) + theme_bw() + geom_bar(stat = "identity") + coord_flip() +
  labs(title = "@10", x = "Terms", y = "Term Counts") + theme(axis.text.y = element_text(size = 7))

p2 <- ggplot(df, aes(reorder(term, freq), freq)) + theme_bw() + geom_bar(stat = "identity") + coord_flip() +
  labs(title = "@25", x = "Terms", y = "Term Counts") + theme(axis.text.y = element_text(size = 7))

p3 <- ggplot(df2, aes(reorder(term, freq), freq)) + theme_bw() + geom_bar(stat = "identity") + coord_flip() +
  labs(title = "@55", x = "Terms", y = "Term Counts")

p4 <- ggplot(df3, aes(reorder(term, freq), freq)) + theme_bw() + geom_bar(stat = "identity") + coord_flip() +
  labs(title = "@85", x = "Terms", y = "Term Counts")

grid.arrange(p1,p2,ncol=2)

grid.arrange(p3,p4,ncol=2)

4.4 Plot word frequencies

barplot(d[1:10,]$freq, las = 2, names.arg = d[1:10,]$word,
        col ="lightblue", main ="Most frequent words",
        ylab = "Word frequencies")

5 Generate the word cloud

par(mfrow=c(3,2))
library(wordcloud)
m <- as.matrix(tdm)
# calculate the frequency of words and sort it by frequency
word.freq <- sort(rowSums(m), decreasing = F)
# colors
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:4)]
# plot word cloud
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 3,
random.order = F, colors = pal)
word.freq <-sort(rowSums(as.matrix(tdm)), decreasing= F)
pal<- brewer.pal(8, "Dark2")
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 2, random.order = F, colors = pal, max.words = 100)

set.seed(1234)

wordcloud(words =names(word.freq) , freq = word.freq, min.freq = 10,
          max.words=150, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(6, "Dark2"))

wordcloud(words =names(word.freq) , freq = word.freq, min.freq = 30,
          max.words=100, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(7, "Dark2"))

wordcloud(words =names(word.freq) , freq = word.freq, min.freq = 40,
          max.words=50, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))


wordcloud(words =names(word.freq) , freq = word.freq, min.freq = 20,
          max.words=25, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(3, "Dark2"))

5.1 Find association with a specific keyword in the tweets

list1<- findAssocs(tdm, "means", 0.2)
corrdf1 <- t(data.frame(t(sapply(list1,c))))
corrdf1
##                means
## believe         0.93
## sacrificing     0.92
## christ          0.71
## jessiejaneduff  0.71
## jesus           0.71
## something       0.54
## kaepernick      0.33
## adambaldwin     0.21
## dems            0.21
## platform        0.21
## run             0.21
## socialism       0.21
barplot(t(as.matrix(corrdf1)), beside = TRUE, xlab = "Words", ylab = "Corr", col = "blue", main = "means", border = "black")

list1<- findAssocs(tdm, "believe", 0.2)
corrdf1 <- t(data.frame(t(sapply(list1,c))))
corrdf1
##                believe
## sacrificing       0.96
## means             0.93
## christ            0.75
## jessiejaneduff    0.74
## jesus             0.74
## something         0.53
## kaepernick        0.35
barplot(t(as.matrix(corrdf1)), beside = TRUE, xlab = "Words", ylab = "Corr", col = "yellow", main = "believe", border = "black")

list1<- findAssocs(tdm, "crazy", 0.2)
corrdf1 <- t(data.frame(t(sapply(list1,c))))
corrdf1
##          crazy
## dreams    0.99
## nike      0.65
## justdoit  0.31
barplot(t(as.matrix(corrdf1)), beside = TRUE, xlab = "Words", ylab = "Corr", col = "green", main = "crazy", border = "black")

6 Clustering

Since a large number of similar tweets are generated with #justdoit, it is challenging to draw meaningful interpretations from the volume of data that has to be processed, so we group related text together. Hierarchical clustering builds clusters at successive levels; the R function hclust(), which uses the agglomerative method, performs it. To prepare the input, the term-document matrix is converted to a regular matrix after extremely sparse terms are removed at a specified sparsity cutoff, which means the frequent terms (rather than individual tweets) are what get clustered. Ward's method is used, and the result is presented below as a dendrogram.

# remove sparse terms
tdm2 <- removeSparseTerms(tdm, sparse = 0.95)
m2 <- as.matrix(tdm2)
# cluster terms
distMatrix <- dist(scale(m2))
fit <- hclust(distMatrix, method = "ward.D")  # "ward" was renamed to "ward.D"
plot(fit)
rect.hclust(fit, k = 4, border = "red") # cut the tree into 4 clusters
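
rect.hclust() only draws the boxes; to work with the clusters programmatically, cutree() returns each term's cluster membership, a sketch:

groups <- cutree(fit, k = 4)  # cluster id for every term kept in tdm2
split(names(groups), groups)  # list the terms belonging to each of the 4 clusters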

# library(stats)
# mymeans <- kmeans(dtm, 5)
# mymeans
# summary(mymeans)
# 
# freq <- findFreqTerms(dtm,10)
# freq
# m2 <- as.matrix(dtm)
# dm <- dist(scale(m2))
# fit <- hclust(dm, method="ward")
# plot(fit)

7 Topic modeling using LDA

#dtm <- as.DocumentTermMatrix(tdm)

# note: tdm keeps terms as rows, so these row totals are per-term counts and the
# LDA below ends up treating terms as "documents" (see the corrected sketch further down)
rowTotals <- apply(tdm, 1, sum)

#NullDocs <- tdm[rowTotals==0, ]
dtm <- tdm[rowTotals > 0, ]

# if (length(NullDocs$dimnames$Docs) > 0) {
#   tweets <- tweets[-as.numeric(NullDocs$dimnames$Docs),]
#  }

dim(tweets)
## [1] 2000   17
lda <- LDA(dtm, k = 5) # fit 5 topics
term <- terms(lda, 7) # first 7 terms of every topic
(term <- apply(term, MARGIN = 2, paste, collapse = ", "))
##                                  Topic 1 
## "287, 590, 1094, 1161, 1882, 1967, 1939" 
##                                  Topic 2 
##    "1889, 753, 1732, 478, 75, 613, 1322" 
##                                  Topic 3 
##  "1630, 1806, 1663, 882, 256, 1001, 400" 
##                                  Topic 4 
## "1772, 1442, 1320, 1296, 181, 1708, 391" 
##                                  Topic 5 
##   "1970, 275, 1250, 346, 1187, 263, 496"
topics<- topics(lda)
# plot (date, ..count.., data=topics, geom ="density", fill= term[topic], position="stack")
# 
# qptopics<- data.frame(date=(tweets$created), topic = topics)
# qlot (date, ..count.., data=topics, geom ="density", fill= term[topic], alpha=I(0.9)) +  theme(legend.position="bottom") + scale_fill_manual(values=c("#D50000", "#3F51B5", "#8BC34A", "#FFC107", "#607D8B", "#00BCD4"), name="Topics", labels=names(term))
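
Because the chunk above feeds the term-document matrix to LDA(), the reported "terms" are tweet row numbers, and the gamma table below lists words as documents. A sketch of the conventional orientation, assuming tm's as.DocumentTermMatrix() coercion (t(tdm) also works):

dtm2 <- as.DocumentTermMatrix(tdm)       # transpose: tweets as rows, terms as columns
dtm2 <- dtm2[apply(dtm2, 1, sum) > 0, ]  # drop tweets emptied by the cleaning steps
lda2 <- LDA(dtm2, k = 5, control = list(seed = 1234))
terms(lda2, 7)                           # now returns actual words per topic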

tidy_lda <- tidy(lda)
tidy_lda
## # A tibble: 10,000 x 3
##    topic term      beta
##    <int> <chr>    <dbl>
##  1     1 1     1.24e- 3
##  2     2 1     1.90e-44
##  3     3 1     7.08e-38
##  4     4 1     1.49e-39
##  5     5 1     2.84e-49
##  6     1 2     8.28e-88
##  7     2 2     5.26e- 3
##  8     3 2     2.28e-18
##  9     4 2     8.56e-42
## 10     5 2     5.79e-30
## # ... with 9,990 more rows
top_terms <- tidy_lda %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms
## # A tibble: 857 x 3
##    topic term     beta
##    <int> <chr>   <dbl>
##  1     1 287   0.00199
##  2     1 590   0.00199
##  3     1 1094  0.00199
##  4     1 1161  0.00199
##  5     1 1882  0.00199
##  6     1 1967  0.00199
##  7     1 1939  0.00149
##  8     1 1     0.00124
##  9     1 4     0.00124
## 10     1 5     0.00124
## # ... with 847 more rows
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  group_by(topic, term) %>%    
  arrange(desc(beta)) %>%  
  ungroup() %>%
  mutate(term = factor(paste(term, topic, sep = "__"), 
                       levels = rev(paste(term, topic, sep = "__")))) %>%
  ggplot(aes(term, beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
  labs(title = "Top 10 terms in each LDA topic",
       x = NULL, y = expression(beta)) +
  facet_wrap(~ topic, ncol = 4, scales = "free")

#topic probability distribution
lda_gamma <- tidy(lda, matrix = "gamma")
lda_gamma
## # A tibble: 9,450 x 3
##    document  topic   gamma
##    <chr>     <int>   <dbl>
##  1 crazy         1 1.000  
##  2 dreams        1 1.000  
##  3 justdoit      1 0.468  
##  4 nike          1 0.711  
##  5 americans     1 0.0178 
##  6 cops          1 0.0281 
##  7 demonizes     1 0.0393 
##  8 false         1 0.0393 
##  9 good          1 0.00784
## 10 kap           1 0.0281 
## # ... with 9,440 more rows
ggplot(lda_gamma, aes(gamma)) +
  geom_histogram() +
  scale_y_log10() +
  labs(title = "Distribution of probabilities for all topics",
       y = "Number of documents", x = expression(gamma))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(lda_gamma, aes(gamma, fill = as.factor(topic))) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(~ topic, ncol = 4) +
  scale_y_log10() +
  labs(title = "Distribution of probability for each topic",
       y = "Number of documents", x = expression(gamma))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 26 rows containing missing values (geom_bar).

print(term)
##                                  Topic 1 
## "287, 590, 1094, 1161, 1882, 1967, 1939" 
##                                  Topic 2 
##    "1889, 753, 1732, 478, 75, 613, 1322" 
##                                  Topic 3 
##  "1630, 1806, 1663, 882, 256, 1001, 400" 
##                                  Topic 4 
## "1772, 1442, 1320, 1296, 181, 1708, 391" 
##                                  Topic 5 
##   "1970, 275, 1250, 346, 1187, 263, 496"

8 Sentiment analysis

mysentiment <- get_nrc_sentiment(tweets$text)

# Get the sentiment score for each emotion
mysentiment.positive =sum(mysentiment$positive)
mysentiment.anger =sum(mysentiment$anger)
mysentiment.anticipation =sum(mysentiment$anticipation)
mysentiment.disgust =sum(mysentiment$disgust)
mysentiment.fear =sum(mysentiment$fear)
mysentiment.joy =sum(mysentiment$joy)
mysentiment.sadness =sum(mysentiment$sadness)
mysentiment.surprise =sum(mysentiment$surprise)
mysentiment.trust =sum(mysentiment$trust)
mysentiment.negative =sum(mysentiment$negative)
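
The ten sums above can also be computed in a single call; a compact equivalent, as a sketch:

sentiment.totals <- colSums(mysentiment)  # per-emotion column totals of the NRC scores
sentiment.totals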

# Create the bar chart
yAxis <- c(mysentiment.positive,
           mysentiment.anger,
           mysentiment.anticipation,
           mysentiment.disgust,
           mysentiment.fear,
           mysentiment.joy,
           mysentiment.sadness,
           mysentiment.surprise,
           mysentiment.trust,
           mysentiment.negative)

xAxis <- c("Positive","Anger","Anticipation","Disgust","Fear","Joy","Sadness",
           "Surprise","Trust","Negative")
colors <- c("green","red","blue","orange","red","green","orange","blue","green","red")
yRange <- range(0,yAxis)
barplot(yAxis, names.arg = xAxis, 
        xlab = "Emotional valence", ylab = "Score", main = "Twitter sentiment", 
        sub = "Nike", col = colors, border = "black", xpd = F, ylim = yRange,
        axisnames = T, cex.axis = 0.8, cex.sub = 0.8, col.sub = "blue")