This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Install necessary packages. Comment after installation

#install.packages('tm')
#install.packages('RColorBrewer')
#install.packages('wordcloud')

Include the packages.

library('tm')

## Loading required package: NLP

library('RColorBrewer')
library('wordcloud')

Process data

# entrepreneurshipData <- readRDS("entrepreneurship.RDS")
vData <- readRDS("Vaynerchuk.RDS")
# Etweets <- entrepreneurshipData$text
tweets <- vData$text


# swap out all non-alphanumeric characters
# Note that the definition of what constitutes a letter or a number or a punctuatution mark varies slightly depending upon your locale, so you may need to experiment a little to get exactly what you want.
# str_replace_all(tweets, "[^[:alnum:]]", " ")
# iconv(tweets, from = 'UTF-8', to = 'ASCII//TRANSLIT')
# Encoding(tweets)  <- "UTF-8"

# Function to clean tweets
clean.text = function(x)
{
  # remove rt
  x = gsub("rt", "", x)
  # remove at
  x = gsub("@\\w+", "", x)
  # remove punctuation
  x = gsub("[[:punct:]]", "", x)
  # remove numbers
  x = gsub("[[:digit:]]", "", x)
  # remove links http
  x = gsub("http\\w+", "", x)
  # remove tabs
  x = gsub("[ |\t]{2,}", "", x)
  # remove blank spaces at the beginning
  x = gsub("^ ", "", x)
  # remove blank spaces at the end
  x = gsub(" $", "", x)
  # tolower
#  x = tolower(x)
  return(x)
}

# clean tweets
tweets = clean.text(tweets)

Create word cloud of tweets

corpus = Corpus(VectorSource(tweets))

# create term-document matrix
tdm = TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths=c(3,20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE, 
  # tolower may cause trouble on Window because UTF-8 encoding, changed to FALSE  
    tolower = FALSE) )

# convert as matrix. It may consume near 1g of your RAM
tdm = as.matrix(tdm)

# get word counts in decreasing order
word_freqs = sort(rowSums(tdm), decreasing=TRUE) 

#check top 50 most mentioned words
head(word_freqs, 50)

##    Vaynerchuk          Gary         quote           The        legacy 
##           937           676           285           158           147 
##         every         think         youre       writing      RTPlease 
##           143           137           136           125           118 
##       dayGary           How           For     marketing           You 
##           118            94            61            55            53 
##          Make       content        become          best        mentor 
##            50            48            48            46            46 
##      Original          Your          will           one          ever 
##            44            43            43            43            42 
##        people          With       Passion    especially         cheap 
##            41            39            38            38            36 
##     Marketing       creator         model          name          role 
##            36            36            36            36            36 
##       special           can     RTTribute         Since       Empathy 
##            36            35            35            35            33 
##        Social          like pricelessGary        matter          work 
##            33            32            32            31            31 
##        social        hustle        talent           get         carry 
##            30            30            30            29            29

#remove the top words which don’t generate insights such as "the", "a", "and", etc.
word_freqs = word_freqs[-(1:2)]  #Here “1:5” is 1st-5th words in the list we want to remove 

# create a data frame with words and their frequencies
dm = data.frame(word=names(word_freqs), freq=word_freqs)

#Plot corpus in a clored graph; need RColorBrewer package

wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))

#check top 50 most mentioned words
head(word_freqs, 50)

##         quote           The        legacy         every         think 
##           285           158           147           143           137 
##         youre       writing      RTPlease       dayGary           How 
##           136           125           118           118            94 
##           For     marketing           You          Make       content 
##            61            55            53            50            48 
##        become          best        mentor      Original          Your 
##            48            46            46            44            43 
##          will           one          ever        people          With 
##            43            43            42            41            39 
##       Passion    especially         cheap     Marketing       creator 
##            38            38            36            36            36 
##         model          name          role       special           can 
##            36            36            36            36            35 
##     RTTribute         Since       Empathy        Social          like 
##            35            35            33            33            32 
## pricelessGary        matter          work        social        hustle 
##            32            31            31            30            30 
##        talent           get         carry      strategy           far 
##            30            29            29            29            28

# I see some words I don't know or understand, so I retrieve the tweets that have the words
# I retrieve all the tweets that have "nigeria" in it

I saw ‘marketing’ on the work cloud and want to tweets contain it

index = grep("marketing", tweets)
tweets[index]

##  [1] "Social marketing is now ajob Gary Vaynerchuk quote\naskTNT"                                                                    
##  [2] "RTSocial marketing is now ajob Gary Vaynerchuk quote"                                                                          
##  [3] "Social marketing is now ajob Gary Vaynerchuk quote"                                                                            
##  [4] "Social marketing is now ajob Gary Vaynerchuk quote"                                                                            
##  [5] "Content is king but marketing is queen and runs the householdGary Vaynerchuk"                                                  
##  [6] "RTGary Vaynerchuk Serial entrepreneur Social media celebrity Digital marketing rockstar And opening keynote speaker a"         
##  [7] "Gary Vaynerchuk Serial entrepreneur Social media celebrity Digital marketing rockstar And opening keynote speak"               
##  [8] "The Most Powerful Mindset For SuccessGary VaynerchukMotivational Talk \nmarketing motivation growth"                           
##  [9] "of the biggest marketing legends in theSeth Godin and Gary Vaynerchuk"                                                         
## [10] "The best marketing Gurus another fantastic guide\n\nDo give it a read people \n\n Sales Tips on How to Se"                     
## [11] "The best marketing strategy ever CARE Gary Vaynerchuk ThursdayMotivation"                                                      
## [12] "The best marketing strategy ever CAREGary Vaynerchuk entrepreneur speaker amp marketing expe\n\nmarketing businesstips digital"
## [13] "Content is king but marketing is queen and runs the householdGary Vaynerchuk"                                                  
## [14] "The best marketing strategy ever CARE Gary Vaynerchuk quote"                                                                   
## [15] "RTThe best marketing strategy ever CARE Gary Vaynerchuk WednesdayMotivation"                                                   
## [16] "The best marketing strategy ever CARE Gary Vaynerchuk WednesdayMotivation"                                                     
## [17] "The best marketing strategy ever CARE Gary Vaynerchuk FunFactFriday"                                                           
## [18] "Gary vaynerchuk has a great segment on why their marketing is so terrible and therefore"                                       
## [19] "What does little old me have in common with marketing gurus like Seth Godin Gary Vaynerchuk Steven Pressfield and"             
## [20] "The best marketing strategy ever CARE Gary Vaynerchuk"                                                                         
## [21] "The best marketing strategy ever CARE Gary Vaynerchuk quoteoftheday"                                                           
## [22] "The best marketing strategy ever CAREGary Vaynerchuk entrepreneur speaker amp marketing expe\n\nmarketing businesstips digital"
## [23] "Content is king but marketing is queen and runs the householdGary Vaynerchuk"                                                  
## [24] "The best marketing strategy ever CARE Gary Vaynerchuk quote"                                                                   
## [25] "RTThe best marketing strategy ever CAREGary Vaynerchuk"                                                                        
## [26] "The best marketing strategy ever CAREGary VaynerchukAt the end of the day its all about the clients"                           
## [27] "RTThe best marketing strategy ever CARE Gary Vaynerchuk quote"                                                                 
## [28] "Dont have a marketing strategy yet Check out this keynote with Gary Vaynerchuk on using social media to build awa"             
## [29] "Social marketing is now ajob Gary Vaynerchuk quote"                                                                            
## [30] "Social marketing is now ajob Gary Vaynerchuk quote"                                                                            
## [31] "The best marketing strategy ever CARE Gary Vaynerchuk quote"                                                                   
## [32] "The best marketing strategy ever CARE Gary Vaynerchuk"                                                                         
## [33] "The best marketing strategy ever CARE Gary Vaynerchuk quote"                                                                   
## [34] "Winners amp Losers Instagram Live Video Bacardi and Gary Vaynerchuksmm marketing"                                              
## [35] "Public speaker Gary Vaynerchuk talks about failure social media socialmediamarketing"                                          
## [36] "The best marketing strategy ever CARE Gary Vaynerchuk quote\n\nTipTuesday"                                                     
## [37] "Content is king but marketing is queen and runs the householdGary Vaynerchuk"                                                  
## [38] "The best marketing strategy ever CAREGary Vaynerchuk entrepreneur speaker amp marketing expe\n\nmarketing businesstips digital"
## [39] "Social marketing is now ajob Gary Vaynerchuk quote\nTipTuesday"                                                                
## [40] "The best marketing strategy ever CARE Gary Vaynerchuk quote"                                                                   
## [41] "Content is king but marketing is queen and runs the householdGary Vaynerchuk"                                                  
## [42] "The best marketing strategy ever CARE Gary Vaynerchuk"                                                                         
## [43] "The best marketing strategy ever CARE Gary Vaynerchuk quote\n\naskTNT"                                                         
## [44] "Content is king but marketing is queen and runs the household \nGary Vaynerchuk"                                               
## [45] "RTGary Vaynerchuk is the Kim Kardashian of marketing strategy This channel is a pa reality TVPa tutorial which fo"             
## [46] "The best marketing strategy ever CAREGary Vaynerchuk entrepreneur speaker amp marketing expe\n\nmarketing businesstips digital"
## [47] "The best marketing strategy ever CARE Gary Vaynerchuk quote"                                                                   
## [48] "Content is king but marketing is queen and runs the householdGary Vaynerchuk"                                                  
## [49] "RTDistribution is the Game Gary Vaynerchuk\n\nsocialmediamarketing garyvee garyvaynerchuk entrepreneur ent"

Prepare for Bigram

# Install the following packages 
library(dplyr)
library(tidyverse)      # data manipulation & plotting
library(stringr)        # text cleaning and regular expressions
library(tidytext)       # provides additional text mining functions

titles <- c("v")

books <- list(tweets)
  
series <- tibble()

 for(i in seq_along(titles)) {
        
        clean <- tibble(chapter = seq_along(books[[i]]),
                        text = books[[i]]) %>%
          # Number of gram
             unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
             mutate(book = titles[i]) %>%
             select(book, everything())

        series <- rbind(series, clean)
}

Bigram of the Data

temp1 = subset(series, book == 'v') %>%
        separate(bigram, c("word1", "word2"), sep = " ") %>%
        filter(!word1 %in% stop_words$word,
               !word2 %in% stop_words$word) %>%
        count(word1, word2, sort = TRUE)
temp1[1:20,]

## # A tibble: 20 x 3
##    word1                  word2          n
##    <chr>                  <chr>      <int>
##  1 gary                   vaynerchuk   606
##  2 vaynerchuk             quote        274
##  3 youre                  writing      119
##  4 daygary                vaynerchuk   118
##  5 vaynerchuk             original      55
##  6 social                 media         52
##  7 cheap                  passion       37
##  8 content                creator       36
##  9 role                   model         36
## 10 pricelessgary          vaynerchuk    32
## 11 hustle                 talent        29
## 12 marketing              strategy      26
## 13 decisions              gary          22
## 14 quote                  image         22
## 15 weekendgary            vaynerchuk    22
## 16 currencygary           vaynerchuk    21
## 17 original               film          21
## 18 image                  image         20
## 19 vaynerchukmotivational talk          20
## 20 care                   gary          19

See whether people talk about money in their tweets

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(dplyr)
library("plyr")

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise

## The following object is masked from 'package:purrr':
## 
##     compact

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

library("stringi")

money.words = scan('moneyWords.txt', what='character', comment.char=';')
score.topic = function(sentences, dict, .progress='none')
{
  
  # we got a vector of sentences. plyr will handle a list
  # or a vector as an "l" for us
  # we want a simple array of scores back, so we use
  # "l" + "a" + "ply" = "laply":
  scores = laply(sentences, function(sentence, dict) {
    
    # clean up sentences with R's regex-driven global substitute, gsub():
    sentence = gsub('[[:punct:]]', '', sentence)
    sentence = gsub('[[:cntrl:]]', '', sentence)
    sentence = gsub('\\d+', '', sentence)
    # and convert to lower case:
    # sentence = tolower(sentence)
    
    # split into words. str_split is in the stringr package
    word.list = str_split(sentence, '\\s+')
    # sometimes a list() is one level of hierarchy too much
    words = unlist(word.list)
    
    # compare our words to the dictionaries of positive & negative terms
    topic.matches = match(words, dict)
    
    # match() returns the position of the matched term or NA
    # we just want a TRUE/FALSE:
    topic.matches = !is.na(topic.matches)
    
    # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
    score = sum(topic.matches)
    
    return(score)
  }, dict, .progress=.progress )
  
  topicscores.df = data.frame(score=scores, text=sentences)
  return(topicscores.df)
}

topic.scores= score.topic(tweets, money.words, .progress='none')
# topic.scores= score.topic(Etweets, fear.words, .progress='none')

topic.mentioned = subset(topic.scores, score !=0)

N= nrow(topic.scores)
Nmentioned = nrow(topic.mentioned)

dftemp=data.frame(topic=c("Mentioned", "Not Mentioned"), 
                  number=c(Nmentioned,N-Nmentioned))

p <- plot_ly(data=dftemp, labels = ~topic, values = ~number, type = 'pie') %>%
  layout(title = 'Pie Chart of Tweets Talking about Money',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p

Check the emotion of the tweets

library(tidytext)

titles <- c("v")
books <- list(tweets)
series <- tibble()

# create a series of book with text lines
for(i in seq_along(titles)) {
  
  clean <- tibble(chapter = seq_along(books[[i]]),
                  text = books[[i]]) %>%
  #  unnest_tokens(word, text) %>%
    mutate(book = titles[i]) %>%
    select(book, everything())
  series <- rbind(series, clean)
}

# find tweets with "fear"
# other emotion to find
##########################
# anger     
# anticipation          
# disgust           
# fear              
# joy               
# sadness       
# surprise      
# trust
##########################

senti <- series %>%
        unnest_tokens(word, text) %>%
        inner_join(get_sentiments("nrc")) %>%
  filter(sentiment=="fear") %>%   # replace "fear" with other emotion words 
   group_by(chapter)

## Joining, by = "word"

sentitext = series[senti$chapter,]
sentitext$sentiment = senti$sentiment
sentitext

## # A tibble: 165 x 4
##    book  chapter text                                            sentiment
##    <chr>   <int> <chr>                                           <chr>    
##  1 v           3 "White lines in the sky \nDEDICATED TO OGGY\nP… fear     
##  2 v           6 "RTSuccessTRAIN\nCrushing It How Great Entrepr… fear     
##  3 v           7 "RTSuccessTRAIN\nCrushing It How Great Entrepr… fear     
##  4 v          31 "RTSuccessTRAIN\nCrushing It How Great Entrepr… fear     
##  5 v          32 "SuccessTRAIN\nCrushing It How Great Entrepren… fear     
##  6 v          35 "Hes RightIt boils down to this simple fact We… fear     
##  7 v          35 "Hes RightIt boils down to this simple fact We… fear     
##  8 v          35 "Hes RightIt boils down to this simple fact We… fear     
##  9 v          45 I blocked notorious charlatan Gary Vaynerchuk … fear     
## 10 v          45 I blocked notorious charlatan Gary Vaynerchuk … fear     
## # ... with 155 more rows

Create word cloud of tweets showing fear

corpus = Corpus(VectorSource(sentitext$text))

# create term-document matrix
tdm = TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths=c(3,20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE, 
  # tolower may cause trouble on Window because UTF-8 encoding, changed to FALSE  
    tolower = FALSE) )

# convert as matrix. It may consume near 1g of your RAM
tdm = as.matrix(tdm)

# get word counts in decreasing order
word_freqs = sort(rowSums(tdm), decreasing=TRUE) 

#check top 50 most mentioned words
head(word_freqs, 50)

##     Vaynerchuk           Gary          quote            How          think 
##             83             80             22             19             18 
##         social        success         people        failure          media 
##             18             17             17             17             17 
##            Its         afraid            sta           hate         Afraid 
##             16             16             16             16             15 
##            Hes        RightIt          boils          crazy           fact 
##             15             15             15             15             15 
##         simple            win           want           like            You 
##             15             15             15             14             14 
##        speaker          talks           time        jakpost       Crushing 
##             14             14             12             12             11 
##           Your           shit       RTPublic RTSuccessTRAIN          There 
##             10             10             10              9              9 
##            bad            The businessunless       mediocre          never 
##              9              9              8              8              8 
##            one          WATCH          Watch           Dont            God 
##              8              8              8              7              7 
##        content            act         Public        Digital      Marketing 
##              7              7              7              7              7

#remove the top words which don’t generate insights such as "the", "a", "and", etc.
word_freqs = word_freqs[-(1:5)]  #Here “1:5” is 1st-5th words in the list we want to remove 

# create a data frame with words and their frequencies
dm = data.frame(word=names(word_freqs), freq=word_freqs)

#Plot corpus in a clored graph; need RColorBrewer package

wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))

#check top 50 most mentioned words
head(word_freqs, 50)

##              social             success              people 
##                  18                  17                  17 
##             failure               media                 Its 
##                  17                  17                  16 
##              afraid                 sta                hate 
##                  16                  16                  16 
##              Afraid                 Hes             RightIt 
##                  15                  15                  15 
##               boils               crazy                fact 
##                  15                  15                  15 
##              simple                 win                want 
##                  15                  15                  15 
##                like                 You             speaker 
##                  14                  14                  14 
##               talks                time             jakpost 
##                  14                  12                  12 
##            Crushing                Your                shit 
##                  11                  10                  10 
##            RTPublic      RTSuccessTRAIN               There 
##                  10                   9                   9 
##                 bad                 The      businessunless 
##                   9                   9                   8 
##            mediocre               never                 one 
##                   8                   8                   8 
##               WATCH               Watch                Dont 
##                   8                   8                   7 
##                 God             content                 act 
##                   7                   7                   7 
##              Public             Digital           Marketing 
##                   7                   7                   7 
##              Stream VaynerchukInterview               Build 
##                   7                   7                   6 
##                NONE              Theres 
##                   6                   6

# I see some words I don't know or understand, so I retrieve the tweets that have the words
# I retrieve all the tweets that have "nigeria" in it

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

CS695： working session R notebook

I saw ‘marketing’ on the work cloud and want to tweets contain it

See whether people talk about money in their tweets

Check the emotion of the tweets