This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
Install necessary packages. Comment after installation
#install.packages('tm')
#install.packages('RColorBrewer')
#install.packages('wordcloud')
Include the packages.
library('tm')
## Loading required package: NLP
library('RColorBrewer')
library('wordcloud')
Process data
# entrepreneurshipData <- readRDS("entrepreneurship.RDS")
vData <- readRDS("Vaynerchuk.RDS")
# Etweets <- entrepreneurshipData$text
tweets <- vData$text
# swap out all non-alphanumeric characters
# Note that the definition of what constitutes a letter or a number or a punctuatution mark varies slightly depending upon your locale, so you may need to experiment a little to get exactly what you want.
# str_replace_all(tweets, "[^[:alnum:]]", " ")
# iconv(tweets, from = 'UTF-8', to = 'ASCII//TRANSLIT')
# Encoding(tweets) <- "UTF-8"
# Function to clean tweets
clean.text = function(x)
{
# remove rt
x = gsub("rt", "", x)
# remove at
x = gsub("@\\w+", "", x)
# remove punctuation
x = gsub("[[:punct:]]", "", x)
# remove numbers
x = gsub("[[:digit:]]", "", x)
# remove links http
x = gsub("http\\w+", "", x)
# remove tabs
x = gsub("[ |\t]{2,}", "", x)
# remove blank spaces at the beginning
x = gsub("^ ", "", x)
# remove blank spaces at the end
x = gsub(" $", "", x)
# tolower
# x = tolower(x)
return(x)
}
# clean tweets
tweets = clean.text(tweets)
Create word cloud of tweets
corpus = Corpus(VectorSource(tweets))
# create term-document matrix
tdm = TermDocumentMatrix(
corpus,
control = list(
wordLengths=c(3,20),
removePunctuation = TRUE,
stopwords = c("the", "a", stopwords("english")),
removeNumbers = TRUE,
# tolower may cause trouble on Window because UTF-8 encoding, changed to FALSE
tolower = FALSE) )
# convert as matrix. It may consume near 1g of your RAM
tdm = as.matrix(tdm)
# get word counts in decreasing order
word_freqs = sort(rowSums(tdm), decreasing=TRUE)
#check top 50 most mentioned words
head(word_freqs, 50)
## Vaynerchuk Gary quote The legacy
## 937 676 285 158 147
## every think youre writing RTPlease
## 143 137 136 125 118
## dayGary How For marketing You
## 118 94 61 55 53
## Make content become best mentor
## 50 48 48 46 46
## Original Your will one ever
## 44 43 43 43 42
## people With Passion especially cheap
## 41 39 38 38 36
## Marketing creator model name role
## 36 36 36 36 36
## special can RTTribute Since Empathy
## 36 35 35 35 33
## Social like pricelessGary matter work
## 33 32 32 31 31
## social hustle talent get carry
## 30 30 30 29 29
#remove the top words which don’t generate insights such as "the", "a", "and", etc.
word_freqs = word_freqs[-(1:2)] #Here “1:5” is 1st-5th words in the list we want to remove
# create a data frame with words and their frequencies
dm = data.frame(word=names(word_freqs), freq=word_freqs)
#Plot corpus in a clored graph; need RColorBrewer package
wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))
#check top 50 most mentioned words
head(word_freqs, 50)
## quote The legacy every think
## 285 158 147 143 137
## youre writing RTPlease dayGary How
## 136 125 118 118 94
## For marketing You Make content
## 61 55 53 50 48
## become best mentor Original Your
## 48 46 46 44 43
## will one ever people With
## 43 43 42 41 39
## Passion especially cheap Marketing creator
## 38 38 36 36 36
## model name role special can
## 36 36 36 36 35
## RTTribute Since Empathy Social like
## 35 35 33 33 32
## pricelessGary matter work social hustle
## 32 31 31 30 30
## talent get carry strategy far
## 30 29 29 29 28
# I see some words I don't know or understand, so I retrieve the tweets that have the words
# I retrieve all the tweets that have "nigeria" in it
index = grep("marketing", tweets)
tweets[index]
## [1] "Social marketing is now ajob Gary Vaynerchuk quote\naskTNT"
## [2] "RTSocial marketing is now ajob Gary Vaynerchuk quote"
## [3] "Social marketing is now ajob Gary Vaynerchuk quote"
## [4] "Social marketing is now ajob Gary Vaynerchuk quote"
## [5] "Content is king but marketing is queen and runs the householdGary Vaynerchuk"
## [6] "RTGary Vaynerchuk Serial entrepreneur Social media celebrity Digital marketing rockstar And opening keynote speaker a"
## [7] "Gary Vaynerchuk Serial entrepreneur Social media celebrity Digital marketing rockstar And opening keynote speak"
## [8] "The Most Powerful Mindset For SuccessGary VaynerchukMotivational Talk \nmarketing motivation growth"
## [9] "of the biggest marketing legends in theSeth Godin and Gary Vaynerchuk"
## [10] "The best marketing Gurus another fantastic guide\n\nDo give it a read people \n\n Sales Tips on How to Se"
## [11] "The best marketing strategy ever CARE Gary Vaynerchuk ThursdayMotivation"
## [12] "The best marketing strategy ever CAREGary Vaynerchuk entrepreneur speaker amp marketing expe\n\nmarketing businesstips digital"
## [13] "Content is king but marketing is queen and runs the householdGary Vaynerchuk"
## [14] "The best marketing strategy ever CARE Gary Vaynerchuk quote"
## [15] "RTThe best marketing strategy ever CARE Gary Vaynerchuk WednesdayMotivation"
## [16] "The best marketing strategy ever CARE Gary Vaynerchuk WednesdayMotivation"
## [17] "The best marketing strategy ever CARE Gary Vaynerchuk FunFactFriday"
## [18] "Gary vaynerchuk has a great segment on why their marketing is so terrible and therefore"
## [19] "What does little old me have in common with marketing gurus like Seth Godin Gary Vaynerchuk Steven Pressfield and"
## [20] "The best marketing strategy ever CARE Gary Vaynerchuk"
## [21] "The best marketing strategy ever CARE Gary Vaynerchuk quoteoftheday"
## [22] "The best marketing strategy ever CAREGary Vaynerchuk entrepreneur speaker amp marketing expe\n\nmarketing businesstips digital"
## [23] "Content is king but marketing is queen and runs the householdGary Vaynerchuk"
## [24] "The best marketing strategy ever CARE Gary Vaynerchuk quote"
## [25] "RTThe best marketing strategy ever CAREGary Vaynerchuk"
## [26] "The best marketing strategy ever CAREGary VaynerchukAt the end of the day its all about the clients"
## [27] "RTThe best marketing strategy ever CARE Gary Vaynerchuk quote"
## [28] "Dont have a marketing strategy yet Check out this keynote with Gary Vaynerchuk on using social media to build awa"
## [29] "Social marketing is now ajob Gary Vaynerchuk quote"
## [30] "Social marketing is now ajob Gary Vaynerchuk quote"
## [31] "The best marketing strategy ever CARE Gary Vaynerchuk quote"
## [32] "The best marketing strategy ever CARE Gary Vaynerchuk"
## [33] "The best marketing strategy ever CARE Gary Vaynerchuk quote"
## [34] "Winners amp Losers Instagram Live Video Bacardi and Gary Vaynerchuksmm marketing"
## [35] "Public speaker Gary Vaynerchuk talks about failure social media socialmediamarketing"
## [36] "The best marketing strategy ever CARE Gary Vaynerchuk quote\n\nTipTuesday"
## [37] "Content is king but marketing is queen and runs the householdGary Vaynerchuk"
## [38] "The best marketing strategy ever CAREGary Vaynerchuk entrepreneur speaker amp marketing expe\n\nmarketing businesstips digital"
## [39] "Social marketing is now ajob Gary Vaynerchuk quote\nTipTuesday"
## [40] "The best marketing strategy ever CARE Gary Vaynerchuk quote"
## [41] "Content is king but marketing is queen and runs the householdGary Vaynerchuk"
## [42] "The best marketing strategy ever CARE Gary Vaynerchuk"
## [43] "The best marketing strategy ever CARE Gary Vaynerchuk quote\n\naskTNT"
## [44] "Content is king but marketing is queen and runs the household \nGary Vaynerchuk"
## [45] "RTGary Vaynerchuk is the Kim Kardashian of marketing strategy This channel is a pa reality TVPa tutorial which fo"
## [46] "The best marketing strategy ever CAREGary Vaynerchuk entrepreneur speaker amp marketing expe\n\nmarketing businesstips digital"
## [47] "The best marketing strategy ever CARE Gary Vaynerchuk quote"
## [48] "Content is king but marketing is queen and runs the householdGary Vaynerchuk"
## [49] "RTDistribution is the Game Gary Vaynerchuk\n\nsocialmediamarketing garyvee garyvaynerchuk entrepreneur ent"
Prepare for Bigram
# Install the following packages
library(dplyr)
library(tidyverse) # data manipulation & plotting
library(stringr) # text cleaning and regular expressions
library(tidytext) # provides additional text mining functions
titles <- c("v")
books <- list(tweets)
series <- tibble()
for(i in seq_along(titles)) {
clean <- tibble(chapter = seq_along(books[[i]]),
text = books[[i]]) %>%
# Number of gram
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
mutate(book = titles[i]) %>%
select(book, everything())
series <- rbind(series, clean)
}
Bigram of the Data
temp1 = subset(series, book == 'v') %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word) %>%
count(word1, word2, sort = TRUE)
temp1[1:20,]
## # A tibble: 20 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 gary vaynerchuk 606
## 2 vaynerchuk quote 274
## 3 youre writing 119
## 4 daygary vaynerchuk 118
## 5 vaynerchuk original 55
## 6 social media 52
## 7 cheap passion 37
## 8 content creator 36
## 9 role model 36
## 10 pricelessgary vaynerchuk 32
## 11 hustle talent 29
## 12 marketing strategy 26
## 13 decisions gary 22
## 14 quote image 22
## 15 weekendgary vaynerchuk 22
## 16 currencygary vaynerchuk 21
## 17 original film 21
## 18 image image 20
## 19 vaynerchukmotivational talk 20
## 20 care gary 19
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
library("plyr")
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:purrr':
##
## compact
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library("stringi")
money.words = scan('moneyWords.txt', what='character', comment.char=';')
score.topic = function(sentences, dict, .progress='none')
{
# we got a vector of sentences. plyr will handle a list
# or a vector as an "l" for us
# we want a simple array of scores back, so we use
# "l" + "a" + "ply" = "laply":
scores = laply(sentences, function(sentence, dict) {
# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
# sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
# compare our words to the dictionaries of positive & negative terms
topic.matches = match(words, dict)
# match() returns the position of the matched term or NA
# we just want a TRUE/FALSE:
topic.matches = !is.na(topic.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(topic.matches)
return(score)
}, dict, .progress=.progress )
topicscores.df = data.frame(score=scores, text=sentences)
return(topicscores.df)
}
topic.scores= score.topic(tweets, money.words, .progress='none')
# topic.scores= score.topic(Etweets, fear.words, .progress='none')
topic.mentioned = subset(topic.scores, score !=0)
N= nrow(topic.scores)
Nmentioned = nrow(topic.mentioned)
dftemp=data.frame(topic=c("Mentioned", "Not Mentioned"),
number=c(Nmentioned,N-Nmentioned))
p <- plot_ly(data=dftemp, labels = ~topic, values = ~number, type = 'pie') %>%
layout(title = 'Pie Chart of Tweets Talking about Money',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
library(tidytext)
titles <- c("v")
books <- list(tweets)
series <- tibble()
# create a series of book with text lines
for(i in seq_along(titles)) {
clean <- tibble(chapter = seq_along(books[[i]]),
text = books[[i]]) %>%
# unnest_tokens(word, text) %>%
mutate(book = titles[i]) %>%
select(book, everything())
series <- rbind(series, clean)
}
# find tweets with "fear"
# other emotion to find
##########################
# anger
# anticipation
# disgust
# fear
# joy
# sadness
# surprise
# trust
##########################
senti <- series %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("nrc")) %>%
filter(sentiment=="fear") %>% # replace "fear" with other emotion words
group_by(chapter)
## Joining, by = "word"
sentitext = series[senti$chapter,]
sentitext$sentiment = senti$sentiment
sentitext
## # A tibble: 165 x 4
## book chapter text sentiment
## <chr> <int> <chr> <chr>
## 1 v 3 "White lines in the sky \nDEDICATED TO OGGY\nP… fear
## 2 v 6 "RTSuccessTRAIN\nCrushing It How Great Entrepr… fear
## 3 v 7 "RTSuccessTRAIN\nCrushing It How Great Entrepr… fear
## 4 v 31 "RTSuccessTRAIN\nCrushing It How Great Entrepr… fear
## 5 v 32 "SuccessTRAIN\nCrushing It How Great Entrepren… fear
## 6 v 35 "Hes RightIt boils down to this simple fact We… fear
## 7 v 35 "Hes RightIt boils down to this simple fact We… fear
## 8 v 35 "Hes RightIt boils down to this simple fact We… fear
## 9 v 45 I blocked notorious charlatan Gary Vaynerchuk … fear
## 10 v 45 I blocked notorious charlatan Gary Vaynerchuk … fear
## # ... with 155 more rows
Create word cloud of tweets showing fear
corpus = Corpus(VectorSource(sentitext$text))
# create term-document matrix
tdm = TermDocumentMatrix(
corpus,
control = list(
wordLengths=c(3,20),
removePunctuation = TRUE,
stopwords = c("the", "a", stopwords("english")),
removeNumbers = TRUE,
# tolower may cause trouble on Window because UTF-8 encoding, changed to FALSE
tolower = FALSE) )
# convert as matrix. It may consume near 1g of your RAM
tdm = as.matrix(tdm)
# get word counts in decreasing order
word_freqs = sort(rowSums(tdm), decreasing=TRUE)
#check top 50 most mentioned words
head(word_freqs, 50)
## Vaynerchuk Gary quote How think
## 83 80 22 19 18
## social success people failure media
## 18 17 17 17 17
## Its afraid sta hate Afraid
## 16 16 16 16 15
## Hes RightIt boils crazy fact
## 15 15 15 15 15
## simple win want like You
## 15 15 15 14 14
## speaker talks time jakpost Crushing
## 14 14 12 12 11
## Your shit RTPublic RTSuccessTRAIN There
## 10 10 10 9 9
## bad The businessunless mediocre never
## 9 9 8 8 8
## one WATCH Watch Dont God
## 8 8 8 7 7
## content act Public Digital Marketing
## 7 7 7 7 7
#remove the top words which don’t generate insights such as "the", "a", "and", etc.
word_freqs = word_freqs[-(1:5)] #Here “1:5” is 1st-5th words in the list we want to remove
# create a data frame with words and their frequencies
dm = data.frame(word=names(word_freqs), freq=word_freqs)
#Plot corpus in a clored graph; need RColorBrewer package
wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))
#check top 50 most mentioned words
head(word_freqs, 50)
## social success people
## 18 17 17
## failure media Its
## 17 17 16
## afraid sta hate
## 16 16 16
## Afraid Hes RightIt
## 15 15 15
## boils crazy fact
## 15 15 15
## simple win want
## 15 15 15
## like You speaker
## 14 14 14
## talks time jakpost
## 14 12 12
## Crushing Your shit
## 11 10 10
## RTPublic RTSuccessTRAIN There
## 10 9 9
## bad The businessunless
## 9 9 8
## mediocre never one
## 8 8 8
## WATCH Watch Dont
## 8 8 7
## God content act
## 7 7 7
## Public Digital Marketing
## 7 7 7
## Stream VaynerchukInterview Build
## 7 7 6
## NONE Theres
## 6 6
# I see some words I don't know or understand, so I retrieve the tweets that have the words
# I retrieve all the tweets that have "nigeria" in it
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.