This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
Install the necessary packages. Comment these lines out again once installation has finished.
# install.packages('tm')
# install.packages('RColorBrewer')
# install.packages('wordcloud')
# install.packages('tidytext')
# install.packages('dplyr')
# install.packages("readr")
# install.packages("plyr")
# install.packages("stringr")
# install.packages("stringi")
# install.packages('plotly')
Include the packages.
library('tm')
## Loading required package: NLP
library('RColorBrewer')
library('wordcloud')
library('readr')
library('tidytext')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("plyr")
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library("stringr")
library("stringi")
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Process data
# Load the two saved tweet collections and pull out their `text` element.
entrepreneurshipData <- readRDS(file = "entrepreneurship.RDS")
BreneData <- readRDS(file = "BreneBrown.RDS")
Etweets <- entrepreneurshipData$text
Btweets <- BreneData$text
# Read the word lists used for scoring. Each file is read as character
# tokens; anything following a ';' on a line is skipped as a comment.
money.words <- scan(file = "moneyWords.txt", what = "character", comment.char = ";")
fear.words <- scan(file = "fearWords.txt", what = "character", comment.char = ";")
pos.words <- scan(file = "positive-words.txt", what = "character", comment.char = ";")
neg.words <- scan(file = "negative-words.txt", what = "character", comment.char = ";")
# Clean data
clean.text <- function(x)
{
  # Strip Twitter-specific noise from a character vector of tweets.
  #
  # Args:
  #   x: character vector, one tweet per element.
  #
  # Returns:
  #   A character vector of the same length with links, retweet markers,
  #   @mentions, punctuation and digits removed, runs of spaces/tabs
  #   collapsed to a single space, and leading/trailing spaces trimmed.
  #   Case is left untouched.

  # remove links first, while "://", "." and "/" still hold the URL together
  # as one whitespace-delimited token
  x <- gsub("\\bhttps?\\S*", "", x)
  # remove the retweet marker (rt / Rt / RT) only when it stands alone as a
  # word, so that words such as "support" or "smart" keep their "rt"
  x <- gsub("\\brt\\b", "", x, ignore.case = TRUE)
  # remove @mentions
  x <- gsub("@\\w+", "", x)
  # remove punctuation
  x <- gsub("[[:punct:]]", "", x)
  # remove numbers
  x <- gsub("[[:digit:]]", "", x)
  # collapse runs of spaces/tabs into ONE space; replacing them with nothing
  # would glue the neighbouring words together
  x <- gsub("[ \t]+", " ", x)
  # remove blank spaces at the beginning and at the end
  x <- gsub("^ +| +$", "", x)
  # tolower
  # x = tolower(x)
  return(x)
}
# run both tweet collections through clean.text(), overwriting the raw text
Etweets <- clean.text(x = Etweets)
Btweets <- clean.text(x = Btweets)
score.topic <- function(sentences, dict, .progress='none')
{
  # Score each sentence by the number of its words found in a dictionary.
  #
  # Args:
  #   sentences: character vector of texts, one element per tweet.
  #   dict:      character vector of dictionary terms. Words are lower-cased
  #              before matching, so terms should be lower case as well.
  #   .progress: name of a plyr progress bar. With 'none' (the default) the
  #              scoring runs in base R; any other value is forwarded to
  #              plyr::laply() so the requested progress bar is shown.
  #
  # Returns:
  #   A data frame with one row per sentence: `score` (count of dictionary
  #   hits) and `text` (the sentence exactly as it was passed in).

  # count the dictionary hits in a single sentence
  count.hits <- function(sentence, dict) {
    # drop punctuation and digits
    sentence <- gsub('[[:punct:]]', '', sentence)
    # control characters (newlines, tabs) become spaces: deleting them would
    # glue the words on either side together and hide both from the match
    sentence <- gsub('[[:cntrl:]]', ' ', sentence)
    sentence <- gsub('\\d+', '', sentence)
    # and convert to lower case:
    sentence <- tolower(sentence)
    # split into words on runs of whitespace
    words <- unlist(strsplit(sentence, '\\s+'))
    # %in% gives TRUE/FALSE per word; sum() treats them as 1/0
    sum(words %in% dict)
  }

  if (identical(.progress, 'none')) {
    # no progress bar requested: plain base R, no plyr needed
    scores <- vapply(sentences, count.hits, integer(1), dict, USE.NAMES = FALSE)
  } else {
    scores <- plyr::laply(sentences, count.hits, dict, .progress = .progress)
  }
  # keep the text as character; R < 4.0 would otherwise turn it into a factor
  topicscores.df <- data.frame(score = scores, text = sentences,
                               stringsAsFactors = FALSE)
  return(topicscores.df)
}
# Score the entrepreneurship tweets against the fear dictionary.
# (Alternative run, kept for reference:)
#topic.scores= score.topic(Btweets, money.words, .progress='none')
topic.scores <- score.topic(sentences = Etweets, dict = fear.words, .progress = "none")
# tweets with at least one dictionary hit
topic.mentioned <- topic.scores[which(topic.scores$score != 0), ]
N <- nrow(topic.scores)
Nmentioned <- nrow(topic.mentioned)
# two-row summary: tweets that mention the topic vs. those that do not
dftemp <- data.frame(
  topic = c("Mentioned", "Not Mentioned"),
  number = c(Nmentioned, N - Nmentioned)
)
# pie chart of the two counts, with grid, zero line and tick labels hidden
p <- plot_ly(data = dftemp, labels = ~topic, values = ~number, type = "pie")
p <- layout(
  p,
  title = "Pie Chart of Tweets Mentioning fear",
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
)
p
score.sentiment <- function(sentences, pos.words, neg.words, .progress='none')
{
  # Score each sentence as (positive-word hits) minus (negative-word hits).
  #
  # Args:
  #   sentences: character vector of texts, one element per tweet.
  #   pos.words: character vector of positive terms.
  #   neg.words: character vector of negative terms. Words are lower-cased
  #              before matching, so both lists should be lower case.
  #   .progress: name of a plyr progress bar. With 'none' (the default) the
  #              scoring runs in base R; any other value is forwarded to
  #              plyr::laply() so the requested progress bar is shown.
  #
  # Returns:
  #   A data frame with one row per sentence: `score` (positive hits minus
  #   negative hits) and `text` (the sentence exactly as it was passed in).

  # net sentiment of a single sentence
  net.score <- function(sentence, pos.words, neg.words) {
    # drop punctuation and digits
    sentence <- gsub('[[:punct:]]', '', sentence)
    # control characters (newlines, tabs) become spaces: deleting them would
    # glue the words on either side together and hide both from the match
    sentence <- gsub('[[:cntrl:]]', ' ', sentence)
    sentence <- gsub('\\d+', '', sentence)
    # and convert to lower case:
    sentence <- tolower(sentence)
    # split into words on runs of whitespace
    words <- unlist(strsplit(sentence, '\\s+'))
    # %in% gives TRUE/FALSE per word; sum() treats them as 1/0
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }

  if (identical(.progress, 'none')) {
    # no progress bar requested: plain base R, no plyr needed
    scores <- vapply(sentences, net.score, integer(1), pos.words, neg.words,
                     USE.NAMES = FALSE)
  } else {
    scores <- plyr::laply(sentences, net.score, pos.words, neg.words,
                          .progress = .progress)
  }
  # keep the text as character; R < 4.0 would otherwise turn it into a factor
  scores.df <- data.frame(score = scores, text = sentences,
                          stringsAsFactors = FALSE)
  return(scores.df)
}
# net sentiment (positive hits minus negative hits) of every cleaned
# entrepreneurship tweet
sentiment.scores <- score.sentiment(
  sentences = Etweets,
  pos.words = pos.words,
  neg.words = neg.words,
  .progress = "none"
)
score <- sentiment.scores[["score"]]
# histogram of the per-tweet scores
p <- plot_ly(x = ~score, type = "histogram")
p
# Word cloud of the tweets with a negative sentiment score.
# library() stops with an error when a package is missing; require() would
# only return FALSE and let the chunk fail later with a less helpful message.
library(tm)
library(wordcloud)
library(RColorBrewer)
# as.character() guards against `text` being a factor column, which would
# otherwise drag every tweet along as a factor level
negativeTweets <- as.character(subset(sentiment.scores, score < 0)$text)
corpus <- Corpus(VectorSource(negativeTweets))
# corpus = Corpus(VectorSource(cmail))
# create term-document matrix
# NOTE(review): tolower = FALSE keeps case variants as separate terms (the
# recorded output lists both "entrepreneurship" and "Entrepreneurship", and
# capitalised words such as "The" and "This") - confirm this is intended.
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE,
    tolower = FALSE
  )
)
# convert as matrix
tdm <- as.matrix(tdm)
# get word counts in decreasing order
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)
#word_freqs = word_freqs[-(1:12)]
# create a data frame with words and their frequencies
dm <- data.frame(word = names(word_freqs), freq = word_freqs,
                 stringsAsFactors = FALSE)
# Plot the 100 most frequent terms as a colored word cloud; the palette
# comes from the RColorBrewer package
wordcloud(
  words = head(dm$word, 100),
  freq = head(dm$freq, 100),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)
# check top 50 most mentioned words
head(word_freqs, 50)
## help entrepreneurship business entrepreneurs
## 338 315 267 212
## Entrepreneurship fellow youth
## 153 132 128 121
## Marketing can RTBusinesses The
## 115 110 104 100
## atwe passion will people
## 80 76 75 67
## This amp execute RTWhat
## 63 58 57 56
## gather entrepreneur pro thinkshould
## 56 56 55 55
## mentorship two otherwe Delhi
## 54 53 52 52
## lazy DelhiGovtSchool Education answer
## 49 47 47 47
## star teachers Ministeralong RTToday
## 47 46 45 45
## leadership question students create
## 44 42 41 41
## Your just real story
## 40 39 39 39
## RTThe customer grit serve
## 38 38 38 38
## How problem
## 38 38
# Show the negatively scored tweets that contain a given keyword, here
# "lazy". Swap the pattern for another keyword to inspect other tweets.
index <- grep(pattern = "lazy", x = negativeTweets)
negativeTweets[index]
## [1] Its definitely not for the lazy and entrepreneurship or corporate business
## [2] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [3] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [4] RTBetter to be lazy amp know youre lazy\n\nThan think youre hustling when youre really just addicted to FeelGood Entre
## [5] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [6] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [7] RTBetter to be lazy amp know youre lazy\n\nThan think youre hustling when youre really just addicted to FeelGood Entre
## [8] RTBetter to be lazy amp know youre lazy\n\nThan think youre hustling when youre really just addicted to FeelGood Entre
## [9] RTBetter to be lazy amp know youre lazy\n\nThan think youre hustling when youre really just addicted to FeelGood Entre
## [10] RTBetter to be lazy amp know youre lazy\n\nThan think youre hustling when youre really just addicted to FeelGood Entre
## [11] RTBetter to be lazy amp know youre lazy\n\nThan think youre hustling when youre really just addicted to FeelGood Entre
## [12] Better to be lazy amp know youre lazy\n\nThan think youre hustling when youre really just addicted to FeelGood Entrepreneurship content
## [13] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [14] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [15] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [16] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [17] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [18] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [19] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [20] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [21] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [22] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [23] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [24] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [25] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [26] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [27] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [28] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [29] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [30] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [31] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [32] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [33] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [34] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [35] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [36] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [37] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [38] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [39] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [40] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [41] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told
## [42] Not all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being tol
## 6242 Levels: we aim to suppo and promote SMEs every step of the way\n\nBB Entrepreneurship SmallBiz
...
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.