This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Install necessary packages. Comment after installation

# install.packages('tm')
# install.packages('RColorBrewer')
# install.packages('wordcloud')
# installed.packages('tidytext')
# installed.packages('dplyr')
# install.packages("readr")
# install.packages("plyr")
# install.packages("stringr")
# install.packages("stringi")
# install.packages('plotly')

Include the packages.

library('tm')

## Loading required package: NLP

library('RColorBrewer')
library('wordcloud')
library('readr')
library('tidytext')
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library("plyr")

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

library("stringr")
library("stringi")
library(plotly)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

Process data

entrepreneurshipData <- readRDS("entrepreneurship.RDS")
BreneData <- readRDS("BreneBrown.RDS")
Etweets <- entrepreneurshipData$text
Btweets <- BreneData$text

# Read dictionaries
money.words = scan('moneyWords.txt', what='character', comment.char=';')
fear.words = scan('fearWords.txt', what='character', comment.char=';')
pos.words = scan('positive-words.txt', what='character', comment.char=';')
neg.words = scan('negative-words.txt', what='character', comment.char=';')

# Clean data
clean.text = function(x)
{
  # remove rt
  x = gsub("rt", "", x)
  # remove Rt
  x = gsub("Rt", "", x)
  # remove at
  x = gsub("@\\w+", "", x)
  # remove punctuation
  x = gsub("[[:punct:]]", "", x)
  # remove numbers
  x = gsub("[[:digit:]]", "", x)
  # remove links http
  x = gsub("http\\w+", "", x)
  # remove https
  x = gsub("https", "", x)
  # remove tabs
  x = gsub("[ |\t]{2,}", "", x)
  # remove blank spaces at the beginning
  x = gsub("^ ", "", x)
  # remove blank spaces at the end
  x = gsub(" $", "", x)
  # tolower
#  x = tolower(x)
  return(x)
}

# clean tweets
Etweets = clean.text(Etweets)
Btweets = clean.text(Btweets)

Topic Analysis

score.topic = function(sentences, dict, .progress='none')
{
  
  # we got a vector of sentences. plyr will handle a list
  # or a vector as an "l" for us
  # we want a simple array of scores back, so we use
  # "l" + "a" + "ply" = "laply":
  scores = laply(sentences, function(sentence, dict) {
    
    # clean up sentences with R's regex-driven global substitute, gsub():
    sentence = gsub('[[:punct:]]', '', sentence)
    sentence = gsub('[[:cntrl:]]', '', sentence)
    sentence = gsub('\\d+', '', sentence)
    # and convert to lower case:
    sentence = tolower(sentence)
    
    # split into words. str_split is in the stringr package
    word.list = str_split(sentence, '\\s+')
    # sometimes a list() is one level of hierarchy too much
    words = unlist(word.list)
    
    # compare our words to the dictionaries of positive & negative terms
    topic.matches = match(words, dict)
    
    # match() returns the position of the matched term or NA
    # we just want a TRUE/FALSE:
    topic.matches = !is.na(topic.matches)
    
    # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
    score = sum(topic.matches)
    
    return(score)
  }, dict, .progress=.progress )
  
  topicscores.df = data.frame(score=scores, text=sentences)
  return(topicscores.df)
}

#topic.scores= score.topic(Btweets, money.words, .progress='none')
topic.scores= score.topic(Etweets, fear.words, .progress='none')

topic.mentioned = subset(topic.scores, score !=0)

N= nrow(topic.scores)
Nmentioned = nrow(topic.mentioned)

dftemp=data.frame(topic=c("Mentioned", "Not Mentioned"), 
                  number=c(Nmentioned,N-Nmentioned))

p <- plot_ly(data=dftemp, labels = ~topic, values = ~number, type = 'pie') %>%
  layout(title = 'Pie Chart of Tweets Mentioning fear',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p

Sentiment Analysis

score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
  
  # we got a vector of sentences. plyr will handle a list
  # or a vector as an "l" for us
  # we want a simple array of scores back, so we use
  # "l" + "a" + "ply" = "laply":
  scores = laply(sentences, function(sentence, pos.words, neg.words) {
    
    # clean up sentences with R's regex-driven global substitute, gsub():
    sentence = gsub('[[:punct:]]', '', sentence)
    sentence = gsub('[[:cntrl:]]', '', sentence)
    sentence = gsub('\\d+', '', sentence)
    # and convert to lower case:
    sentence = tolower(sentence)
    
    # split into words. str_split is in the stringr package
    word.list = str_split(sentence, '\\s+')
    # sometimes a list() is one level of hierarchy too much
    words = unlist(word.list)
    
    # compare our words to the dictionaries of positive & negative terms
    pos.matches = match(words, pos.words)
    neg.matches = match(words, neg.words)
    
    # match() returns the position of the matched term or NA
    # we just want a TRUE/FALSE:
    pos.matches = !is.na(pos.matches)
    neg.matches = !is.na(neg.matches)
    
    # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
    score = sum(pos.matches) - sum(neg.matches)
    
    return(score)
  }, pos.words, neg.words, .progress=.progress )
  
  scores.df = data.frame(score=scores, text=sentences)
  return(scores.df)
}

sentiment.scores= score.sentiment(Etweets, pos.words, neg.words, .progress='none')

score <- sentiment.scores$score
p <- plot_ly(x = ~score, type = "histogram")
p

Word cloud of negative tweets

require(tm)
require(wordcloud)
require(RColorBrewer)

negativeTweets = subset(sentiment.scores, score < 0)$text

corpus = Corpus(VectorSource(negativeTweets))
# corpus = Corpus(VectorSource(cmail))
# create term-document matrix
tdm = TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths=c(3,20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE, tolower = FALSE) )

# convert as matrix
tdm = as.matrix(tdm)

# get word counts in decreasing order
word_freqs = sort(rowSums(tdm), decreasing=TRUE) 
#word_freqs = word_freqs[-(1:12)]
# create a data frame with words and their frequencies
dm = data.frame(word=names(word_freqs), freq=word_freqs)

#Plot corpus in a clored graph; need RColorBrewer package

wordcloud(head(dm$word, 100), head(dm$freq, 100), random.order=FALSE, colors=brewer.pal(8, "Dark2"))

#check top 50 most mentioned words
head(word_freqs, 50)

##             help entrepreneurship         business    entrepreneurs 
##              338              315              267              212 
## Entrepreneurship           fellow                            youth 
##              153              132              128              121 
##        Marketing              can     RTBusinesses              The 
##              115              110              104              100 
##             atwe          passion             will           people 
##               80               76               75               67 
##             This              amp          execute           RTWhat 
##               63               58               57               56 
##           gather     entrepreneur             pro      thinkshould 
##               56               56               55               55 
##       mentorship              two          otherwe            Delhi 
##               54               53               52               52 
##             lazy  DelhiGovtSchool        Education           answer 
##               49               47               47               47 
##             star         teachers    Ministeralong          RTToday 
##               47               46               45               45 
##       leadership        question         students           create 
##               44               42               41               41 
##             Your             just             real            story 
##               40               39               39               39 
##            RTThe         customer             grit            serve 
##               38               38               38               38 
##              How          problem 
##               38               38

#Grouping of some examples of tweets with negatively coded words, like 'lazy'. Keyword may be replaced for different insights as needed
index = grep("lazy", negativeTweets)
negativeTweets[index]

##  [1] Its definitely not for the lazy and entrepreneurship or corporate business                                                           
##  [2] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
##  [3] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
##  [4] RTBetter to be lazy amp know youre lazy\n\nThan think youre hustling when youre really just addicted to FeelGood Entre                
##  [5] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
##  [6] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
##  [7] RTBetter to be lazy amp know youre lazy\n\nThan think youre hustling when youre really just addicted to FeelGood Entre                
##  [8] RTBetter to be lazy amp know youre lazy\n\nThan think youre hustling when youre really just addicted to FeelGood Entre                
##  [9] RTBetter to be lazy amp know youre lazy\n\nThan think youre hustling when youre really just addicted to FeelGood Entre                
## [10] RTBetter to be lazy amp know youre lazy\n\nThan think youre hustling when youre really just addicted to FeelGood Entre                
## [11] RTBetter to be lazy amp know youre lazy\n\nThan think youre hustling when youre really just addicted to FeelGood Entre                
## [12] Better to be lazy amp know youre lazy\n\nThan think youre hustling when youre really just addicted to FeelGood Entrepreneurship content
## [13] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [14] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [15] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [16] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [17] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [18] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [19] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [20] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [21] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [22] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [23] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [24] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [25] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [26] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [27] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [28] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [29] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [30] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [31] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [32] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [33] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [34] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [35] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [36] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [37] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [38] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [39] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [40] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [41] RTNot all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being told                 
## [42] Not all young people are lazy and feel entitled This is the real story of entrepreneurship that is not being tol                    
## 6242 Levels:  we aim to suppo and promote SMEs every step of the way\n\nBB Entrepreneurship SmallBiz ...

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

Week 6: Topic and Sentiment Analysis R Notebook

Richard Shang

2/26/2019

Topic Analysis

Sentiment Analysis

Word cloud of negative tweets