This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
Install the necessary packages, then comment out the install lines once installation succeeds so they don't run again.
#install.packages('tm')
#install.packages('RColorBrewer')
#install.packages('wordcloud')
#install.packages('dplyr')
#install.packages('tidyverse')
#install.packages('stringr')
#install.packages('tidytext')
Load the packages.
library('tm')
library('RColorBrewer')
library('wordcloud')
Process data
entrepreneurshipData <- readRDS("entrepreneurship.RDS")
BreneData <- readRDS("BreneBrown.RDS")
Etweets <- entrepreneurshipData$text
Btweets <- BreneData$text
# Alternative ways to strip non-alphanumeric characters:
# Note that the definition of what constitutes a letter, a number, or a punctuation mark varies slightly depending on your locale, so you may need to experiment a little to get exactly what you want.
# str_replace_all(tweets, "[^[:alnum:]]", " ")
# iconv(tweets, from = 'UTF-8', to = 'ASCII//TRANSLIT')
# Encoding(tweets) <- "UTF-8"
# Function to clean tweets
clean.text = function(x)
{
  # remove the retweet marker "rt" (word boundary so words like "start" keep their letters)
  x = gsub("\\brt\\b", "", x, ignore.case = TRUE)
  # remove @mentions
  x = gsub("@\\w+", "", x)
  # remove punctuation
  x = gsub("[[:punct:]]", "", x)
  # remove numbers
  x = gsub("[[:digit:]]", "", x)
  # remove links (punctuation is already gone, so URLs now look like "httpstco...")
  x = gsub("http\\w+", "", x)
  # collapse runs of spaces and tabs into a single space (deleting them outright fuses adjacent words)
  x = gsub("[ \t]{2,}", " ", x)
  # remove blank spaces at the beginning
  x = gsub("^ ", "", x)
  # remove blank spaces at the end
  x = gsub(" $", "", x)
  # lowercasing is handled later by unnest_tokens()
  # x = tolower(x)
  return(x)
}
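As a quick sanity check, the cleaner can be run on a made-up tweet (the string below is purely illustrative, not from the dataset):
# hypothetical input for illustration; should come back as roughly "loving this startup journey"
clean.text("RT @someone: loving this #startup journey!! https://t.co/abc 2023")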
# clean the tweets, then collapse each set into one long string
Etweets = clean.text(Etweets)
Btweets = clean.text(Btweets)
# paste() already returns character, so no as.character() call is needed afterwards
Etweets = paste(Etweets, collapse=" ")
Btweets = paste(Btweets, collapse=" ")
Create a term-document matrix of the tweets
all = c(Etweets, Btweets)
# create corpus
corpus = Corpus(VectorSource(all))
# create term-document matrix
tdm = TermDocumentMatrix(corpus)
# convert to a plain matrix
tdm = as.matrix(tdm)
# add column names
colnames(tdm) = c("Entrepreneurship", "BreneBrown")
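Before plotting, it can help to glance at the most frequent terms in each column of the matrix; a minimal sketch:
# top 10 terms per corpus, taken from the matrix built above
head(sort(tdm[, "Entrepreneurship"], decreasing = TRUE), 10)
head(sort(tdm[, "BreneBrown"], decreasing = TRUE), 10)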
Create a comparison cloud
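Note that wordcloud places words semi-randomly, so the layout shifts between runs; seeding the RNG first keeps it reproducible.
set.seed(42)  # any fixed seed works; this is an optional, illustrative choice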
comparison.cloud(tdm, random.order=FALSE,
                 colors = c("#00B2FF", "red"),
                 title.size=1,
                 max.words=100)
Create a commonality cloud
commonality.cloud(tdm, random.order=FALSE,
                  colors = brewer.pal(8, "Dark2"),
                  max.words=100)
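To keep a copy of a cloud, the plotting call can be wrapped in a graphics device; a minimal sketch with an illustrative file name:
# open a PNG device, redraw the cloud into it, then close the device
png("commonality_cloud.png", width = 800, height = 800)
commonality.cloud(tdm, random.order=FALSE,
                  colors = brewer.pal(8, "Dark2"),
                  max.words=100)
dev.off()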
Prepare for bigrams
# Load the following packages (installed at the top of the notebook)
library(dplyr)
library(tidyverse) # data manipulation & plotting
library(stringr) # text cleaning and regular expressions
library(tidytext) # provides additional text mining functions
titles <- c("Ent", "Brene")
books <- list(Etweets, Btweets)
series <- tibble()
for(i in seq_along(titles)) {
  clean <- tibble(chapter = seq_along(books[[i]]),
                  text = books[[i]]) %>%
    # split the text into bigrams; n = 2 sets the gram size
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    mutate(book = titles[i]) %>%
    select(book, everything())
  series <- rbind(series, clean)
}
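A quick look at the combined tibble confirms one row per bigram, tagged by source:
head(series)        # first few bigrams
count(series, book) # bigram totals per source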
Bigrams of the entrepreneurship data
temp1 = subset(series, book == 'Ent') %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)
temp1[1:20,]
## # A tibble: 20 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 government appointment 295
## 2 fast grow 294
## 3 rtin nigeria 294
## 4 vijay mallya 185
## 5 rtdear vijay 184
## 6 skill development 154
## 7 development amp 145
## 8 natural gas 145
## 9 amp natural 144
## 10 petroleum amp 144
## 11 amp entrepren 140
## 12 rtsmtand shrihonble 140
## 13 shrihonble minister 140
## 14 social entrepreneurship 136
## 15 fellow entrepreneurs 133
## 16 amp entrepreneurship 132
## 17 starup docsie 120
## 18 business marketing 114
## 19 docs technicaldocumentation 114
## 20 hair super 105
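Since tidyverse is already loaded, the top bigrams can also be drawn as a bar chart; a minimal sketch (the title string is illustrative):
temp1 %>%
  head(15) %>%
  unite(bigram, word1, word2, sep = " ") %>%  # rejoin the word pair for labeling
  ggplot(aes(reorder(bigram, n), n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "count", title = "Top bigrams: Entrepreneurship")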
Bigrams of the Brene Brown data
temp2 = subset(series, book == 'Brene') %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)
temp2[1:20,]
## # A tibble: 20 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 brave awkward 537
## 2 stay brave 535
## 3 shit ton 532
## 4 practice stay 529
## 5 week ps 523
## 6 doesnt change 55
## 7 ass kicked 37
## 8 brene brown 36
## 9 unknowingly lead 36
## 10 mentalhealth bellletstalkā 35
## 11 wor rtyes 35
## 12 daring greatly 34
## 13 rtwithout selfawareness 34
## 14 occasion im 30
## 15 choosing courage 29
## 16 rt brenebrown 29
## 17 fun fast 28
## 18 amazing animator 25
## 19 atthe rsa 25
## 20 illustrator katy 25
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.