This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
Install the necessary packages, then comment out the install lines once installation succeeds so they don't run again.
#install.packages('tm')
#install.packages('RColorBrewer')
#install.packages('wordcloud')
#install.packages('dplyr')
#install.packages('tidyverse')
#install.packages('stringr')
#install.packages('tidytext')
Load the packages.
library('tm')
library('RColorBrewer')
library('wordcloud')
Process data
entrepreneurshipData <- readRDS("entrepreneurship.RDS")
BreneData <- readRDS("BreneBrown.RDS")
Etweets <- entrepreneurshipData$text
Btweets <- BreneData$text
# Alternative ways to strip non-alphanumeric characters:
# Note that the definition of what constitutes a letter, a number, or a punctuation mark varies slightly depending on your locale, so you may need to experiment a little to get exactly what you want.
# str_replace_all(tweets, "[^[:alnum:]]", " ")
# iconv(tweets, from = 'UTF-8', to = 'ASCII//TRANSLIT')
# Encoding(tweets) <- "UTF-8"
# Function to clean tweets
clean.text = function(x)
{
  # remove the retweet marker "rt" (word boundary so words like "start" keep their letters)
  x = gsub("\\brt\\b", "", x, ignore.case = TRUE)
  # remove @mentions
  x = gsub("@\\w+", "", x)
  # remove punctuation
  x = gsub("[[:punct:]]", "", x)
  # remove numbers
  x = gsub("[[:digit:]]", "", x)
  # remove links (punctuation is already gone, so URLs now look like "httpstco...")
  x = gsub("http\\w+", "", x)
  # collapse runs of spaces and tabs into a single space (deleting them outright fuses adjacent words)
  x = gsub("[ \t]{2,}", " ", x)
  # remove blank spaces at the beginning
  x = gsub("^ ", "", x)
  # remove blank spaces at the end
  x = gsub(" $", "", x)
  # lowercasing is handled later by unnest_tokens()
  # x = tolower(x)
  return(x)
}
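As a quick sanity check, the cleaner can be run on a made-up tweet (the string below is purely illustrative, not from the dataset):
# hypothetical input for illustration; should come back as roughly "loving this startup journey"
clean.text("RT @someone: loving this #startup journey!! https://t.co/abc 2023")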
# clean the tweets, then collapse each set into one long string
Etweets = clean.text(Etweets)
Btweets = clean.text(Btweets)
# paste() already returns character, so no as.character() call is needed afterwards
Etweets = paste(Etweets, collapse=" ")
Btweets = paste(Btweets, collapse=" ")
Create a term-document matrix of the tweets
all = c(Etweets, Btweets)
# create corpus
corpus = Corpus(VectorSource(all))
# create term-document matrix
tdm = TermDocumentMatrix(corpus)
# convert to a plain matrix
tdm = as.matrix(tdm)
# add column names
colnames(tdm) = c("Entrepreneurship", "BreneBrown")
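Before plotting, it can help to glance at the most frequent terms in each column of the matrix; a minimal sketch:
# top 10 terms per corpus, taken from the matrix built above
head(sort(tdm[, "Entrepreneurship"], decreasing = TRUE), 10)
head(sort(tdm[, "BreneBrown"], decreasing = TRUE), 10)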
Create a comparison cloud
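Note that wordcloud places words semi-randomly, so the layout shifts between runs; seeding the RNG first keeps it reproducible.
set.seed(42)  # any fixed seed works; this is an optional, illustrative choice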
comparison.cloud(tdm, random.order=FALSE,
                 colors = c("#00B2FF", "red"),
                 title.size=1,
                 max.words=100)
Create a commonality cloud
commonality.cloud(tdm, random.order=FALSE,
                  colors = brewer.pal(8, "Dark2"),
                  max.words=100)
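To keep a copy of a cloud, the plotting call can be wrapped in a graphics device; a minimal sketch with an illustrative file name:
# open a PNG device, redraw the cloud into it, then close the device
png("commonality_cloud.png", width = 800, height = 800)
commonality.cloud(tdm, random.order=FALSE,
                  colors = brewer.pal(8, "Dark2"),
                  max.words=100)
dev.off()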
Prepare for bigrams
# Load the following packages (installed at the top of the notebook)
library(dplyr)
library(tidyverse) # data manipulation & plotting
library(stringr) # text cleaning and regular expressions
library(tidytext) # provides additional text mining functions
titles <- c("Ent", "Brene")
books <- list(Etweets, Btweets)
series <- tibble()
for(i in seq_along(titles)) {
  clean <- tibble(chapter = seq_along(books[[i]]),
                  text = books[[i]]) %>%
    # split the text into bigrams; n = 2 sets the gram size
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    mutate(book = titles[i]) %>%
    select(book, everything())
  series <- rbind(series, clean)
}
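A quick look at the combined tibble confirms one row per bigram, tagged by source:
head(series)        # first few bigrams
count(series, book) # bigram totals per source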
Bigrams of the entrepreneurship data
temp1 = subset(series, book == 'Ent') %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)
temp1[1:20,]
## # A tibble: 20 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 government appointment 295
## 2 fast grow 294
## 3 rtin nigeria 294
## 4 vijay mallya 185
## 5 rtdear vijay 184
## 6 skill development 154
## 7 development amp 145
## 8 natural gas 145
## 9 amp natural 144
## 10 petroleum amp 144
## 11 amp entrepren 140
## 12 rtsmtand shrihonble 140
## 13 shrihonble minister 140
## 14 social entrepreneurship 136
## 15 fellow entrepreneurs 133
## 16 amp entrepreneurship 132
## 17 starup docsie 120
## 18 business marketing 114
## 19 docs technicaldocumentation 114
## 20 hair super 105
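Since tidyverse is already loaded, the top bigrams can also be drawn as a bar chart; a minimal sketch (the title string is illustrative):
temp1 %>%
  head(15) %>%
  unite(bigram, word1, word2, sep = " ") %>%  # rejoin the word pair for labeling
  ggplot(aes(reorder(bigram, n), n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "count", title = "Top bigrams: Entrepreneurship")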
Bigrams of the Brene Brown data
temp2 = subset(series, book == 'Brene') %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)
temp2[1:20,]
## # A tibble: 20 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 brave awkward 537
## 2 stay brave 535
## 3 shit ton 532
## 4 practice stay 529
## 5 week ps 523
## 6 doesnt change 55
## 7 ass kicked 37
## 8 brene brown 36
## 9 unknowingly lead 36
## 10 mentalhealth bellletstalkā 35
## 11 wor rtyes 35
## 12 daring greatly 34
## 13 rtwithout selfawareness 34
## 14 occasion im 30
## 15 choosing courage 29
## 16 rt brenebrown 29
## 17 fun fast 28
## 18 amazing animator 25
## 19 atthe rsa 25
## 20 illustrator katy 25
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.