This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Install necessary packages. Comment after installation

#install.packages('tm')
#install.packages('RColorBrewer')
#install.packages('wordcloud')

Include the packages.

library('tm')

## Loading required package: NLP

library('RColorBrewer')
library('wordcloud')

Process data

# entrepreneurshipData <- readRDS("entrepreneurship.RDS")
Data <- readRDS("YCombinator.RDS")
# Etweets <- entrepreneurshipData$text
tweets <- Data$text


# swap out all non-alphanumeric characters
# Note that the definition of what constitutes a letter or a number or a punctuatution mark varies slightly depending upon your locale, so you may need to experiment a little to get exactly what you want.
# str_replace_all(tweets, "[^[:alnum:]]", " ")
# iconv(tweets, from = 'UTF-8', to = 'ASCII//TRANSLIT')
# Encoding(tweets)  <- "UTF-8"

# Function to clean tweets
clean.text = function(x)
{
  # remove rt
  x = gsub("rt", "", x)
  # remove at
  x = gsub("@\\w+", "", x)
  # remove punctuation
  x = gsub("[[:punct:]]", "", x)
  # remove numbers
  x = gsub("[[:digit:]]", "", x)
  # remove links http
  x = gsub("http\\w+", "", x)
  # remove tabs
  x = gsub("[ |\t]{2,}", "", x)
  # remove blank spaces at the beginning
  x = gsub("^ ", "", x)
  # remove blank spaces at the end
  x = gsub(" $", "", x)
  # tolower
#  x = tolower(x)
  return(x)
}

# clean tweets
tweets = clean.text(tweets)

Create word cloud of tweets

corpus = Corpus(VectorSource(tweets))

# create term-document matrix
tdm = TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths=c(3,20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE, 
  # tolower may cause trouble on Window because UTF-8 encoding, changed to FALSE  
    tolower = FALSE) )

# convert as matrix. It may consume near 1g of your RAM
tdm = as.matrix(tdm)

# get word counts in decreasing order
word_freqs = sort(rowSums(tdm), decreasing=TRUE) 

#check top 50 most mentioned words
head(word_freqs, 50)

##   Combinator       Altman          Sam      podcast         bets 
##          837          253          228          208          202 
##      booming       Airbnb    president  transcribed       Equity 
##          201          158          157          144          136 
##        staup     industry HotelTonight        world          Day 
##          127          123          107           90           88 
##         Demo   capitalism          two         love    Berkshire 
##           88           88           86           85           84 
##   energizing       events    Hathaways         RTIf          RTY 
##           84           84           83           83           78 
##          San     stepping        steps    Francisco  accelerator 
##           57           56           56           55           50 
##      Silicon        Staup  Combinators        money       making 
##           50           48           45           44           43 
##          Can    President      changes         amid       series 
##           43           43           41           40           40 
##      Changes          The        model      arguing       Valley 
##           40           39           39           37           36 
##         move    incubator    investors       staups         step 
##           35           34           34           32           32

#remove the top words which don’t generate insights such as "the", "a", "and", etc.
word_freqs = word_freqs[-(1:2)]  #Here “1:5” is 1st-5th words in the list we want to remove 

# create a data frame with words and their frequencies
dm = data.frame(word=names(word_freqs), freq=word_freqs)

#Plot corpus in a clored graph; need RColorBrewer package

wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))

#check top 50 most mentioned words
head(word_freqs, 50)

##          Sam      podcast         bets      booming       Airbnb 
##          228          208          202          201          158 
##    president  transcribed       Equity        staup     industry 
##          157          144          136          127          123 
## HotelTonight        world          Day         Demo   capitalism 
##          107           90           88           88           88 
##          two         love    Berkshire   energizing       events 
##           86           85           84           84           84 
##    Hathaways         RTIf          RTY          San     stepping 
##           83           83           78           57           56 
##        steps    Francisco  accelerator      Silicon        Staup 
##           56           55           50           50           48 
##  Combinators        money       making          Can    President 
##           45           44           43           43           43 
##      changes         amid       series      Changes          The 
##           41           40           40           40           39 
##        model      arguing       Valley         move    incubator 
##           39           37           36           35           34 
##    investors       staups         step        RTSam       OpenAI 
##           34           32           32           31           30

# I see some words I don't know or understand, so I retrieve the tweets that have the words
# I retrieve all the tweets that have "nigeria" in it

I saw ‘booming’ on the work cloud and want to tweets contain it

index = grep("booming", tweets)
tweets[index]

##   [1] "Y Combinator bets on the booming podcast industryvia"                                                                
##   [2] "Y Combinator bets on the booming podcast industry"                                                                   
##   [3] "Y Combinator bets on the booming podcast industry"                                                                   
##   [4] "Y Combinator bets on the booming podcast industry"                                                                   
##   [5] "RTY Combinator bets on the booming podcast industrytech staup"                                                       
##   [6] "Y Combinator bets on the booming podcast industry"                                                                   
##   [7] "RTY Combinator bets on the booming podcast industryby"                                                               
##   [8] "Y Combinator bets on the booming podcast industry\n\nStaup News Tech"                                                
##   [9] "Tech News Y Combinator bets on the booming podcast industry \nBrew helps creators get paid to podcast with a subscr" 
##  [10] "Y Combinator bets on the booming podcast industry"                                                                   
##  [11] "RTY Combinator bets on the booming podcast industry featuring Ws"                                                    
##  [12] "Y Combinator bets on the booming podcast industry\n\nStaup News Tech"                                                
##  [13] "Y Combinator bets on the booming podcast industryvia"                                                                
##  [14] "RTY Combinator bets on the booming podcast industry featuring Ws"                                                    
##  [15] "Y Combinator bets on the booming podcast industryTechCrunch"                                                         
##  [16] "Y Combinator bets on the booming podcast industry"                                                                   
##  [17] "Y Combinator bets on the booming podcast industrytech"                                                               
##  [18] "RTStrato Y Combinator bets on the booming podcast industrystaup entrepreneur"                                        
##  [19] "Strato Y Combinator bets on the booming podcast industrystaup entrepreneur"                                          
##  [20] "Tech Crunch Staup news Y Combinator bets on the booming podcast industry"                                            
##  [21] "Y Combinator bets on the booming podcast industryTechCrunch"                                                         
##  [22] "RTY Combinator bets on the booming podcast industryStaup TechCrunch"                                                 
##  [23] "Y Combinator bets on the booming podcast industry"                                                                   
##  [24] "Y Combinator bets on the booming podcast industry"                                                                   
##  [25] "Y Combinator bets on the booming podcast industry"                                                                   
##  [26] "Y Combinator bets on the booming podcast industry"                                                                   
##  [27] "Y Combinator bets on the booming podcast industry"                                                                   
##  [28] "Y Combinator bets on the booming podcast industry"                                                                   
##  [29] "Y Combinator bets on the booming podcast industry"                                                                   
##  [30] "RTY Combinator bets on the booming podcast industryStaups Tech entrepreneurs"                                        
##  [31] "Y Combinator bets on the booming podcast industryStaups Tech entrepreneurs"                                          
##  [32] "Y Combinator bets on the booming podcast industry"                                                                   
##  [33] "RTY Combinator bets on the booming podcast industryby"                                                               
##  [34] "Y Combinator bets on the booming podcast industryStaups Tech entrepreneurs"                                          
##  [35] "RTY Combinator bets on the booming podcast industry featuring Ws"                                                    
##  [36] "Y Combinator bets on the booming podcast industry"                                                                   
##  [37] "Y Combinator bets on the booming podcast industry\n"                                                                 
##  [38] "Y Combinator bets on the booming podcast industry"                                                                   
##  [39] "Y Combinator bets on the booming podcast industryviastaup"                                                           
##  [40] "Y Combinator bets on the booming podcast industryStaup"                                                              
##  [41] "Y Combinator bets on the booming podcast industry"                                                                   
##  [42] "Y Combinator bets on the booming podcast industry"                                                                   
##  [43] "RTY Combinator bets on the booming podcast industry featuring Ws"                                                    
##  [44] "Y Combinator bets on the booming podcast industry \n\nPodcasts are exploding in popularity and Y Combinator the star"
##  [45] "RTY Combinator bets on the booming podcast industry featuring Ws"                                                    
##  [46] "Y Combinator bets on the booming podcast industrydi"                                                                 
##  [47] "Y Combinator bets on the booming podcast industry"                                                                   
##  [48] "RTY Combinator bets on the booming podcast industry featuring Ws"                                                    
##  [49] "Y Combinator bets on the booming podcast industry staups news"                                                       
##  [50] "Y Combinator bets on the booming podcast industryviaTechnology Love"                                                 
##  [51] "RTY Combinator bets on the booming podcast industry featuring Ws"                                                    
##  [52] "Y Combinator bets on the booming podcast industry\n"                                                                 
##  [53] "Y Combinator bets on the booming podcast industryTech"                                                               
##  [54] "Y Combinator bets on the booming podcast industry"                                                                   
##  [55] "RTY Combinator bets on the booming podcast industry featuring Ws"                                                    
##  [56] "Y Combinator bets on the booming podcast industry  TechCrunch"                                                       
##  [57] "Y Combinator bets on the booming podcast industrytechnology"                                                         
##  [58] "Y Combinator bets on the booming podcast industry"                                                                   
##  [59] "Y Combinator bets on the booming podcast industry"                                                                   
##  [60] "Y Combinator bets on the booming podcast industrytech staup"                                                         
##  [61] "RTY Combinator bets on the booming podcast industry featuring Ws"                                                    
##  [62] "Y Combinator bets on the booming podcast industry  TechCrunch"                                                       
##  [63] "Y Combinator bets on the booming podcast industryVentureCanvas"                                                      
##  [64] "Y Combinator bets on the booming podcast industry staup marketing business creativelysma"                            
##  [65] "Y Combinator bets on the booming podcast industryTechCrunch"                                                         
##  [66] "Y Combinator bets on the booming podcast industry"                                                                   
##  [67] "Y Combinator bets on the booming podcast industrybusiness staup"                                                     
##  [68] "Y Combinator bets on the booming podcast industry"                                                                   
##  [69] "Y Combinator bets on the booming podcast industry"                                                                   
##  [70] "Y Combinator bets on the booming podcast industry"                                                                   
##  [71] "Y Combinator bets on the booming podcast industry"                                                                   
##  [72] "Y Combinator bets on the booming podcast industry"                                                                   
##  [73] "Y Combinator bets on the booming podcast industrytech business money"                                                
##  [74] "Y Combinator bets on the booming podcast industry staup marketing business creativelysma"                            
##  [75] "Y Combinator bets on the booming podcast industryStaup TechCrunch"                                                   
##  [76] "Y Combinator bets on the booming podcast industryOPSItalia"                                                          
##  [77] "Y Combinator bets on the booming podcast industry"                                                                   
##  [78] "Y Combinator bets on the booming podcast industry"                                                                   
##  [79] "staups Y Combinator bets on the booming podcast industry"                                                            
##  [80] "Y Combinator bets on the booming podcast industrystaup shenzhen troublemaker"                                        
##  [81] "Y Combinator bets on the booming podcast industry"                                                                   
##  [82] "Y Combinator bets on the booming podcast industry"                                                                   
##  [83] "Y Combinator bets on the booming podcast industry"                                                                   
##  [84] "Y Combinator bets on the booming podcast industry"                                                                   
##  [85] "Y Combinator bets on the booming podcast industry"                                                                   
##  [86] "Y Combinator bets on the booming podcast industrystaups\n\nPodcasts are exploding in popu"                           
##  [87] "Y Combinator bets on the booming podcast industrywhat do you think of this"                                          
##  [88] "TC Staup newsY Combinator bets on the booming podcast industry"                                                      
##  [89] "Y Combinator bets on the booming podcast industry"                                                                   
##  [90] "sta up Y Combinator bets on the booming podcast industry"                                                            
##  [91] "Y Combinator bets on the booming podcast industry"                                                                   
##  [92] "Y Combinator bets on the booming podcast industrymarketing leaderhship entrepreneur"                                 
##  [93] "Y Combinator bets on the booming podcast industry"                                                                   
##  [94] "Y Combinator bets on the booming podcast industry TechCrunchStaups"                                                  
##  [95] "Y Combinator bets on the booming podcast industry"                                                                   
##  [96] "RTY Combinator bets on the booming podcast industry featuring Ws"                                                    
##  [97] "Combinator bets on the booming podcast industry"                                                                     
##  [98] "my repost see you Y Combinator bets on the booming podcast industry"                                                 
##  [99] "Y Combinator bets on the booming podcast industry staup staupvic"                                                    
## [100] "Y Combinator bets on the booming podcast industry"                                                                   
## [101] "Y Combinator bets on the booming podcast industrystaups"                                                             
## [102] "Y Combinator bets on the booming podcast industry"                                                                   
## [103] "Y Combinator bets on the booming podcast industry"                                                                   
## [104] "Y Combinator bets on the booming podcast industry"                                                                   
## [105] "Y Combinator bets on the booming podcast industry"                                                                   
## [106] "Y Combinator bets on the booming podcast industrystaup investment market"                                            
## [107] "Y Combinator bets on the booming podcast industryStaup"                                                              
## [108] "Y Combinator bets on the booming podcast industrystaups entrepreneurship via Kate Clark"                             
## [109] "Y Combinator bets on the booming podcast industryAngelNews staup"                                                    
## [110] "Y Combinator bets on the booming podcast industry"                                                                   
## [111] "Y Combinator bets on the booming podcast industrystaups"                                                             
## [112] "Y Combinator bets on the booming podcast industry"                                                                   
## [113] "Y Combinator bets on the booming podcast industry"                                                                   
## [114] "Y Combinator bets on the booming podcast industry\n\nPodcasts are exploding in popularity and Y Combinator the sta"  
## [115] "Y Combinator bets on the booming podcast industry"                                                                   
## [116] "RTY Combinator bets on the booming podcast industry featuring Ws"                                                    
## [117] "Y Combinator bets on the booming podcast industry"                                                                   
## [118] "bets on the booming podcast industryTechCrunch"                                                                      
## [119] "Y Combinator bets on the booming podcast industry"                                                                   
## [120] "Y Combinator bets on the booming podcast industry"                                                                   
## [121] "Y Combinator bets on the booming podcast industrystaup"                                                              
## [122] "Y Combinator bets on the booming podcast industry\nMaybe twitter founderwas ahead of his time with Odeo"             
## [123] "Y Combinator bets on the booming podcast industry Kate ClarkTechCrunch"                                              
## [124] "RTY Combinator bets on the booming podcast industry featuring Ws"                                                    
## [125] "Y Combinator bets on the booming podcast industry   by kateclarktweets"                                              
## [126] "Y Combinator bets on the booming podcast industry featuring Ws"                                                      
## [127] "Y Combinator bets on the booming podcast industry"                                                                   
## [128] "Y Combinator bets on the booming podcast industry"                                                                   
## [129] "Y Combinator bets on the booming podcast industry"                                                                   
## [130] "Y Combinator bets on the booming podcast industry"                                                                   
## [131] "Y Combinator bets on the booming podcast industry"                                                                   
## [132] "RTgeraldbaderY Combinator bets on the booming podcast industry via Kate ClarkML AI Analytic"                         
## [133] "geraldbaderY Combinator bets on the booming podcast industry via Kate ClarkML AI"                                    
## [134] "Y Combinator bets on the booming podcast industry"                                                                   
## [135] "Y Combinator bets on the booming podcast industry"                                                                   
## [136] "Y Combinator bets on the booming podcast industry"                                                                   
## [137] "RTY Combinator bets on the booming podcast industry"                                                                 
## [138] "Y Combinator bets on the booming podcast industry"                                                                   
## [139] "Y Combinator bets on the booming podcast industry"                                                                   
## [140] "Y Combinator bets on the booming podcast industry"                                                                   
## [141] "Y Combinator bets on the booming podcast industry"                                                                   
## [142] "Y Combinator bets on the booming podcast industry"                                                                   
## [143] "Y Combinator bets on the booming podcast industry"                                                                   
## [144] "Y Combinator bets on the booming podcast industry"                                                                   
## [145] "Y Combinator bets on the booming podcast industry"                                                                   
## [146] "Y Combinator bets on the booming podcast industry"                                                                   
## [147] "TECHCRUNCH Y Combinator bets on the booming podcast industry"                                                        
## [148] "Y Combinator bets on the booming podcast industry"                                                                   
## [149] "Y Combinator bets on the booming podcast industry via Kate ClarkML AI Analytics"                                     
## [150] "Y Combinator bets on the booming podcast industry"                                                                   
## [151] "Y Combinator bets on the booming podcast industry"                                                                   
## [152] "knownews techcrunch Y Combinator bets on the booming podcast industryMore tech news at"                              
## [153] "Y Combinator bets on the booming podcast industry"                                                                   
## [154] "technews Y Combinator bets on the booming podcast industry"                                                          
## [155] "Y Combinator bets on the booming podcast industryVentureCapital Enteainment SanFrancisco Accelerator Funding"        
## [156] "Y Combinator bets on the booming podcast industry"                                                                   
## [157] "Y Combinator bets on the booming podcast industryseedaccelerator launchmarketing staups"                             
## [158] "Y Combinator bets on the booming podcast industry\n\nBrew helps creators get paid to podcast w"                      
## [159] "Y Combinator bets on the booming podcast industry"                                                                   
## [160] "Y Combinator bets on the booming podcast industryby TechCrunch infosec software technology"                          
## [161] "Y Combinator bets on the booming podcast industry Z"                                                                 
## [162] "Y Combinator bets on the booming podcast industry"                                                                   
## [163] "RTY Combinator bets on the booming podcast industryby"                                                               
## [164] "RTY Combinator bets on the booming podcast industryby"                                                               
## [165] "Y Combinator bets on the booming podcast industrystaups"                                                             
## [166] "RTtechcrunch staup Y Combinator bets on the booming podcast industry"                                                
## [167] "Y Combinator bets on the booming podcast industry  TechCrunch"                                                       
## [168] "Y Combinator bets on the booming podcast industrystaups"                                                             
## [169] "Y Combinator bets on the booming podcast industrystaups"                                                             
## [170] "techcrunch staup Y Combinator bets on the booming podcast industry"                                                  
## [171] "Y Combinator bets on the booming podcast industry"                                                                   
## [172] "RT TechCrunch Y Combinator bets on the booming podcast industryby kateclarktweets"                                   
## [173] "Y Combinator bets on the booming podcast industrygrowcyber"                                                          
## [174] "Y Combinator bets on the booming podcast industry"                                                                   
## [175] "RTY Combinator bets on the booming podcast industryby"                                                               
## [176] "New TechCrunch Aicle Y Combinator bets on the booming podcast industry"                                              
## [177] "Y Combinator bets on the booming podcast industry"                                                                   
## [178] "Y Combinator bets on the booming podcast industry"                                                                   
## [179] "Y Combinator bets on the booming podcast industry \n\nPodcasts are exploding in popularity and"                      
## [180] "RT TechCrunchY Combinator bets on the booming podcast industryby kateclarktweets"                                    
## [181] "Y Combinator bets on the booming podcast industry  TechCrunch"                                                       
## [182] "RTY Combinator bets on the booming podcast industryby"                                                               
## [183] "Y Combinator bets on the booming podcast industry"                                                                   
## [184] "Y Combinator bets on the booming podcast industry"                                                                   
## [185] "Y Combinator bets on the booming podcast industry"                                                                   
## [186] "Y Combinator bets on the booming podcast industrynews Techcrunch Technology"                                         
## [187] "Y Combinator bets on the booming podcast industrystaups techcrunch venturecapital"                                   
## [188] "Y Combinator bets on the booming podcast industrystaup MBADMB"                                                       
## [189] "Y Combinator bets on the booming podcast industry"                                                                   
## [190] "Y Combinator bets on the booming podcast industryVia"                                                                
## [191] "RTY Combinator bets on the booming podcast industryby"                                                               
## [192] "Y Combinator bets on the booming podcast industryby kateclarktweets"                                                 
## [193] "Y Combinator bets on the booming podcast industryAccelerator Enteainment Funding"                                    
## [194] "Y Combinator bets on the booming podcast industry"                                                                   
## [195] "Y Combinator bets on the booming podcast industrystaup"                                                              
## [196] "RTY Combinator bets on the booming podcast industryby"                                                               
## [197] "RTY Combinator bets on the booming podcast industryby"                                                               
## [198] "Y Combinator bets on the booming podcast industryviaretweet retweetplease"                                           
## [199] "RTY Combinator bets on the booming podcast industryby"                                                               
## [200] "RTY Combinator bets on the booming podcast industryby"                                                               
## [201] "Y Combinator bets on the booming podcast industryby"

Prepare for Bigram

# Install the following packages 
library(dplyr)
library(tidyverse)      # data manipulation & plotting
library(stringr)        # text cleaning and regular expressions
library(tidytext)       # provides additional text mining functions

titles <- c("v")

books <- list(tweets)
  
series <- tibble()

 for(i in seq_along(titles)) {
        
        clean <- tibble(chapter = seq_along(books[[i]]),
                        text = books[[i]]) %>%
          # Number of gram
             unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
             mutate(book = titles[i]) %>%
             select(book, everything())

        series <- rbind(series, clean)
}

Bigram of the Data

temp1 = subset(series, book == 'v') %>%
        separate(bigram, c("word1", "word2"), sep = " ") %>%
      #  filter(!word1 %in% stop_words$word,
      #         !word2 %in% stop_words$word) %>%
        count(word1, word2, sort = TRUE)
temp1[1:20,]

## # A tibble: 20 x 3
##    word1       word2            n
##    <chr>       <chr>        <int>
##  1 y           combinator     769
##  2 sam         altman         215
##  3 on          the            212
##  4 bets        on             201
##  5 booming     podcast        201
##  6 the         booming        201
##  7 combinator  bets           200
##  8 airbnb      and            144
##  9 combinator  airbnb         144
## 10 transcribed y              139
## 11 podcast     industry       134
## 12 equity      transcribed    123
## 13 and         hoteltonight   109
## 14 in          the            109
## 15 combinator  president      108
## 16 president   sam            105
## 17 altman      is             102
## 18 demo        day             88
## 19 the         world           88
## 20 combinator  demo            85

See whether people talk about money in their tweets

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(dplyr)
library("plyr")

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise

## The following object is masked from 'package:purrr':
## 
##     compact

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

library("stringi")

money.words = scan('moneyWords.txt', what='character', comment.char=';')
score.topic = function(sentences, dict, .progress='none')
{
  
  # we got a vector of sentences. plyr will handle a list
  # or a vector as an "l" for us
  # we want a simple array of scores back, so we use
  # "l" + "a" + "ply" = "laply":
  scores = laply(sentences, function(sentence, dict) {
    
    # clean up sentences with R's regex-driven global substitute, gsub():
    sentence = gsub('[[:punct:]]', '', sentence)
    sentence = gsub('[[:cntrl:]]', '', sentence)
    sentence = gsub('\\d+', '', sentence)
    # and convert to lower case:
    # sentence = tolower(sentence)
    
    # split into words. str_split is in the stringr package
    word.list = str_split(sentence, '\\s+')
    # sometimes a list() is one level of hierarchy too much
    words = unlist(word.list)
    
    # compare our words to the dictionaries of positive & negative terms
    topic.matches = match(words, dict)
    
    # match() returns the position of the matched term or NA
    # we just want a TRUE/FALSE:
    topic.matches = !is.na(topic.matches)
    
    # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
    score = sum(topic.matches)
    
    return(score)
  }, dict, .progress=.progress )
  
  topicscores.df = data.frame(score=scores, text=sentences)
  return(topicscores.df)
}

topic.scores= score.topic(tweets, money.words, .progress='none')
# topic.scores= score.topic(Etweets, fear.words, .progress='none')

topic.mentioned = subset(topic.scores, score !=0)

N= nrow(topic.scores)
Nmentioned = nrow(topic.mentioned)

dftemp=data.frame(topic=c("Mentioned", "Not Mentioned"), 
                  number=c(Nmentioned,N-Nmentioned))

p <- plot_ly(data=dftemp, labels = ~topic, values = ~number, type = 'pie') %>%
  layout(title = 'Pie Chart of Tweets Talking about Money',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p

Check the emotion of the tweets

library(tidytext)

titles <- c("v")
books <- list(tweets)
series <- tibble()

# create a series of book with text lines
for(i in seq_along(titles)) {
  
  clean <- tibble(chapter = seq_along(books[[i]]),
                  text = books[[i]]) %>%
  #  unnest_tokens(word, text) %>%
    mutate(book = titles[i]) %>%
    select(book, everything())
  series <- rbind(series, clean)
}

# find tweets with "fear"
# other emotion to find
##########################
# anger     
# anticipation          
# disgust           
# fear              
# joy               
# sadness       
# surprise      
# trust
##########################

senti <- series %>%
        unnest_tokens(word, text) %>%
        inner_join(get_sentiments("nrc")) %>%
  filter(sentiment=="fear") %>%   # replace "fear" with other emotion words 
   group_by(chapter)

## Joining, by = "word"

sentitext = series[senti$chapter,]
sentitext$sentiment = senti$sentiment
sentitext

## # A tibble: 60 x 4
##    book  chapter text                                            sentiment
##    <chr>   <int> <chr>                                           <chr>    
##  1 v          18 Great job as always byon finding the diamonds … fear     
##  2 v          40 RTbacked byis solving the podcast monetization… fear     
##  3 v          64 RTbacked byis solving the podcast monetization… fear     
##  4 v          86 RTbacked byis solving the podcast monetization… fear     
##  5 v         205 RTbacked byis solving the podcast monetization… fear     
##  6 v         207 RTbacked byis solving the podcast monetization… fear     
##  7 v         212 backed byis solving the podcast monetization p… fear     
##  8 v         267 unexpected lessons Ive learned after going thr… fear     
##  9 v         273 The odds of getting from launch to liquidity w… fear     
## 10 v         392 I was hoping you were joking but oh god he get… fear     
## # ... with 50 more rows

Create word cloud of tweets showing fear

corpus = Corpus(VectorSource(sentitext$text))

# create term-document matrix
tdm = TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths=c(3,20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE, 
  # tolower may cause trouble on Window because UTF-8 encoding, changed to FALSE  
    tolower = FALSE) )

# convert as matrix. It may consume near 1g of your RAM
tdm = as.matrix(tdm)

# get word counts in decreasing order
word_freqs = sort(rowSums(tdm), decreasing=TRUE) 

#check top 50 most mentioned words
head(word_freqs, 50)

##      Combinator      Bottomless          Altman             Sam 
##              21              15              11              11 
##       president         problem        solution             one 
##              10               9               9               8 
##         addicts          coffee            lazy           worse 
##               8               8               8               7 
##            idea            plus          staups            byis 
##               7               7               7               6 
##    monetization         podcast         solving        building 
##               6               6               6               6 
##        RTbacked           staup      combinator    cappedprofit 
##               5               5               5               5 
##      companyand        terrible             Did          better 
##               5               5               5               5 
##            make        revealed       yesterday           Great 
##               5               5               5               4 
##             The     Combinators           Kahik            like 
##               4               4               4               4 
##           Staup        calculus             god            Beto 
##               4               4               3               3 
##          Beware            Folk WellIntentioned         network 
##               3               3               3               3 
##        powerful            tech           women          Former 
##               3               3               3               3 
##           First            Only 
##               3               3

#remove the top words which don’t generate insights such as "the", "a", "and", etc.
word_freqs = word_freqs[-(1:5)]  #Here “1:5” is 1st-5th words in the list we want to remove 

# create a data frame with words and their frequencies
dm = data.frame(word=names(word_freqs), freq=word_freqs)

#Plot corpus in a clored graph; need RColorBrewer package

wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))

#check top 50 most mentioned words
head(word_freqs, 50)

##         problem        solution             one         addicts 
##               9               9               8               8 
##          coffee            lazy           worse            idea 
##               8               8               7               7 
##            plus          staups            byis    monetization 
##               7               7               6               6 
##         podcast         solving        building        RTbacked 
##               6               6               6               5 
##           staup      combinator    cappedprofit      companyand 
##               5               5               5               5 
##        terrible             Did          better            make 
##               5               5               5               5 
##        revealed       yesterday           Great             The 
##               5               5               4               4 
##     Combinators           Kahik            like           Staup 
##               4               4               4               4 
##        calculus             god            Beto          Beware 
##               4               3               3               3 
##            Folk WellIntentioned         network        powerful 
##               3               3               3               3 
##            tech           women          Former           First 
##               3               3               3               3 
##            Only         burning           great            need 
##               3               3               3               3 
##         succeed           takes 
##               3               3

# I see some words I don't know or understand, so I retrieve the tweets that have the words
# I retrieve all the tweets that have "nigeria" in it

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

CS695： working session R notebook

I saw ‘booming’ on the work cloud and want to tweets contain it

See whether people talk about money in their tweets

Check the emotion of the tweets