This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
Install necessary packages. Comment after installation
#install.packages('tm')
#install.packages('RColorBrewer')
#install.packages('wordcloud')
Include the packages.
library('tm')
## Loading required package: NLP
library('RColorBrewer')
library('wordcloud')
Process data
# entrepreneurshipData <- readRDS("entrepreneurship.RDS")
Data <- readRDS("YCombinator.RDS")
# Etweets <- entrepreneurshipData$text
tweets <- Data$text
# swap out all non-alphanumeric characters
# Note that the definition of what constitutes a letter or a number or a punctuatution mark varies slightly depending upon your locale, so you may need to experiment a little to get exactly what you want.
# str_replace_all(tweets, "[^[:alnum:]]", " ")
# iconv(tweets, from = 'UTF-8', to = 'ASCII//TRANSLIT')
# Encoding(tweets) <- "UTF-8"
# Function to clean tweets
clean.text = function(x)
{
# remove rt
x = gsub("rt", "", x)
# remove at
x = gsub("@\\w+", "", x)
# remove punctuation
x = gsub("[[:punct:]]", "", x)
# remove numbers
x = gsub("[[:digit:]]", "", x)
# remove links http
x = gsub("http\\w+", "", x)
# remove tabs
x = gsub("[ |\t]{2,}", "", x)
# remove blank spaces at the beginning
x = gsub("^ ", "", x)
# remove blank spaces at the end
x = gsub(" $", "", x)
# tolower
# x = tolower(x)
return(x)
}
# clean tweets
tweets = clean.text(tweets)
Create word cloud of tweets
corpus = Corpus(VectorSource(tweets))
# create term-document matrix
tdm = TermDocumentMatrix(
corpus,
control = list(
wordLengths=c(3,20),
removePunctuation = TRUE,
stopwords = c("the", "a", stopwords("english")),
removeNumbers = TRUE,
# tolower may cause trouble on Window because UTF-8 encoding, changed to FALSE
tolower = FALSE) )
# convert as matrix. It may consume near 1g of your RAM
tdm = as.matrix(tdm)
# get word counts in decreasing order
word_freqs = sort(rowSums(tdm), decreasing=TRUE)
#check top 50 most mentioned words
head(word_freqs, 50)
## Combinator Altman Sam podcast bets
## 837 253 228 208 202
## booming Airbnb president transcribed Equity
## 201 158 157 144 136
## staup industry HotelTonight world Day
## 127 123 107 90 88
## Demo capitalism two love Berkshire
## 88 88 86 85 84
## energizing events Hathaways RTIf RTY
## 84 84 83 83 78
## San stepping steps Francisco accelerator
## 57 56 56 55 50
## Silicon Staup Combinators money making
## 50 48 45 44 43
## Can President changes amid series
## 43 43 41 40 40
## Changes The model arguing Valley
## 40 39 39 37 36
## move incubator investors staups step
## 35 34 34 32 32
#remove the top words which don’t generate insights such as "the", "a", "and", etc.
word_freqs = word_freqs[-(1:2)] #Here “1:5” is 1st-5th words in the list we want to remove
# create a data frame with words and their frequencies
dm = data.frame(word=names(word_freqs), freq=word_freqs)
#Plot corpus in a clored graph; need RColorBrewer package
wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))
#check top 50 most mentioned words
head(word_freqs, 50)
## Sam podcast bets booming Airbnb
## 228 208 202 201 158
## president transcribed Equity staup industry
## 157 144 136 127 123
## HotelTonight world Day Demo capitalism
## 107 90 88 88 88
## two love Berkshire energizing events
## 86 85 84 84 84
## Hathaways RTIf RTY San stepping
## 83 83 78 57 56
## steps Francisco accelerator Silicon Staup
## 56 55 50 50 48
## Combinators money making Can President
## 45 44 43 43 43
## changes amid series Changes The
## 41 40 40 40 39
## model arguing Valley move incubator
## 39 37 36 35 34
## investors staups step RTSam OpenAI
## 34 32 32 31 30
# I see some words I don't know or understand, so I retrieve the tweets that have the words
# I retrieve all the tweets that have "nigeria" in it
index = grep("booming", tweets)
tweets[index]
## [1] "Y Combinator bets on the booming podcast industryvia"
## [2] "Y Combinator bets on the booming podcast industry"
## [3] "Y Combinator bets on the booming podcast industry"
## [4] "Y Combinator bets on the booming podcast industry"
## [5] "RTY Combinator bets on the booming podcast industrytech staup"
## [6] "Y Combinator bets on the booming podcast industry"
## [7] "RTY Combinator bets on the booming podcast industryby"
## [8] "Y Combinator bets on the booming podcast industry\n\nStaup News Tech"
## [9] "Tech News Y Combinator bets on the booming podcast industry \nBrew helps creators get paid to podcast with a subscr"
## [10] "Y Combinator bets on the booming podcast industry"
## [11] "RTY Combinator bets on the booming podcast industry featuring Ws"
## [12] "Y Combinator bets on the booming podcast industry\n\nStaup News Tech"
## [13] "Y Combinator bets on the booming podcast industryvia"
## [14] "RTY Combinator bets on the booming podcast industry featuring Ws"
## [15] "Y Combinator bets on the booming podcast industryTechCrunch"
## [16] "Y Combinator bets on the booming podcast industry"
## [17] "Y Combinator bets on the booming podcast industrytech"
## [18] "RTStrato Y Combinator bets on the booming podcast industrystaup entrepreneur"
## [19] "Strato Y Combinator bets on the booming podcast industrystaup entrepreneur"
## [20] "Tech Crunch Staup news Y Combinator bets on the booming podcast industry"
## [21] "Y Combinator bets on the booming podcast industryTechCrunch"
## [22] "RTY Combinator bets on the booming podcast industryStaup TechCrunch"
## [23] "Y Combinator bets on the booming podcast industry"
## [24] "Y Combinator bets on the booming podcast industry"
## [25] "Y Combinator bets on the booming podcast industry"
## [26] "Y Combinator bets on the booming podcast industry"
## [27] "Y Combinator bets on the booming podcast industry"
## [28] "Y Combinator bets on the booming podcast industry"
## [29] "Y Combinator bets on the booming podcast industry"
## [30] "RTY Combinator bets on the booming podcast industryStaups Tech entrepreneurs"
## [31] "Y Combinator bets on the booming podcast industryStaups Tech entrepreneurs"
## [32] "Y Combinator bets on the booming podcast industry"
## [33] "RTY Combinator bets on the booming podcast industryby"
## [34] "Y Combinator bets on the booming podcast industryStaups Tech entrepreneurs"
## [35] "RTY Combinator bets on the booming podcast industry featuring Ws"
## [36] "Y Combinator bets on the booming podcast industry"
## [37] "Y Combinator bets on the booming podcast industry\n"
## [38] "Y Combinator bets on the booming podcast industry"
## [39] "Y Combinator bets on the booming podcast industryviastaup"
## [40] "Y Combinator bets on the booming podcast industryStaup"
## [41] "Y Combinator bets on the booming podcast industry"
## [42] "Y Combinator bets on the booming podcast industry"
## [43] "RTY Combinator bets on the booming podcast industry featuring Ws"
## [44] "Y Combinator bets on the booming podcast industry \n\nPodcasts are exploding in popularity and Y Combinator the star"
## [45] "RTY Combinator bets on the booming podcast industry featuring Ws"
## [46] "Y Combinator bets on the booming podcast industrydi"
## [47] "Y Combinator bets on the booming podcast industry"
## [48] "RTY Combinator bets on the booming podcast industry featuring Ws"
## [49] "Y Combinator bets on the booming podcast industry staups news"
## [50] "Y Combinator bets on the booming podcast industryviaTechnology Love"
## [51] "RTY Combinator bets on the booming podcast industry featuring Ws"
## [52] "Y Combinator bets on the booming podcast industry\n"
## [53] "Y Combinator bets on the booming podcast industryTech"
## [54] "Y Combinator bets on the booming podcast industry"
## [55] "RTY Combinator bets on the booming podcast industry featuring Ws"
## [56] "Y Combinator bets on the booming podcast industry TechCrunch"
## [57] "Y Combinator bets on the booming podcast industrytechnology"
## [58] "Y Combinator bets on the booming podcast industry"
## [59] "Y Combinator bets on the booming podcast industry"
## [60] "Y Combinator bets on the booming podcast industrytech staup"
## [61] "RTY Combinator bets on the booming podcast industry featuring Ws"
## [62] "Y Combinator bets on the booming podcast industry TechCrunch"
## [63] "Y Combinator bets on the booming podcast industryVentureCanvas"
## [64] "Y Combinator bets on the booming podcast industry staup marketing business creativelysma"
## [65] "Y Combinator bets on the booming podcast industryTechCrunch"
## [66] "Y Combinator bets on the booming podcast industry"
## [67] "Y Combinator bets on the booming podcast industrybusiness staup"
## [68] "Y Combinator bets on the booming podcast industry"
## [69] "Y Combinator bets on the booming podcast industry"
## [70] "Y Combinator bets on the booming podcast industry"
## [71] "Y Combinator bets on the booming podcast industry"
## [72] "Y Combinator bets on the booming podcast industry"
## [73] "Y Combinator bets on the booming podcast industrytech business money"
## [74] "Y Combinator bets on the booming podcast industry staup marketing business creativelysma"
## [75] "Y Combinator bets on the booming podcast industryStaup TechCrunch"
## [76] "Y Combinator bets on the booming podcast industryOPSItalia"
## [77] "Y Combinator bets on the booming podcast industry"
## [78] "Y Combinator bets on the booming podcast industry"
## [79] "staups Y Combinator bets on the booming podcast industry"
## [80] "Y Combinator bets on the booming podcast industrystaup shenzhen troublemaker"
## [81] "Y Combinator bets on the booming podcast industry"
## [82] "Y Combinator bets on the booming podcast industry"
## [83] "Y Combinator bets on the booming podcast industry"
## [84] "Y Combinator bets on the booming podcast industry"
## [85] "Y Combinator bets on the booming podcast industry"
## [86] "Y Combinator bets on the booming podcast industrystaups\n\nPodcasts are exploding in popu"
## [87] "Y Combinator bets on the booming podcast industrywhat do you think of this"
## [88] "TC Staup newsY Combinator bets on the booming podcast industry"
## [89] "Y Combinator bets on the booming podcast industry"
## [90] "sta up Y Combinator bets on the booming podcast industry"
## [91] "Y Combinator bets on the booming podcast industry"
## [92] "Y Combinator bets on the booming podcast industrymarketing leaderhship entrepreneur"
## [93] "Y Combinator bets on the booming podcast industry"
## [94] "Y Combinator bets on the booming podcast industry TechCrunchStaups"
## [95] "Y Combinator bets on the booming podcast industry"
## [96] "RTY Combinator bets on the booming podcast industry featuring Ws"
## [97] "Combinator bets on the booming podcast industry"
## [98] "my repost see you Y Combinator bets on the booming podcast industry"
## [99] "Y Combinator bets on the booming podcast industry staup staupvic"
## [100] "Y Combinator bets on the booming podcast industry"
## [101] "Y Combinator bets on the booming podcast industrystaups"
## [102] "Y Combinator bets on the booming podcast industry"
## [103] "Y Combinator bets on the booming podcast industry"
## [104] "Y Combinator bets on the booming podcast industry"
## [105] "Y Combinator bets on the booming podcast industry"
## [106] "Y Combinator bets on the booming podcast industrystaup investment market"
## [107] "Y Combinator bets on the booming podcast industryStaup"
## [108] "Y Combinator bets on the booming podcast industrystaups entrepreneurship via Kate Clark"
## [109] "Y Combinator bets on the booming podcast industryAngelNews staup"
## [110] "Y Combinator bets on the booming podcast industry"
## [111] "Y Combinator bets on the booming podcast industrystaups"
## [112] "Y Combinator bets on the booming podcast industry"
## [113] "Y Combinator bets on the booming podcast industry"
## [114] "Y Combinator bets on the booming podcast industry\n\nPodcasts are exploding in popularity and Y Combinator the sta"
## [115] "Y Combinator bets on the booming podcast industry"
## [116] "RTY Combinator bets on the booming podcast industry featuring Ws"
## [117] "Y Combinator bets on the booming podcast industry"
## [118] "bets on the booming podcast industryTechCrunch"
## [119] "Y Combinator bets on the booming podcast industry"
## [120] "Y Combinator bets on the booming podcast industry"
## [121] "Y Combinator bets on the booming podcast industrystaup"
## [122] "Y Combinator bets on the booming podcast industry\nMaybe twitter founderwas ahead of his time with Odeo"
## [123] "Y Combinator bets on the booming podcast industry Kate ClarkTechCrunch"
## [124] "RTY Combinator bets on the booming podcast industry featuring Ws"
## [125] "Y Combinator bets on the booming podcast industry by kateclarktweets"
## [126] "Y Combinator bets on the booming podcast industry featuring Ws"
## [127] "Y Combinator bets on the booming podcast industry"
## [128] "Y Combinator bets on the booming podcast industry"
## [129] "Y Combinator bets on the booming podcast industry"
## [130] "Y Combinator bets on the booming podcast industry"
## [131] "Y Combinator bets on the booming podcast industry"
## [132] "RTgeraldbaderY Combinator bets on the booming podcast industry via Kate ClarkML AI Analytic"
## [133] "geraldbaderY Combinator bets on the booming podcast industry via Kate ClarkML AI"
## [134] "Y Combinator bets on the booming podcast industry"
## [135] "Y Combinator bets on the booming podcast industry"
## [136] "Y Combinator bets on the booming podcast industry"
## [137] "RTY Combinator bets on the booming podcast industry"
## [138] "Y Combinator bets on the booming podcast industry"
## [139] "Y Combinator bets on the booming podcast industry"
## [140] "Y Combinator bets on the booming podcast industry"
## [141] "Y Combinator bets on the booming podcast industry"
## [142] "Y Combinator bets on the booming podcast industry"
## [143] "Y Combinator bets on the booming podcast industry"
## [144] "Y Combinator bets on the booming podcast industry"
## [145] "Y Combinator bets on the booming podcast industry"
## [146] "Y Combinator bets on the booming podcast industry"
## [147] "TECHCRUNCH Y Combinator bets on the booming podcast industry"
## [148] "Y Combinator bets on the booming podcast industry"
## [149] "Y Combinator bets on the booming podcast industry via Kate ClarkML AI Analytics"
## [150] "Y Combinator bets on the booming podcast industry"
## [151] "Y Combinator bets on the booming podcast industry"
## [152] "knownews techcrunch Y Combinator bets on the booming podcast industryMore tech news at"
## [153] "Y Combinator bets on the booming podcast industry"
## [154] "technews Y Combinator bets on the booming podcast industry"
## [155] "Y Combinator bets on the booming podcast industryVentureCapital Enteainment SanFrancisco Accelerator Funding"
## [156] "Y Combinator bets on the booming podcast industry"
## [157] "Y Combinator bets on the booming podcast industryseedaccelerator launchmarketing staups"
## [158] "Y Combinator bets on the booming podcast industry\n\nBrew helps creators get paid to podcast w"
## [159] "Y Combinator bets on the booming podcast industry"
## [160] "Y Combinator bets on the booming podcast industryby TechCrunch infosec software technology"
## [161] "Y Combinator bets on the booming podcast industry Z"
## [162] "Y Combinator bets on the booming podcast industry"
## [163] "RTY Combinator bets on the booming podcast industryby"
## [164] "RTY Combinator bets on the booming podcast industryby"
## [165] "Y Combinator bets on the booming podcast industrystaups"
## [166] "RTtechcrunch staup Y Combinator bets on the booming podcast industry"
## [167] "Y Combinator bets on the booming podcast industry TechCrunch"
## [168] "Y Combinator bets on the booming podcast industrystaups"
## [169] "Y Combinator bets on the booming podcast industrystaups"
## [170] "techcrunch staup Y Combinator bets on the booming podcast industry"
## [171] "Y Combinator bets on the booming podcast industry"
## [172] "RT TechCrunch Y Combinator bets on the booming podcast industryby kateclarktweets"
## [173] "Y Combinator bets on the booming podcast industrygrowcyber"
## [174] "Y Combinator bets on the booming podcast industry"
## [175] "RTY Combinator bets on the booming podcast industryby"
## [176] "New TechCrunch Aicle Y Combinator bets on the booming podcast industry"
## [177] "Y Combinator bets on the booming podcast industry"
## [178] "Y Combinator bets on the booming podcast industry"
## [179] "Y Combinator bets on the booming podcast industry \n\nPodcasts are exploding in popularity and"
## [180] "RT TechCrunchY Combinator bets on the booming podcast industryby kateclarktweets"
## [181] "Y Combinator bets on the booming podcast industry TechCrunch"
## [182] "RTY Combinator bets on the booming podcast industryby"
## [183] "Y Combinator bets on the booming podcast industry"
## [184] "Y Combinator bets on the booming podcast industry"
## [185] "Y Combinator bets on the booming podcast industry"
## [186] "Y Combinator bets on the booming podcast industrynews Techcrunch Technology"
## [187] "Y Combinator bets on the booming podcast industrystaups techcrunch venturecapital"
## [188] "Y Combinator bets on the booming podcast industrystaup MBADMB"
## [189] "Y Combinator bets on the booming podcast industry"
## [190] "Y Combinator bets on the booming podcast industryVia"
## [191] "RTY Combinator bets on the booming podcast industryby"
## [192] "Y Combinator bets on the booming podcast industryby kateclarktweets"
## [193] "Y Combinator bets on the booming podcast industryAccelerator Enteainment Funding"
## [194] "Y Combinator bets on the booming podcast industry"
## [195] "Y Combinator bets on the booming podcast industrystaup"
## [196] "RTY Combinator bets on the booming podcast industryby"
## [197] "RTY Combinator bets on the booming podcast industryby"
## [198] "Y Combinator bets on the booming podcast industryviaretweet retweetplease"
## [199] "RTY Combinator bets on the booming podcast industryby"
## [200] "RTY Combinator bets on the booming podcast industryby"
## [201] "Y Combinator bets on the booming podcast industryby"
Prepare for Bigram
# Install the following packages
library(dplyr)
library(tidyverse) # data manipulation & plotting
library(stringr) # text cleaning and regular expressions
library(tidytext) # provides additional text mining functions
titles <- c("v")
books <- list(tweets)
series <- tibble()
for(i in seq_along(titles)) {
clean <- tibble(chapter = seq_along(books[[i]]),
text = books[[i]]) %>%
# Number of gram
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
mutate(book = titles[i]) %>%
select(book, everything())
series <- rbind(series, clean)
}
Bigram of the Data
temp1 = subset(series, book == 'v') %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
# filter(!word1 %in% stop_words$word,
# !word2 %in% stop_words$word) %>%
count(word1, word2, sort = TRUE)
temp1[1:20,]
## # A tibble: 20 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 y combinator 769
## 2 sam altman 215
## 3 on the 212
## 4 bets on 201
## 5 booming podcast 201
## 6 the booming 201
## 7 combinator bets 200
## 8 airbnb and 144
## 9 combinator airbnb 144
## 10 transcribed y 139
## 11 podcast industry 134
## 12 equity transcribed 123
## 13 and hoteltonight 109
## 14 in the 109
## 15 combinator president 108
## 16 president sam 105
## 17 altman is 102
## 18 demo day 88
## 19 the world 88
## 20 combinator demo 85
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
library("plyr")
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:purrr':
##
## compact
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library("stringi")
money.words = scan('moneyWords.txt', what='character', comment.char=';')
score.topic = function(sentences, dict, .progress='none')
{
# we got a vector of sentences. plyr will handle a list
# or a vector as an "l" for us
# we want a simple array of scores back, so we use
# "l" + "a" + "ply" = "laply":
scores = laply(sentences, function(sentence, dict) {
# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
# sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
# compare our words to the dictionaries of positive & negative terms
topic.matches = match(words, dict)
# match() returns the position of the matched term or NA
# we just want a TRUE/FALSE:
topic.matches = !is.na(topic.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(topic.matches)
return(score)
}, dict, .progress=.progress )
topicscores.df = data.frame(score=scores, text=sentences)
return(topicscores.df)
}
topic.scores= score.topic(tweets, money.words, .progress='none')
# topic.scores= score.topic(Etweets, fear.words, .progress='none')
topic.mentioned = subset(topic.scores, score !=0)
N= nrow(topic.scores)
Nmentioned = nrow(topic.mentioned)
dftemp=data.frame(topic=c("Mentioned", "Not Mentioned"),
number=c(Nmentioned,N-Nmentioned))
p <- plot_ly(data=dftemp, labels = ~topic, values = ~number, type = 'pie') %>%
layout(title = 'Pie Chart of Tweets Talking about Money',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
library(tidytext)
titles <- c("v")
books <- list(tweets)
series <- tibble()
# create a series of book with text lines
for(i in seq_along(titles)) {
clean <- tibble(chapter = seq_along(books[[i]]),
text = books[[i]]) %>%
# unnest_tokens(word, text) %>%
mutate(book = titles[i]) %>%
select(book, everything())
series <- rbind(series, clean)
}
# find tweets with "fear"
# other emotion to find
##########################
# anger
# anticipation
# disgust
# fear
# joy
# sadness
# surprise
# trust
##########################
senti <- series %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("nrc")) %>%
filter(sentiment=="fear") %>% # replace "fear" with other emotion words
group_by(chapter)
## Joining, by = "word"
sentitext = series[senti$chapter,]
sentitext$sentiment = senti$sentiment
sentitext
## # A tibble: 60 x 4
## book chapter text sentiment
## <chr> <int> <chr> <chr>
## 1 v 18 Great job as always byon finding the diamonds … fear
## 2 v 40 RTbacked byis solving the podcast monetization… fear
## 3 v 64 RTbacked byis solving the podcast monetization… fear
## 4 v 86 RTbacked byis solving the podcast monetization… fear
## 5 v 205 RTbacked byis solving the podcast monetization… fear
## 6 v 207 RTbacked byis solving the podcast monetization… fear
## 7 v 212 backed byis solving the podcast monetization p… fear
## 8 v 267 unexpected lessons Ive learned after going thr… fear
## 9 v 273 The odds of getting from launch to liquidity w… fear
## 10 v 392 I was hoping you were joking but oh god he get… fear
## # ... with 50 more rows
Create word cloud of tweets showing fear
corpus = Corpus(VectorSource(sentitext$text))
# create term-document matrix
tdm = TermDocumentMatrix(
corpus,
control = list(
wordLengths=c(3,20),
removePunctuation = TRUE,
stopwords = c("the", "a", stopwords("english")),
removeNumbers = TRUE,
# tolower may cause trouble on Window because UTF-8 encoding, changed to FALSE
tolower = FALSE) )
# convert as matrix. It may consume near 1g of your RAM
tdm = as.matrix(tdm)
# get word counts in decreasing order
word_freqs = sort(rowSums(tdm), decreasing=TRUE)
#check top 50 most mentioned words
head(word_freqs, 50)
## Combinator Bottomless Altman Sam
## 21 15 11 11
## president problem solution one
## 10 9 9 8
## addicts coffee lazy worse
## 8 8 8 7
## idea plus staups byis
## 7 7 7 6
## monetization podcast solving building
## 6 6 6 6
## RTbacked staup combinator cappedprofit
## 5 5 5 5
## companyand terrible Did better
## 5 5 5 5
## make revealed yesterday Great
## 5 5 5 4
## The Combinators Kahik like
## 4 4 4 4
## Staup calculus god Beto
## 4 4 3 3
## Beware Folk WellIntentioned network
## 3 3 3 3
## powerful tech women Former
## 3 3 3 3
## First Only
## 3 3
#remove the top words which don’t generate insights such as "the", "a", "and", etc.
word_freqs = word_freqs[-(1:5)] #Here “1:5” is 1st-5th words in the list we want to remove
# create a data frame with words and their frequencies
dm = data.frame(word=names(word_freqs), freq=word_freqs)
#Plot corpus in a clored graph; need RColorBrewer package
wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))
#check top 50 most mentioned words
head(word_freqs, 50)
## problem solution one addicts
## 9 9 8 8
## coffee lazy worse idea
## 8 8 7 7
## plus staups byis monetization
## 7 7 6 6
## podcast solving building RTbacked
## 6 6 6 5
## staup combinator cappedprofit companyand
## 5 5 5 5
## terrible Did better make
## 5 5 5 5
## revealed yesterday Great The
## 5 5 4 4
## Combinators Kahik like Staup
## 4 4 4 4
## calculus god Beto Beware
## 4 3 3 3
## Folk WellIntentioned network powerful
## 3 3 3 3
## tech women Former First
## 3 3 3 3
## Only burning great need
## 3 3 3 3
## succeed takes
## 3 3
# I see some words I don't know or understand, so I retrieve the tweets that have the words
# I retrieve all the tweets that have "nigeria" in it
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.