This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Install necessary packages. Comment after installation

#install.packages('tm')
#install.packages('RColorBrewer')
#install.packages('wordcloud')
#install.packages("slam", type = "binary")

Include the packages.

library('tm')
## Loading required package: NLP
library('RColorBrewer')
library('wordcloud')
library('slam')

Process data

entrepreneurshipData <- readRDS("entrepreneurship.RDS")
tweets <- entrepreneurshipData$text

# swap out all non-alphanumeric characters
# Note that the definition of what constitutes a letter or a number or a punctuatution mark varies slightly depending upon your locale, so you may need to experiment a little to get exactly what you want.
# str_replace_all(tweets, "[^[:alnum:]]", " ")
# iconv(tweets, from = 'UTF-8', to = 'ASCII//TRANSLIT')
# Encoding(tweets)  <- "UTF-8"

# Function to clean tweets
clean.text = function(x)
{
  # remove rt
  x = gsub("rt", "", x)
  # remove at
  x = gsub("@\\w+", "", x)
  # remove punctuation
  x = gsub("[[:punct:]]", "", x)
  # remove numbers
  x = gsub("[[:digit:]]", "", x)
  # remove links http
  x = gsub("http\\w+", "", x)
  # remove tabs
  x = gsub("[ |\t]{2,}", "", x)
  # remove blank spaces at the beginning
  x = gsub("^ ", "", x)
  # remove blank spaces at the end
  x = gsub(" $", "", x)
  # tolower
  x = tolower(x)
  return(x)
}

# clean tweets
tweets = clean.text(tweets)

Create word cloud of tweets

corpus = Corpus(VectorSource(tweets))

# create term-document matrix
tdm = TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths=c(3,20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE, 
  # tolower may cause trouble on Window because UTF-8 encoding, changed to FALSE  
    tolower = FALSE) )

# convert as matrix. It may consume near 1g of your RAM
tdm = as.matrix(tdm)

# get word counts in decreasing order
word_freqs = sort(rowSums(tdm), decreasing=TRUE) 

#check top 50 most mentioned words
head(word_freqs, 50)
## entrepreneurship         business              amp    entrepreneurs 
##             3641             1319              966              632 
##             will     entrepreneur             help              can 
##              589              516              474              437 
##            young              new       innovation        marketing 
##              409              395              371              354 
##              now             grow             know       government 
##              349              340              339              336 
##            staup           social          nigeria           person 
##              332              318              314              313 
##             rtin             fast      appointment           staups 
##              311              307              301              298 
##          epitome            think           people          success 
##              296              293              260              259 
##              get       leadership            great            women 
##              250              248              247              239 
##      development        education            rtthe          someone 
##              225              220              219              217 
##            youth             join              sta         students 
##              215              215              215              214 
##             like            first         entrepre         inspired 
##              213              209              208              205 
##       businesses             time           appeal             lose 
##              203              193              191              191 
##              one           mallya 
##              189              185
#remove the top words which don’t generate insights such as "the", "a", "and", etc.
word_freqs = word_freqs[-(1:5)]  #Here “1:5” is 1st-5th words in the list we want to remove 

# create a data frame with words and their frequencies
dm = data.frame(word=names(word_freqs), freq=word_freqs)

#Plot corpus in a clored graph; need RColorBrewer package

wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))

#check top 50 most mentioned words
head(word_freqs, 50)
## entrepreneur         help          can        young          new 
##          516          474          437          409          395 
##   innovation    marketing          now         grow         know 
##          371          354          349          340          339 
##   government        staup       social      nigeria       person 
##          336          332          318          314          313 
##         rtin         fast  appointment       staups      epitome 
##          311          307          301          298          296 
##        think       people      success          get   leadership 
##          293          260          259          250          248 
##        great        women  development    education        rtthe 
##          247          239          225          220          219 
##      someone        youth         join          sta     students 
##          217          215          215          215          214 
##         like        first     entrepre     inspired   businesses 
##          213          209          208          205          203 
##         time       appeal         lose          one       mallya 
##          193          191          191          189          185 
##       rtdear        vijay         just        skill     minister 
##          185          185          183          176          171
# I see some words I don't know or understand, so I retrieve the tweets that have the words
# I retrieve all the tweets that have "nigeria" in it

index = grep("solutions", tweets)
tweets[index]
##  [1] "rtpretty cool to seeglobal businessentrepreneurship and technology solutions interns together"                              
##  [2] "rtundp paners with tencent in an initiative connecting cities to solutions to foster innovation and entrepreneurship t"     
##  [3] "entrepreneurship and innovation to make provide lasting solutions to challenges facing kenya from food insecurity t"        
##  [4] "socent has reached beyond silicon valley amp utilizing its tools amp resources has found solutions to global problems"      
##  [5] "did the things this past week with a small group of ladiestalked entrepreneurship challenges and solutionsgav"              
##  [6] "rtpretty cool to seeglobal businessentrepreneurship and technology solutions interns together"                              
##  [7] "pretty cool to seeglobal businessentrepreneurship and technology solutions interns together"                                
##  [8] "rtcleantechcamp is a suppo program for entrepreneurship in the field of clean energy we look for solutions with"            
##  [9] "cleantechcamp is a suppo program for entrepreneurship in the field of clean energy we look for solutions w"                 
## [10] "rtgreat oppounity to promote entrepreneurship amp innovation in high school through designing solutions to improve mental h"
## [11] "rtgreat oppounity to promote entrepreneurship amp innovation in high school through designing solutions to improve mental h"
## [12] "rtgood morning\n\nrecruitment entrepreneurship teamtbs teamkhumo solutions wealth hireagraduate teamkhumo jobseekers"       
## [13] "congratulations you made it through january but how did you get on with your goals and resolutions heres how t"             
## [14] "rtgood morning\n\nrecruitment entrepreneurship teamtbs teamkhumo solutions wealth hireagraduate teamkhumo jobseekers"       
## [15] "good morning\n\nrecruitment entrepreneurship teamtbs teamkhumo solutions wealth hireagraduate teamkhumo"                    
## [16] "day two out of fivelooking forward to it yesterday was all about entrepreneurship problems and solutions"                   
## [17] "leading retirement solutions was recently accepted to the edward lowe foundation economic gardening program"                
## [18] "technoserve and argidius foundation expand panership to scale entrepreneurship solutions"                                   
## [19] "why does every comrade have to include these points in argument\n ad hominems\n government only solutions\n tax"            
## [20] "icymi lighting the way with innovative solutions five questions with latif jamani president of calgary lighting"            
## [21] "entrepreneurship practical solutions means a billion company entrepreneurship womenpreneurs womeninbiz"                     
## [22] "rtcreative solutions for food waste water conservation and physical therapy impress judges at rd annualpitch competition"   
## [23] "rthave you met our newest gsbi coho presenting our largest coho of socentsfocus on offgrid energy solutions"                
## [24] "percent of all new years resolutions never get fulfilled check out thisfor some t"                                          
## [25] "rtundp paners with tencent in an initiative connecting cities to solutions to foster innovation and entrepreneurship t"     
## [26] "creative solutions for food waste water conservation and physical therapy impress judges at rd annualpitch"                 
## [27] "rtattention calling on all students entrepreneurial structures at vut you are invited to engage in providing solutions"     
## [28] "e amp s home care solutions is hiring for various positions including home health aide registered nurse and more"           
## [29] "rtgreat oppounity to promote entrepreneurship amp innovation in high school through designing solutions to improve mental h"
## [30] "rtgreat oppounity to promote entrepreneurship amp innovation in high school through designing solutions to improve mental h"
## [31] "sta with problems not solutionsthe staupmedium\n\nentrepreneurship staup"

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.