In this natural language processing exercise, we will build a word cloud from live Twitter data in R.
# Load packages
library(twitteR)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(ggplot2)
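Before searchTwitter will return anything, we need to authenticate with the Twitter API. A minimal sketch, assuming you have registered a Twitter app; the four credential strings below are placeholders for your own keys:

# Authenticate with the Twitter API (placeholders -- substitute your own credentials)
setup_twitter_oauth(consumer_key = 'CONSUMER_KEY',
                    consumer_secret = 'CONSUMER_SECRET',
                    access_token = 'ACCESS_TOKEN',
                    access_secret = 'ACCESS_SECRET')

Now we can pull 1,000 recent English-language tweets that mention ‘football’: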
football.tweets <- searchTwitter('football', n = 1000, lang = 'en')
# Extract the text from each status object returned by the search
football.text <- sapply(football.tweets, function(x) x$getText())
Let’s convert the text from UTF-8 to ASCII to strip emoticons and any other non-ASCII characters; tweets that cannot be converted will come back as NA.
football.text <- iconv(football.text, 'UTF-8', 'ASCII')
Let’s look at the first ten tweets in our data.
football.text[1:10]
## [1] NA
## [2] "RT @RaveenTheDream: He's an Atlanta celebrity from Atlanta at a football game in Atlanta. https://t.co/xKC2V6IA7g"
## [3] "RT @saraholmesSTL: NFL commentary from @jthom1: 'All's well with the NFL if the rich get richer' https://t.co/mBKoFm0rAa | @stltoday #nfl"
## [4] "Im sorry but I hate football"
## [5] "Bill Snyder is college football's greatest asset https://t.co/kYoUacTNPL"
## [6] NA
## [7] "RT @kananmj: This makes my heart happy. Congrats to @CoachTimLester, to @WMU_Football and all of Bronco Nation! https://t.co/2S13MYIKOr"
## [8] "Sukoa Sports Ball Pump with Pin Needle - Soccer, Volleyball, Basketball, Rugby, Football - Superior.. https://t.co/LNkvJyfjQe"
## [9] "RT @HilariousRoasts: This is what football is all about. https://t.co/iWCt8NMhNx"
## [10] "#TEAM SATURDAYENTRY Football"
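The NA entries are tweets that iconv could not convert, since it returns NA for any string containing unconvertible characters. If we prefer a clean corpus, we can drop them before the next step (alternatively, passing sub = '' to iconv would keep those tweets and strip only the offending characters):

# Drop tweets that became NA during the encoding conversion
football.text <- football.text[!is.na(football.text)]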
Let’s create a corpus from a vector source of the cleaned text, then build a term-document matrix, removing punctuation, numbers, and stop words along the way.
football.corpus <- Corpus(VectorSource(football.text))
# Build the term-document matrix, removing punctuation, numbers, and stop words
football.term.doc <- TermDocumentMatrix(football.corpus,
                                        control = list(removePunctuation = TRUE,
                                                       stopwords = c('football', 'soccer', 'girls', 'boys', 'like',
                                                                     'came', 'get', 'game', 'play',
                                                                     stopwords('en')),
                                                       removeNumbers = TRUE, tolower = TRUE))
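Before going further, it’s worth a quick sanity check on the matrix; dim() reports the number of terms and documents, and inspect() shows a slice of the counts (output omitted, since it depends on the live search):

# Dimensions of the term-document matrix: terms x documents
dim(football.term.doc)
# Peek at the counts for the first five terms in the first five tweets
inspect(football.term.doc[1:5, 1:5])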
Let’s look at some frequently used words in our Twitter data, then find other words that are associated with them.
# Find words with a frequency >= 20
findFreqTerms(football.term.doc, lowfreq = 20)
## [1] "atlanta" "barstoolsports" "boyfriend"
## [4] "celebrity" "hes" "httpstconrjiwitd"
## [7] "httpstcovmrtzieyl" "httpstcoxkcviag" "just"
## [10] "lil" "nfl" "raveenthedream"
## [13] "see" "texans" "watch"
## [16] "watching" "whatever" "worldstarfunny"
# Find words that are highly associated with the word 'atlanta'
findAssocs(football.term.doc, 'atlanta', corlimit=0.4)
## $atlanta
## celebrity httpstcoxkcviag raveenthedream hes
## 1.00 1.00 1.00 0.91
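We could run the same check for any other frequent term, for example ‘nfl’ from the list above (output omitted, since it varies with the live data):

# Find words correlated with 'nfl' at 0.4 or higher
findAssocs(football.term.doc, 'nfl', corlimit = 0.4)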
The term-document matrix is a sparse TermDocumentMatrix object, not a plain matrix. Let’s convert it into one so we can work with the counts directly.
football.matrix <- as.matrix(football.term.doc)
Let’s get the total count of each word, sorted in decreasing order of frequency.
term.freq <- sort(rowSums(football.matrix), decreasing = TRUE)
Let’s create a data frame of the words and their frequencies
football.df <- data.frame(term = names(term.freq), freq = term.freq)
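Since term.freq was already sorted, a peek at the top of the data frame confirms the most frequent terms (output omitted, since it depends on the live search):

# The ten most frequent terms and their counts
head(football.df, 10)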
Let’s create a bar plot of the words with frequencies greater than 20.
ggplot(subset(football.df, freq > 20), aes(term, freq, fill = freq)) +
  geom_bar(stat = 'identity') + labs(x = 'Terms', y = 'Count', title = 'Term Frequencies') + coord_flip()
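By default the bars appear in alphabetical order by term; if we would rather sort them by frequency, reorder() is a small variation on the same plot:

# Same plot, with bars ordered by frequency instead of alphabetically
ggplot(subset(football.df, freq > 20), aes(reorder(term, freq), freq, fill = freq)) +
  geom_bar(stat = 'identity') + labs(x = 'Terms', y = 'Count', title = 'Term Frequencies') + coord_flip()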
Let’s create a word cloud of up to 200 words with a minimum frequency of 5.
wordcloud(football.df$term, football.df$freq, min.freq=5, max.words=200,
random.order=FALSE, colors=brewer.pal(8, 'Dark2'))
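To save the cloud as an image, we can wrap the call in a graphics device; the file name is just an example, and set.seed() makes the randomized layout reproducible:

# Write the word cloud to a PNG file with a reproducible layout
set.seed(42)
png('football_wordcloud.png', width = 800, height = 600)
wordcloud(football.df$term, football.df$freq, min.freq = 5, max.words = 200,
          random.order = FALSE, colors = brewer.pal(8, 'Dark2'))
dev.off()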