# load necessary packages
library(stringi)
library(NLP)
library(tm)
library(wordcloud)
library(BiocManager)
library(stringr)
library(RWeka)



Load Data

Load the data from all sources: blogs, tweets, and news.

# Read every line of a UTF-8 text file.
#
# path: location of the text file to read.
# Returns a character vector with one element per line.
#
# The connection is opened in binary mode so that control characters in the
# raw text cannot end the read early; encoding = "UTF-8" marks the strings as
# UTF-8 so typographic quotes are not mis-decoded into stray characters such
# as "â"; skipNul = TRUE skips embedded NUL bytes instead of cutting the line
# short with a warning.
readCorpusLines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}
# load the blogs
blogs <- readCorpusLines("final/en_US/en_US.blogs.txt") # read in the lines of the file
blogs <- as.data.frame(blogs) # convert the data to a data frame
dim(blogs)  # show the dimensions of the dataset
## [1] 899288      1
# load the tweets
tweets <- readCorpusLines("final/en_US/en_US.twitter.txt") # read in the lines of the file
tweets <- as.data.frame(tweets) # convert the data to a data frame
dim(tweets) # show the dimensions of the dataset
## [1] 2360148       1
# load the news
news <- readCorpusLines("final/en_US/en_US.news.txt") # read in the lines of the file
news <- as.data.frame(news) # convert the data to a data frame
dim(news) # show the dimensions of the dataset
# NOTE(review): the count below was recorded with the original text-mode read;
# it looks low next to the other two files, so that read may have stopped
# early -- re-run to confirm the row count produced by the binary-mode read
## [1] 77259     1


Because the dimensions of the data are large for the blogs, tweets, and news (899,288, 2,360,148, and 77,259, respectively), I’m going to take a subset of the data. This subset will be a random sample of 950 rows.

# draw 950 random rows from every source (fixed seed for reproducibility)
# and keep each draw as a one-column matrix
set.seed(199)
blogsSample <- as.matrix(blogs[sample(nrow(blogs), 950), 1])
tweetsSample <- as.matrix(tweets[sample(nrow(tweets), 950), 1])
newsSample <- as.matrix(news[sample(nrow(news), 950), 1])
# confirm the shape of every sample
dim(blogsSample)
## [1] 950   1
dim(tweetsSample)
## [1] 950   1
dim(newsSample)
## [1] 950   1


Now, I have a sample of 950 rows per document, which will be easier to deal with in regards to processing time.

Clean Data

I want to find the most common words in the documents; however, I have to clean the data before doing this to ensure accuracy. The first step is to expand contractions into two words (e.g. we’ve to we have, we’ll to we will, etc.).

# Expand English contractions and drop possessive/contracted "'s".
#
# contract: character vector (or character matrix) of raw text.
# Returns the text with contractions expanded; the attributes of the input
# (e.g. matrix dimensions) are kept because every step is a gsub() call.
#
# Matching ignores case so that sentence-initial forms ("Won't", "Can't") are
# expanded too, and each suffix rule is anchored at the end of a word so that
# quoted words such as 'so' or 'really' are left alone.
clean.contract <- function(contract) {
  # normalise typographic apostrophes to the ASCII apostrophe used by the
  # rules below; the three-character sequence is presumably what U+2019 looks
  # like when UTF-8 text was decoded as Windows-1252 -- TODO confirm against
  # the raw data
  contract <- gsub("\u00e2\u20ac\u2122|\u2018|\u2019", "'", contract)
  # legacy rule kept for backward compatibility: any remaining "â" becomes "'"
  # NOTE(review): this also rewrites a legitimate "â" inside a word
  contract <- gsub("\u00e2", "'", contract)

  # irregular forms first: the generic "n't" rule would otherwise turn
  # "won't" into "wo not" and "can't" into "ca not"
  expansions <- c(
    "\\bwon't\\b"  = "will not",
    "\\bcan't\\b"  = "cannot",
    "\\bshan't\\b" = "shall not",
    "n't\\b"       = " not",
    "'ll\\b"       = " will",
    "'re\\b"       = " are",
    "'ve\\b"       = " have",
    "'m\\b"        = " am",
    "'s\\b"        = ""
  )
  for (pattern in names(expansions)) {
    contract <- gsub(pattern, expansions[[pattern]], contract,
                     ignore.case = TRUE)
  }
  contract
}
# expand the contractions of every sample, then delete each character that is
# not a letter, digit, slash, apostrophe or space
blogsSample <- gsub("[^[:alnum:]///' ]", "", clean.contract(blogsSample))
tweetsSample <- gsub("[^[:alnum:]///' ]", "", clean.contract(tweetsSample))
newsSample <- gsub("[^[:alnum:]///' ]", "", clean.contract(newsSample))


Now that I have replaced all the contraction words, I’m going to take the following steps to clean the data:

1. Remove all punctuation from the documents
2. Turn all letters to lowercase
3. Remove all numbers
4. Remove all stop words
5. Remove all white space


# wrap every cleaned sample in a vector source (one document per element)
blogsSource <- VectorSource(blogsSample)
tweetsSource <- VectorSource(tweetsSample)
newsSource <- VectorSource(newsSample)
# build one volatile corpus per source
blogsCorpus <- VCorpus(blogsSource)
tweetsCorpus <- VCorpus(tweetsSource)
newsCorpus <- VCorpus(newsSource)
# create a function that will clean a corpus
cleanText <- function(corpus){
  # Apply the standard cleaning pipeline to a tm corpus and return the result:
  # punctuation out, lower case, numbers out, English stop words out,
  # runs of white space collapsed.

  # base helpers wrapped so that they act on the document content
  toLowerCase <- content_transformer(tolower)
  squeezeSpaces <- content_transformer(stripWhitespace)

  noPunctuation <- tm_map(corpus, removePunctuation)
  lowered <- tm_map(noPunctuation, toLowerCase)
  noNumbers <- tm_map(lowered, removeNumbers)
  noStopwords <- tm_map(noNumbers, removeWords, stopwords("en"))
  tm_map(noStopwords, squeezeSpaces)
}
# clean the sample data by running it through the cleanText function
clean_Blogs <- cleanText(blogsCorpus)
clean_Tweets <- cleanText(tweetsCorpus)
clean_News <- cleanText(newsCorpus)
# view the first line of each cleaned data
clean_Blogs[[1]][1]
## $content
## [1] " one novels bookshops must hate hard enough spec fic weird enough fantasy realistic humour section yet humorous shelve easily lit fic suspect going prove charm read singular take world will either resonate leave cold can recommend try itif like distinctive fiction rings bells blows whistles creeps absurdities book will satisfy "
clean_Tweets[[1]][1]
## $content
## [1] "mature folks like mee d"
clean_News[[1]][1]
## $content
## [1] "mrs obama also joined husband lunch small group veterans anna pizza italian kitchen strip mall restaurant hampton va"

# Count the words in the text of every document of a corpus.
#
# corpus: a tm corpus.
# Returns the total number of words over all documents.
#
# The text is extracted first: handing the corpus object itself to
# stri_count_words() makes it coerce whole document objects (content plus
# metadata) to strings, which inflates the totals.
countCorpusWords <- function(corpus) {
  docText <- vapply(
    corpus,
    function(doc) paste(content(doc), collapse = " "),
    character(1)
  )
  sum(stri_count_words(docText))
}
wc_blogs <- countCorpusWords(clean_Blogs)
wc_news <- countCorpusWords(clean_News)
wc_twitter <- countCorpusWords(clean_Tweets)

word_counts <- c(wc_blogs, wc_twitter, wc_news)
line_counts <- c(length(clean_Blogs), length(clean_Tweets), length(clean_News))

# building the summary table (the first column is named explicitly so the
# header is not derived from the deparsed c(...) call)
summary_table <- data.frame(
  source = c("blogs", "twitter", "news"),
  line_counts,
  word_counts
)
summary_table
# NOTE(review): the table below was recorded before the word counts were
# restricted to document text and before the first column was named; re-run
# to refresh it
##   c..blogs....twitter....news.. line_counts word_counts
## 1                         blogs         950       58964
## 2                       twitter         950       45054
## 3                          news         950       56303


The first line of each data source shows that the punctuation, numbers, stop words, and extra white space were removed. Also, there are no contractions or uppercase letters. Now, I can view the top words.

View Top Words

I want to view the most frequent words in each data source. This will plot the 20 most used words.

# blogs: term-document matrix -> dense matrix -> term totals, largest first
blogsTDM <- TermDocumentMatrix(clean_Blogs)
blogsMatrix <- as.matrix(blogsTDM)
blogsFreq <- sort(rowSums(blogsMatrix), decreasing = TRUE)
# tweets: same three steps
tweetsTDM <- TermDocumentMatrix(clean_Tweets)
tweetsMatrix <- as.matrix(tweetsTDM)
tweetsFreq <- sort(rowSums(tweetsMatrix), decreasing = TRUE)
# news: same three steps
newsTDM <- TermDocumentMatrix(clean_News)
newsMatrix <- as.matrix(newsTDM)
newsFreq <- sort(rowSums(newsMatrix), decreasing = TRUE)
# draw the 20 most frequent terms of every source on a 2 x 2 grid
par(mfrow=c(2, 2))
barplot(blogsFreq[seq_len(20)], col = "hot pink", las = 2,
        main = "Top 20 Blog Words")
barplot(tweetsFreq[seq_len(20)], col = "hot pink", las = 2,
        main = "Top 20 Tweet Words")
barplot(newsFreq[seq_len(20)], col = "hot pink", las = 2,
        main = "Top 20 News Words")


The top words in each source show some interesting characteristics. First, the sources share a few of the same words, such as just and first. Each source also has some distinctive top words, such as percent, never, and black; the top words in news are said, will, and just.

View Top Two-Word Combinations

This will look at the top two-word combinations from each data source. The results will be in wordclouds to view which two-word combinations are more frequent than others, based on their size and color.

View Top Two-Word Combinations from All Data Sources Combined

Now, I’m going to view the top two-word combinations when all data sources are combined. The results will be put in wordclouds, and the highest frequency will be determined by size and color.

# create a 2 word tokenizer
library(rJava)
library(RWeka)
# Split a document into two-word tokens (bigrams) with the RWeka tokenizer.
textTokenizer <- function(x) {
  bigramControl <- Weka_control(min = 2, max = 2)
  NGramTokenizer(x, bigramControl)
}
# apply the tokenizer to each data source
blogsDTM <- DocumentTermMatrix(
  clean_Blogs,
  control = list(tokenize = textTokenizer)
)
tweetsDTM <- DocumentTermMatrix(
  clean_Tweets,
  control = list(tokenize = textTokenizer)
)
newsDTM <- DocumentTermMatrix(
  clean_News,
  control = list(tokenize = textTokenizer)
)
# convert to matrix
blogsDTM_m <- as.matrix(blogsDTM)
tweetsDTM_m <- as.matrix(tweetsDTM)
newsDTM_m <- as.matrix(newsDTM)
# combine the above matrices to create one big matrix
allData_m <- cbind(blogsDTM_m, tweetsDTM_m)
allData_m <- cbind(allData_m, newsDTM_m)
# calculate the column sums of the matrices
allDataFrequency <- colSums(allData_m)
# cbind() keeps one column per source for a two-word combination that occurs
# in more than one source, so add the totals of identically named columns
# together to get one combined frequency per combination
allDataFrequency <- vapply(
  split(allDataFrequency, names(allDataFrequency)),
  sum,
  numeric(1)
)
# get the two-word combinations that the frequencies belong to
allDataBi <- names(allDataFrequency)
# create 2-word wordclouds
wordcloud(allDataBi, allDataFrequency, max.words = 20, colors = brewer.pal(4, "PiYG"))


The wordcloud shows that the highest frequency for all data sources combined is last year followed by new jersey and st louis.

View Top Three-Word Combinations of all data sources

Now, I’m going to view the top three-word combinations when all data sources are combined. The results will be put in wordclouds, and the highest frequency will be determined by size and color. I’m going to suppress the actual code; however, it follows the exact code from above, changing only the min/max numbers in the tokenizer to 3.

The wordcloud shows that the highest-frequency three-word combinations for all data sources combined are new york city and plus b spc. I will have to deal with the “a hat” (â) result, as it needs to be fixed.