First, let's load the datasets and set up the environment.
library(ggplot2)
library(NLP)
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(tm)
library(quanteda)
## Package version: 1.5.1
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
blog_con <- file("en_US/en_US.blogs.txt", "r")
blog_us <-readLines(blog_con) # Read the whole thing
close(blog_con) ## It's important to close the connection when you are done
twt_con <- file("en_US/en_US.twitter.txt", "r")
twt_us <-readLines(twt_con) # Read the whole thing
close(twt_con) ## It's important to close the connection when you are done
news_con <- file("en_US/en_US.news.txt", "r")
news_us <-readLines(news_con) # Read the whole thing
close(news_con) ## It's important to close the connection when you are done
curse_con <- file("cursewords.txt", "r")
cursewords <-readLines(curse_con) # Read the whole thing
close(curse_con) ## It's important to close the connection when you are done
## Check the line counts
summary(blog_us)
## Length Class Mode
## 899288 character character
summary(twt_us)
## Length Class Mode
## 2360148 character character
summary(news_us)
## Length Class Mode
## 77259 character character
## Check the word counts
sum(sapply(strsplit(blog_us, " "), length))
## [1] 37334131
sum(sapply(strsplit(twt_us, " "), length))
## [1] 30373543
sum(sapply(strsplit(news_us, " "), length))
## [1] 2643969
From what we can see, the Twitter dataset has the largest number of lines and words. The full data is massive, so let's sample a portion of each dataset for efficiency and runtime purposes.
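The sampling chunk itself is not shown here. Below is a minimal sketch that is consistent with the line counts that follow (roughly a 1% random sample of each source); the seed and the ceiling() rounding are assumptions, not the original code.
set.seed(1234) # hypothetical seed for reproducibility
new_blog <- sample(blog_us, ceiling(length(blog_us) * 0.01)) # ~1% of blog lines
new_twt <- sample(twt_us, ceiling(length(twt_us) * 0.01)) # ~1% of tweets
new_news <- sample(news_us, ceiling(length(news_us) * 0.01)) # ~1% of news lines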
## Check the line counts
summary(new_blog)
## Length Class Mode
## 8993 character character
summary(new_twt)
## Length Class Mode
## 23602 character character
summary(new_news)
## Length Class Mode
## 773 character character
## Check the word counts
sum(sapply(strsplit(new_blog, " "), length))
## [1] 372659
sum(sapply(strsplit(new_twt, " "), length))
## [1] 304839
sum(sapply(strsplit(new_news, " "), length))
## [1] 26055
## Aggregate the sampled data into a single list
list.set <- list(twitter = new_twt, blog = new_blog, news = new_news)
# Create corpus and clean the data
corpus <- VCorpus(VectorSource(list.set))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
corpus <- tm_map(corpus, content_transformer(tolower)) # wrap base tolower so documents stay PlainTextDocuments; no final PlainTextDocument re-wrap needed
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removeWords, cursewords)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
Let's check whether any of the top unigrams are found in all three text sources.
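The per-source frequency tables used below (most_twitter, most_blog, most_news) are not built in the code shown above. Here is a minimal sketch of how they could be derived from the cleaned corpus, assuming a tm TermDocumentMatrix whose columns follow the list.set order (twitter, blog, news); the top_words() helper and the cutoff of 30 terms are assumptions.
tdm <- TermDocumentMatrix(corpus) # terms x 3 documents
freq <- as.matrix(tdm) # manageable size for a 1% sample
top_words <- function(counts, n = 30) { # hypothetical helper: n most frequent terms
  counts <- sort(counts, decreasing = TRUE)[1:n]
  data.frame(word = names(counts), freq = unname(counts))
}
most_twitter <- top_words(freq[, 1])
most_blog <- top_words(freq[, 2])
most_news <- top_words(freq[, 3])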
## Get the intersect of the 3
most_all <- data.frame(word = Reduce(intersect, list(most_blog$word, most_news$word, most_twitter$word)))
most_all <- arrange(most_all, word)
most_all
## word
## 1 back
## 2 can
## 3 get
## 4 going
## 5 good
## 6 just
## 7 like
## 8 make
## 9 new
## 10 now
## 11 one
## 12 people
## 13 time
There are 13 words found in all three text sources.
Now let's check which bigrams and trigrams have the highest frequency.
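One way to compute these counts with quanteda (already loaded above) is tokens_ngrams() on the sampled text. This is a sketch rather than the code used in the original analysis; the cleaning options are chosen to mirror the tm steps above.
toks <- tokens(c(new_twt, new_blog, new_news),
               remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_remove(toks, c(stopwords("en"), cursewords)) # drop stop words and profanity
bigram_dfm <- dfm(tokens_ngrams(toks, n = 2))
trigram_dfm <- dfm(tokens_ngrams(toks, n = 3))
topfeatures(bigram_dfm, 10) # ten most frequent bigrams
topfeatures(trigram_dfm, 10) # ten most frequent trigrams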