First we need to load the data from the files we downloaded earlier.
news_data <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
blogs_data <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
twitter_data <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
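Note that on some systems readLines() can stop early on en_US.news.txt because the file contains embedded control characters. If that happens, one possible workaround (a sketch, only needed in that case) is to read the file through a binary connection:
# Alternative read for the news file: binary mode plus skipNul avoids the early stop
con <- file("en_US.news.txt", open = "rb")
news_data <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)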
Here we can have a look at some basic features of the data.
library(stringi)
paste("Number of words for the blogs data : " , stri_stats_latex(blogs_data)[4])
## [1] "Number of words for the blogs data : 37570839"
paste("Number of words for the news data : " , stri_stats_latex(news_data)[4])
## [1] "Number of words for the news data : 2651432"
paste("Number of words for the twitter data : " , stri_stats_latex(twitter_data)[4])
## [1] "Number of words for the twitter data : 30451128"
paste("Length for the blogs data : " , length(blogs_data))
## [1] "Length for the blogs data : 899288"
paste("Length for the news data : " , length(news_data))
## [1] "Length for the news data : 77259"
paste("Length for the twitter data : " , length(twitter_data))
## [1] "Length for the twitter data : 2360148"
We can see from these numbers that the Twitter data has by far the most lines, while the blogs data contains the most words.
We also need to make sure the data is encoded consistently. An encoding is the way characters are stored as bytes, and it can vary by region and platform, so we convert everything to plain ASCII and drop any characters that cannot be represented.
blogs_data <- iconv(blogs_data, "latin1", "ASCII", sub = "")
news_data <- iconv(news_data, "latin1", "ASCII", sub = "")
twitter_data <- iconv(twitter_data, "latin1", "ASCII", sub = "")
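As a small illustration of what this conversion does (a toy example, not part of the pipeline), any character that cannot be represented in ASCII is simply dropped because sub = "":
# Toy example: the accented characters are removed, leaving plain ASCII
iconv("caf\xe9 na\xefve", from = "latin1", to = "ASCII", sub = "")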
The full data set is very large, so we will carry out the analysis on a small sample (0.5% of the lines from each source).
set.seed(123)
# Keep a random 0.5% of the lines from each source
blogs_data <- sample(blogs_data, floor(length(blogs_data) * 0.005))
news_data <- sample(news_data, floor(length(news_data) * 0.005))
twitter_data <- sample(twitter_data, floor(length(twitter_data) * 0.005))
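As a quick sanity check, we can look at how many lines each sample keeps:
# Number of lines kept in each 0.5% sample
c(blogs = length(blogs_data), news = length(news_data), twitter = length(twitter_data))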
Here we will build a corpus - our collection of text documents. We will then clean it by converting to lower case, removing numbers, stopwords, punctuation, and extra whitespace, and stemming the words.
library(tm)
## Loading required package: NLP
library(NLP)
corpus<-VCorpus(VectorSource(c(blogs_data,news_data,twitter_data)))
# Convert to lower case
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
# Remove common English stopwords
corpus <- tm_map(corpus, removeWords, stopwords('english'))
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
# Eliminate white spaces
corpus <- tm_map(corpus, stripWhitespace)
# Convert back to plain text documents
corpus <- tm_map(corpus, PlainTextDocument)
# Stem each word back to its root
library(SnowballC)
corpus <- tm_map(corpus, stemDocument)
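To verify that the cleaning worked as intended, we could peek at a few of the processed documents (an optional check):
# Inspect the first few cleaned documents
lapply(corpus[1:3], as.character)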
Now we can build n-grams, which are simply sequences of n consecutive words; for example, "thank you so much" contains the bigrams "thank you", "you so", and "so much". We will see which combinations show up the most.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(RWeka)
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
my_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm <- TermDocumentMatrix(corpus, control = list(tokenize = my_tokenizer))
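To make the tokenizer concrete, this is what it produces on a single sentence (a toy illustration only; the actual analysis uses the term-document matrix above):
# Toy illustration: the bigram tokenizer returns overlapping two-word sequences
my_tokenizer("the cat sat on the mat")
# For trigrams we would only change the bounds, e.g. Weka_control(min = 3, max = 3)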
# Keep only the bigrams that appear at least 50 times and sum their counts
termFreq <- rowSums(as.matrix(tdm[findFreqTerms(tdm, lowfreq = 50), ]))
termFreqVector <- as.list(termFreq)
dt <- data.frame(unlist(termFreqVector), stringsAsFactors = FALSE)
# Converting to a data.table is a bit of overkill here but scales well to larger data sets
setDT(dt, keep.rownames = TRUE)[]
## rn unlist.termFreqVector.
## 1: can get 50
## 2: feel like 70
## 3: last night 99
## 4: last year 52
## 5: look forward 72
## 6: look like 85
## 7: right now 136
## 8: thank follow 59
setnames(dt, 1, "term")
setnames(dt, 2, "freq")
ggplot(data=dt, aes(x=term, y=freq)) + geom_bar(stat="identity")
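With more terms the x-axis labels quickly become unreadable; one possible refinement (a sketch, not the plot produced above) is to order the bars by frequency and flip the coordinates:
# Optional refinement: sort bigrams by frequency and flip axes for readability
ggplot(data = dt, aes(x = reorder(term, freq), y = freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "bigram", y = "frequency")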