Introduction

The data set is a combination of three text files extracted from social media and news sources: Twitter posts, blog posts and news feeds. This report applies the knowledge gained from the Coursera Data Science Specialization to analyse the corpus, using natural language processing (NLP) to derive the most common words and word combinations. In this exercise, only a subset of the combined data (90,000 lines) is used to show that the approach works.
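As a rough sketch only (not part of the report's own code; it assumes the combined character vector allText that is built later in the loading step), a 90,000-line subset could be drawn like this:

set.seed(1234)                               # assumed seed, for a reproducible sample
sampleText <- sample(allText, size=90000)    # random 90,000-line subset of the combined data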

Data Acquisition

The data sets were downloaded from http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. Only the three English text files are used for the analysis, namely en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt. The data set is then loaded into R and cleaned to filter out unnecessary words.
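A minimal sketch of the download and extraction step (not part of the original run; the extraction folder may differ from the en_US/ path used later in the report):

url <- "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(url, destfile="Coursera-SwiftKey.zip", mode="wb")
unzip("Coursera-SwiftKey.zip", exdir=".")    # adjust the resulting path to match en_US/ if needed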

Loading libraries

library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(RWeka)  
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:NLP':
## 
##     annotate
library(lsa)
## Loading required package: SnowballC
library("qdapRegex") 
## 
## Attaching package: 'qdapRegex'
## 
## The following object is masked from 'package:ggplot2':
## 
##     %+%
library("slam")

Load & explore data

setwd("d:/ass")
blogs <- readLines("en_US/en_US.blogs.txt", encoding="UTF-8")
twitter <- readLines("en_US/en_US.twitter.txt", encoding="UTF-8")
## Warning in readLines("en_US/en_US.twitter.txt", encoding = "UTF-8"): line
## 167155 appears to contain an embedded nul
## Warning in readLines("en_US/en_US.twitter.txt", encoding = "UTF-8"): line
## 268547 appears to contain an embedded nul
## Warning in readLines("en_US/en_US.twitter.txt", encoding = "UTF-8"): line
## 1274086 appears to contain an embedded nul
## Warning in readLines("en_US/en_US.twitter.txt", encoding = "UTF-8"): line
## 1759032 appears to contain an embedded nul
con <- file("en_US/en_US.news.txt", open="rb")
news <- readLines(con, encoding="UTF-8")
close(con)
rm(con)
allText <- c(blogs,twitter,news)
summary(allText)
##    Length     Class      Mode 
##   4269678 character character
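A per-source breakdown gives a better sense of the data than summary() alone. The sketch below assumes the stringi package is available for word counting (it is not loaded in the original report):

data.frame(source=c("blogs", "twitter", "news"),
           lines=c(length(blogs), length(twitter), length(news)),
           words=c(sum(stringi::stri_count_words(blogs)),
                   sum(stringi::stri_count_words(twitter)),
                   sum(stringi::stri_count_words(news))))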

Generating Corpus from dataset

corpus <- VCorpus(VectorSource(allText))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
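The cleaning above stops at numbers, punctuation and case. If stop words and extra whitespace should also be filtered out, two further tm_map() steps could be added (a sketch, not part of the original pipeline):

corpus <- tm_map(corpus, stripWhitespace)                    # collapse repeated spaces
corpus <- tm_map(corpus, removeWords, stopwords("english"))  # drop common English stop words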

Derive some of the most frequent words

dtm <- TermDocumentMatrix(corpus)
dtm <- rollup(dtm, 2, na.rm=TRUE)
td.mat <- as.matrix(dtm)
findFreqTerms(dtm, lowfreq=5)
##  [1] "followback"           "followme"             "followmeplease"      
##  [4] "goodluck"             "goodmorning"          "happybirthday"       
##  [7] "happyeaster"          "happymothersday"      "happythanksgiving"   
## [10] "havefun"              "heywhatsup"           "iagree"              
## [13] "ido"                  "iloveyou"             "imissyou"            
## [16] "loveit"               "march"                "metoo"               
## [19] "nodoubt"              "noproblem"            "ofcourse"            
## [22] "shutup"               "sotrue"               "textme"              
## [25] "thanksforfollowing"   "thanksforthefollow"   "thanksforthemention" 
## [28] "thanksfortheretweet"  "thanksforthert"       "thanksfortheshoutout"
## [31] "thankssomuch"         "thankyou"             "thankyouforthefollow"
## [34] "welcometotwitter"     "youknowit"            "yourewelcome"        
## [37] "yourwelcome"
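Because the term-document matrix is large and sparse, overall term frequencies can also be obtained without densifying it, using row_sums() from the already-loaded slam package (a sketch; not run in the original report):

termFreq <- sort(row_sums(dtm), decreasing=TRUE)   # total frequency of each term
head(termFreq, 10)                                 # ten most frequent terms overall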

Creating word cloud from the most frequent words

wordcloud(corpus, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : thankyouforthemention could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : happyeastereveryone could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : loveitwhenyoutalkthattalktome could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : thanksfortherthaveagreatday could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : thanksforthetweet could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : thankyousomuch could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : thankyouverymuch could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : welcomehome could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : whatthehell could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : yesmaam could not be fit on page. It will not be
## plotted.
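An alternative is to build the cloud from a precomputed frequency table (termFreq from the sketch above) rather than the raw corpus, which avoids re-tokenising the text inside wordcloud():

wordcloud(words=names(termFreq), freq=termFreq, scale=c(4,0.5), max.words=100,
          random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))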

Use the RWeka package to generate n-grams

df <- data.frame(text=unlist(sapply(corpus, '[', "content")), stringsAsFactors=FALSE)
token_delim <- " \\t\\r\\n.!?,;\"()"
UnigramTokenizer <- NGramTokenizer(df, Weka_control(min=1, max=1))
BigramTokenizer  <- NGramTokenizer(df, Weka_control(min=2, max=2, delimiters=token_delim))
TrigramTokenizer <- NGramTokenizer(df, Weka_control(min=3, max=3, delimiters=token_delim))

unigramTable <- data.frame(table(UnigramTokenizer))
bigramTable  <- data.frame(table(BigramTokenizer))
trigramTable <- data.frame(table(TrigramTokenizer))

unigramTable <- unigramTable[order(unigramTable$Freq, decreasing=TRUE), ]
bigramTable  <- bigramTable[order(bigramTable$Freq, decreasing=TRUE), ]
trigramTable <- trigramTable[order(trigramTable$Freq, decreasing=TRUE), ]
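As a quick sanity check before plotting, the top rows of the sorted tables can be inspected:

head(unigramTable, 5)
head(bigramTable, 5)
head(trigramTable, 5)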

Plot the top 5 most frequent terms from each n-gram set

ggplot(unigramTable[1:5,], aes(x=reorder(UnigramTokenizer, -Freq, sum), y=Freq)) + geom_bar(stat="identity", fill="green") + ggtitle("Top 5 Unigrams") + geom_text(aes(label=Freq), vjust=-0.4)

ggplot(bigramTable[1:5,], aes(x=reorder(BigramTokenizer, -Freq, sum), y=Freq)) + geom_bar(stat="identity", fill="blue") + ggtitle("Top 5 Bigrams") + geom_text(aes(label=Freq), vjust=-0.4)

ggplot(trigramTable[1:5,], aes(x=reorder(TrigramTokenizer, -Freq, sum), y=Freq)) + geom_bar(stat="identity", fill="yellow") + ggtitle("Top 5 Trigrams") + geom_text(aes(label=Freq), vjust=-0.4)

Next Steps