Introduction

The data set is a combination of three text files extracted from social media and news sources: Twitter posts, blog posts and news feeds. This report applies the knowledge gained from the Coursera Data Science Specialization to analyse the corpus, using natural language processing (NLP) to derive the most common words and word combinations. In this exercise, only a subset of the combined data (90,000 lines) is used to show that the approach works.
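As a rough sketch only (not part of the report's own code; it assumes the combined character vector allText that is built later in the loading step), a 90,000-line subset could be drawn like this:

set.seed(1234)                               # assumed seed, for a reproducible sample
sampleText <- sample(allText, size=90000)    # random 90,000-line subset of the combined data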

Data Acquisition

The data sets were downloaded from http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. Only the three English text files are used for the analysis, namely en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt. The data set is then loaded into R and cleaned to filter out unnecessary words.
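A minimal sketch of the download and extraction step (not part of the original run; the extraction folder may differ from the en_US/ path used later in the report):

url <- "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(url, destfile="Coursera-SwiftKey.zip", mode="wb")
unzip("Coursera-SwiftKey.zip", exdir=".")    # adjust the resulting path to match en_US/ if needed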

Loading libraries

library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(RWeka)  
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:NLP':
## 
##     annotate
library(lsa)
## Loading required package: SnowballC
library("qdapRegex") 
## 
## Attaching package: 'qdapRegex'
## 
## The following object is masked from 'package:ggplot2':
## 
##     %+%
library("slam")

Load & explore data

setwd("d:/ass")
blogs <- readLines("en_US/en_US.blogs.txt", encoding="UTF-8")
twitter <- readLines("en_US/en_US.twitter.txt", encoding="UTF-8")
## Warning in readLines("en_US/en_US.twitter.txt", encoding = "UTF-8"): line
## 167155 appears to contain an embedded nul
## Warning in readLines("en_US/en_US.twitter.txt", encoding = "UTF-8"): line
## 268547 appears to contain an embedded nul
## Warning in readLines("en_US/en_US.twitter.txt", encoding = "UTF-8"): line
## 1274086 appears to contain an embedded nul
## Warning in readLines("en_US/en_US.twitter.txt", encoding = "UTF-8"): line
## 1759032 appears to contain an embedded nul
con <- file("en_US/en_US.news.txt", open="rb")
news <- readLines(con, encoding="UTF-8")
close(con)
rm(con)
allText <- c(blogs,twitter,news)
summary(allText)
##    Length     Class      Mode 
##   4269678 character character
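A per-source breakdown gives a better sense of the data than summary() alone. The sketch below assumes the stringi package is available for word counting (it is not loaded in the original report):

data.frame(source=c("blogs", "twitter", "news"),
           lines=c(length(blogs), length(twitter), length(news)),
           words=c(sum(stringi::stri_count_words(blogs)),
                   sum(stringi::stri_count_words(twitter)),
                   sum(stringi::stri_count_words(news))))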

Generating Corpus from dataset

corpus <- VCorpus(VectorSource(allText))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
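The cleaning above stops at numbers, punctuation and case. If stop words and extra whitespace should also be filtered out, two further tm_map() steps could be added (a sketch, not part of the original pipeline):

corpus <- tm_map(corpus, stripWhitespace)                    # collapse repeated spaces
corpus <- tm_map(corpus, removeWords, stopwords("english"))  # drop common English stop words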

Derive some of the most frequent words

dtm <- TermDocumentMatrix(corpus)
dtm <- rollup(dtm, 2, na.rm=TRUE)
td.mat <- as.matrix(dtm)
findFreqTerms(dtm, lowfreq=5)
##  [1] "followback"           "followme"             "followmeplease"      
##  [4] "goodluck"             "goodmorning"          "happybirthday"       
##  [7] "happyeaster"          "happymothersday"      "happythanksgiving"   
## [10] "havefun"              "heywhatsup"           "iagree"              
## [13] "ido"                  "iloveyou"             "imissyou"            
## [16] "loveit"               "march"                "metoo"               
## [19] "nodoubt"              "noproblem"            "ofcourse"            
## [22] "shutup"               "sotrue"               "textme"              
## [25] "thanksforfollowing"   "thanksforthefollow"   "thanksforthemention" 
## [28] "thanksfortheretweet"  "thanksforthert"       "thanksfortheshoutout"
## [31] "thankssomuch"         "thankyou"             "thankyouforthefollow"
## [34] "welcometotwitter"     "youknowit"            "yourewelcome"        
## [37] "yourwelcome"
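Because the term-document matrix is large and sparse, overall term frequencies can also be obtained without densifying it, using row_sums() from the already-loaded slam package (a sketch; not run in the original report):

termFreq <- sort(row_sums(dtm), decreasing=TRUE)   # total frequency of each term
head(termFreq, 10)                                 # ten most frequent terms overall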

Creating word cloud from the most frequent words

wordcloud(corpus, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : thankyouforthemention could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : happyeastereveryone could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : loveitwhenyoutalkthattalktome could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : thanksfortherthaveagreatday could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : thanksforthetweet could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : thankyousomuch could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : thankyouverymuch could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : welcomehome could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : whatthehell could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(corpus, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : yesmaam could not be fit on page. It will not be
## plotted.
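An alternative is to build the cloud from a precomputed frequency table (termFreq from the sketch above) rather than the raw corpus, which avoids re-tokenising the text inside wordcloud():

wordcloud(words=names(termFreq), freq=termFreq, scale=c(4,0.5), max.words=100,
          random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))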

Use the RWeka package to generate n-grams

df <- data.frame(text=unlist(sapply(corpus, '[', "content")), stringsAsFactors=FALSE)
token_delim <- " \\t\\r\\n.!?,;\"()"
UnigramTokenizer <- NGramTokenizer(df, Weka_control(min=1, max=1))
BigramTokenizer  <- NGramTokenizer(df, Weka_control(min=2, max=2, delimiters=token_delim))
TrigramTokenizer <- NGramTokenizer(df, Weka_control(min=3, max=3, delimiters=token_delim))

unigramTable <- data.frame(table(UnigramTokenizer))
bigramTable  <- data.frame(table(BigramTokenizer))
trigramTable <- data.frame(table(TrigramTokenizer))

unigramTable <- unigramTable[order(unigramTable$Freq, decreasing=TRUE), ]
bigramTable  <- bigramTable[order(bigramTable$Freq, decreasing=TRUE), ]
trigramTable <- trigramTable[order(trigramTable$Freq, decreasing=TRUE), ]
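As a quick sanity check before plotting, the top rows of the sorted tables can be inspected:

head(unigramTable, 5)
head(bigramTable, 5)
head(trigramTable, 5)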

Plot the top 5 most frequent terms from each n-gram set

ggplot(unigramTable[1:5,], aes(x=reorder(UnigramTokenizer, -Freq, sum), y=Freq)) + geom_bar(stat="identity", fill="green") + ggtitle("Top 5 Unigrams") + geom_text(aes(label=Freq), vjust=-0.4)

ggplot(bigramTable[1:5,], aes(x=reorder(BigramTokenizer, -Freq, sum), y=Freq)) + geom_bar(stat="identity", fill="blue") + ggtitle("Top 5 Bigrams") + geom_text(aes(label=Freq), vjust=-0.4)

ggplot(trigramTable[1:5,], aes(x=reorder(TrigramTokenizer, -Freq, sum), y=Freq)) + geom_bar(stat="identity", fill="yellow") + ggtitle("Top 5 Trigrams") + geom_text(aes(label=Freq), vjust=-0.4)

Next Steps