Overview

The purpose of this report is to perform exploratory analysis of the text data set provided in the Capstone Project of the Coursera Data Science course. I apply NLP (Natural Language Processing) techniques to explore the data set and use it to build a text prediction model.

Data Source

The data for the Capstone Project is obtained from Coursera. It contains text in four different languages from three different sources: blogs, news articles, and Twitter.

For this project, I use only the English-language data.

setwd("~/Downloads/final/en_US")
source("max_len.R")
## Get blog data
blog <- readLines("en_US.blogs.txt")
## No. of records in blog data
length(blog)
## [1] 899288
## maximum length of a record
max_len(blog)
## [1] 40833
## Get news data
news <- readLines("en_US.news.txt")
## No. of records in news data
length(news)
## [1] 1010242
## maximum length of a record
max_len(news)
## [1] 11384
## Get twitter data
##twit <- readLines("en_US.twitter.txt")
## Hidden null (^@) characters were removed manually and the cleaned copy saved as en_US.twitter1.txt
twit <- readLines("en_US.twitter1.txt")
## No. of records in twitter data
length(twit)
## [1] 2360148
## maximum length of a record
max_len(twit)
## [1] 140
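
The helper max_len.R is not included here; a minimal sketch consistent with how it is used above (the function name comes from the report, the body is assumed) would be:

## Return the length, in characters, of the longest record in a character vector
max_len <- function(x) max(nchar(x))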

Data Cleanup and Preprocessing

Before processing the text further, I need to remove non-ASCII characters, links, and URLs; a sketch of the non-ASCII cleanup follows the substitutions below.

library(stringr)
## Cleaning up tweet data
twit <- str_replace_all(twit, "http\\S+", "")
twit <- str_replace_all(twit, "www\\S+", "")
twit <- str_replace_all(twit, "[;:?.>&#_!()<%~@^*$/=]", "")

## Cleaning up blog data
blog <- str_replace_all(blog, "http\\S+", "")
blog <- str_replace_all(blog, "www\\S+", "")
blog <- str_replace_all(blog, "[;:?.>&#_!()<%~@^*$/=]", "")

## Cleaning up news data
news <- str_replace_all(news, "http\\S+", "")
news <- str_replace_all(news, "www\\S+", "")
news <- str_replace_all(news, "[;:?.>&#_!()<%~@^*$/=]", "")
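
The substitutions above strip links and punctuation; the non-ASCII cleanup mentioned earlier could be done with base R's iconv, as in the following sketch (it assumes the files are UTF-8 encoded; sub = "" discards characters that cannot be represented in ASCII):

## Drop non-ASCII characters from each source
twit <- iconv(twit, from = "UTF-8", to = "ASCII", sub = "")
blog <- iconv(blog, from = "UTF-8", to = "ASCII", sub = "")
news <- iconv(news, from = "UTF-8", to = "ASCII", sub = "")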

I use rbinom to draw a random sample of roughly 1% of the lines from each source for the exploratory analysis and prediction modeling.

library(tm)

## Create random binomial variable to be used in data selection
xT <- c(rbinom(length(twit), 1, .01))
xB <- c(rbinom(length(blog), 1, .01))
xN <- c(rbinom(length(news), 1, .01))

## Create corpus for tweet
cTwit <- Corpus(VectorSource(twit[which(xT==1)]))
## Create corpus for blog
cBlog <- Corpus(VectorSource(blog[which(xB==1)]))
## Create corpus for news
cNews <- Corpus(VectorSource(news[which(xN==1)]))

The tm package is used to clean up the corpora, removing unnecessary words and characters so that a good-quality sample is available for prediction modeling.

library(tm)

## Read the list of profane words to be removed from the corpora
pfVector <- readLines("badwords.txt")

## Clean up the three corpora
## Remove punctuation
cTwit <- tm_map(cTwit, removePunctuation)
cBlog <- tm_map(cBlog, removePunctuation)
cNews <- tm_map(cNews, removePunctuation)
## Remove numbers
cTwit <- tm_map(cTwit, removeNumbers)
cBlog <- tm_map(cBlog, removeNumbers)
cNews <- tm_map(cNews, removeNumbers)
## Convert to lower case
cTwit <- tm_map(cTwit, content_transformer(tolower))
cBlog <- tm_map(cBlog, content_transformer(tolower))
cNews <- tm_map(cNews, content_transformer(tolower))
## Remove profane words
cTwit <- tm_map(cTwit, removeWords, pfVector)
cBlog <- tm_map(cBlog, removeWords, pfVector)
cNews <- tm_map(cNews, removeWords, pfVector)
## Remove stopwords
cTwit <- tm_map(cTwit, removeWords, stopwords("english"))
cBlog <- tm_map(cBlog, removeWords, stopwords("english"))
cNews <- tm_map(cNews, removeWords, stopwords("english"))
## Remove white spaces
cTwit <- tm_map(cTwit, stripWhitespace)
cBlog <- tm_map(cBlog, stripWhitespace)
cNews <- tm_map(cNews, stripWhitespace)
## Because content_transformer() was used for tolower above, the documents remain
## PlainTextDocuments and no explicit conversion back is needed here
## Stem corpus
library(SnowballC)
cTwit <- tm_map(cTwit, stemDocument)
cBlog <- tm_map(cBlog, stemDocument)
cNews <- tm_map(cNews, stemDocument)
library(wordcloud)
wordcloud(cTwit, scale=c(5,0.01), max.words=50, random.order=FALSE, rot.per=0.15, use.r.layout=FALSE, colors = brewer.pal(9, "Dark2"))

wordcloud(cBlog, scale=c(5,0.01), max.words=50, random.order=FALSE, rot.per=0.15, use.r.layout=FALSE, colors = brewer.pal(9, "Dark2"))

wordcloud(cNews, scale=c(5,0.01), max.words=99, random.order=FALSE, rot.per=0.15, use.r.layout=FALSE, colors = brewer.pal(9, "Dark2"))

The word cloud plots show the top few terms and give a rough idea of which terms appear most frequently in each corpus. For more statistical detail, we need to analyze the corpora with other NLP techniques, such as term-document matrix analysis.

library(RWeka)  ## provides the n-gram tokenizers used for the 2-gram and 3-gram analysis below
## Merge the three corpora together
cMerged <- c(cTwit, cBlog, cNews)
length(cMerged)
## [1] 212986
## Create term document matrix
tdm <- TermDocumentMatrix(cMerged)
tdm
## <<TermDocumentMatrix (terms: 113258, documents: 212986)>>
## Non-/sparse entries: 2557420/24119810968
## Sparsity           : 100%
## Maximal term length: 231
## Weighting          : term frequency (tf)
## Remove sparse terms
tdmDense <- removeSparseTerms(tdm, 0.999)
tdmDense
## <<TermDocumentMatrix (terms: 2047, documents: 212986)>>
## Non-/sparse entries: 1876666/434105676
## Sparsity           : 100%
## Maximal term length: 12
## Weighting          : term frequency (tf)
tdmD1 <- removeSparseTerms(tdmDense, 0.99)
tdmD1
## <<TermDocumentMatrix (terms: 159, documents: 212986)>>
## Non-/sparse entries: 706463/33158311
## Sparsity           : 98%
## Maximal term length: 7
## Weighting          : term frequency (tf)
findFreqTerms(tdmD1, lowfreq=100, highfreq = Inf)
##   [1] "also"    "alway"   "anoth"   "around"  "ask"     "away"    "back"   
##   [8] "best"    "better"  "big"     "book"    "busi"    "call"    "can"    
##  [15] "cant"    "chang"   "citi"    "come"    "day"     "didnt"   "differ" 
##  [22] "dont"    "end"     "even"    "ever"    "everi"   "everyon" "famili" 
##  [29] "feel"    "final"   "find"    "first"   "follow"  "found"   "free"   
##  [36] "friend"  "game"    "get"     "girl"    "give"    "good"    "got"    
##  [43] "great"   "guy"     "happen"  "happi"   "hard"    "head"    "help"   
##  [50] "high"    "home"    "hope"    "hour"    "hous"    "ill"     "includ" 
##  [57] "its"     "ive"     "job"     "just"    "keep"    "know"    "last"   
##  [64] "let"     "life"    "like"    "littl"   "live"    "lol"     "long"   
##  [71] "look"    "lot"     "love"    "made"    "make"    "man"     "mani"   
##  [78] "may"     "mean"    "meet"    "might"   "miss"    "month"   "morn"   
##  [85] "move"    "much"    "name"    "need"    "never"   "new"     "next"   
##  [92] "night"   "now"     "old"     "one"     "open"    "part"    "peopl"  
##  [99] "person"  "place"   "plan"    "play"    "point"   "post"    "put"    
## [106] "read"    "realli"  "right"   "run"     "said"    "say"     "school" 
## [113] "season"  "second"  "see"     "seem"    "set"     "show"    "sinc"   
## [120] "someth"  "start"   "state"   "still"   "stop"    "sure"    "take"   
## [127] "talk"    "team"    "tell"    "thank"   "that"    "the"     "there"  
## [134] "thing"   "think"   "though"  "thought" "three"   "time"    "today"  
## [141] "tonight" "tri"     "turn"    "two"     "use"     "wait"    "want"   
## [148] "watch"   "way"     "week"    "well"    "what"    "will"    "win"    
## [155] "work"    "world"   "year"    "you"     "your"
## Plot top 20 frequent terms
FreqTerm <- as.data.frame(rowSums(as.matrix(tdmD1)))
FreqTerm$words <- row.names(FreqTerm)
row.names(FreqTerm) <- NULL
colnames(FreqTerm)[1] <- 'totalFreq'
library(dplyr)
FreqTerm <- FreqTerm %>% arrange(desc(totalFreq))
FreqTerm20 <- FreqTerm[1:20,]
library(ggplot2)
ggplot(FreqTerm20, aes(x = reorder(words, -totalFreq), y = totalFreq)) + geom_col() + labs(x = "Word", y = "Frequency")

The term-document analysis shows that the matrix obtained from this corpus is very sparse. For this sample, I will do further analysis with n-gram tokens, which will be helpful in building an algorithm for next-word prediction. Under the Markov assumption, the probability of the next word depends only on the few most recent words rather than on the entire preceding history, so I will build 2-gram and 3-gram tokens and count how frequently word pairs and triples appear together.
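
As a sketch of how these tokens could be built with RWeka's NGramTokenizer on the merged corpus (note that recent versions of tm honor custom tokenizers only for a VCorpus, not a SimpleCorpus, so cMerged may need to be created with VCorpus()):

## Tokenizers for 2-grams and 3-grams
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
## Term-document matrices of 2-grams and 3-grams
tdm2 <- TermDocumentMatrix(cMerged, control = list(tokenize = bigramTokenizer))
tdm3 <- TermDocumentMatrix(cMerged, control = list(tokenize = trigramTokenizer))
## Inspect the most frequent 2-grams and 3-grams
findFreqTerms(tdm2, lowfreq = 50)
findFreqTerms(tdm3, lowfreq = 20)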

Conclusion

From this sample, I obtained a very sparse term-document matrix, and prediction based on single-term frequencies alone was not very accurate. A model based on n-gram frequencies was more helpful in predicting the next word during initial testing.
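
As a rough illustration of the n-gram approach, a bigram frequency table can back a simple next-word lookup; the predictNext helper below is hypothetical and assumes the 2-gram matrix tdm2 from the sketch above:

## Bigram counts, sorted so the most frequent bigrams come first
bigramFreq <- sort(rowSums(as.matrix(tdm2)), decreasing = TRUE)
## Return the most frequent word following the given word, or NA if none is found
predictNext <- function(word) {
  hits <- grep(paste0("^", word, " "), names(bigramFreq), value = TRUE)
  if (length(hits) == 0) return(NA_character_)
  strsplit(hits[1], " ")[[1]][2]
}
predictNext("last")  ## most frequent word following "last" in this sample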

Next steps: