Objective

This is the week 2 milestone report for the Data Science Capstone project, an N-Gram Predictor. For this report, a subset of the data was created by randomly picking lines from the text corpus. Data cleaning operations were performed and a basic exploratory data analysis was done in an attempt to understand the data. The steps taken to accomplish this are as follows.

library(tm)
library(ggplot2)
library(SnowballC)
library(ctv)
library(textclean)
library(wordcloud)
library(RColorBrewer)
library(stringi)
library(textclean)
library(RWeka)

Read the necessary data into VCorpus

The large data set has already been reduced to a smaller one by picking 1000 random lines from the original data set.

# Load every file found in the "subset/" directory into a volatile corpus,
# one document per file.
corpus_source <- DirSource("subset/")
data <- VCorpus(corpus_source, readerControl = list(language = "en"))
#print(summary(data))
# Show the per-document character counts of the raw corpus.
print(inspect(data))
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 2306705
## 
## [[2]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 2051558
## 
## [[3]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 695735
## 
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3

Fetching the profane words list from the url

Fetching the dictionary of profane words in order to filter them from the occurrences.

# Read the profane-word list from the remote text file, line by line.
con <- url('http://www.cs.cmu.edu/~biglou/resources/bad-words.txt', 'r')
# The `finally` handler closes the connection on success and on failure, so an
# interrupted or failed read does not leave an open connection behind.
profaneWords <- tryCatch(readLines(con), finally = close(con))

Cleaning the data and making it ready for visualization

The transformation applies the following steps -
1. Convert text to lower case
2. Remove URLs
3. Remove Email Addresses
4. Remove Numbers
5. Remove apostrophes
6. Remove Punctuation
7. Remove profane words
8. Strip extra white spaces

The corpus contains many words from foreign languages. They need to be removed if we are to build a model for English text. The approach taken to filter out foreign-language words is to remove the non-ASCII characters from the corpus. Along with this, URLs and email addresses are also removed.

# Remove http, https and ftp URLs from a character vector.
#
# x: character vector of text.
# Returns x with every URL (and one optional leading space) deleted.
#
# The URL body is matched as a run of non-whitespace characters. The previous
# greedy ".*" ran on to the last ".xyz" in the string, deleting ordinary text
# that followed the URL and leaving the URL's path behind.
removeURL <- function(x) {
  gsub(" ?(f|ht)tp(s?)://[^[:space:]]+", "", x)
}
# Remove e-mail addresses from a character vector.
#
# x: character vector of text.
# Returns x with every address of the form local@domain.tld deleted.
#
# The pattern is not anchored, so addresses embedded in longer text are
# removed too; the previous "^...$" form only matched when the whole string was
# a single address. The "-" sits last inside each bracket expression so it is
# a literal hyphen rather than a character range.
removeEmail <- function(x) {
  gsub("[[:alnum:]._%+-]+@[[:alnum:]-]+(\\.[[:alnum:]-]+)+", "", x)
}
# Delete every straight apostrophe from the text, joining the parts of a
# contraction ("don't" becomes "dont").
removeapos <- function(x) {
  gsub("'", "", x, fixed = TRUE)
}
# Convert the text from latin1 to ASCII; characters that cannot be represented
# in ASCII are replaced by the empty string, i.e. dropped.
removeNonEnglish <- function(x) {
  iconv(x, from = "latin1", to = "ASCII", sub = "")
}
# Thin wrapper that lower-cases a character vector.
tolowercase <- function(x) {
  tolower(x)
}


# Clean the corpus step by step; each tm_map() call rewrites every document.
data <- tm_map(data, content_transformer(tolower))
# URLs and e-mail addresses are removed while their punctuation is intact.
data <- tm_map(data, content_transformer(removeURL))
data <- tm_map(data, content_transformer(removeEmail))
# Stop words are removed before apostrophes and punctuation are stripped:
# the English stop-word list includes contractions written with an apostrophe
# (e.g. "don't", "i'm"), which can no longer match once the text reads "dont".
data <- tm_map(data, removeWords, stopwords('en'))
data <- tm_map(data, content_transformer(removeNumbers))
data <- tm_map(data, content_transformer(removeapos))
data <- tm_map(data, content_transformer(removePunctuation))
# Drop non-ASCII characters to filter out foreign-language text.
data <- tm_map(data, content_transformer(removeNonEnglish))
data <- tm_map(data, removeWords, profaneWords)
# Collapse the gaps left behind by the removals above.
data <- tm_map(data, stripWhitespace)

Size of the corpus after cleaning the text

# Per-document overview of the cleaned corpus.
print(x = summary(object = data))
##              Length Class             Mode
## sm_blogs.txt 2      PlainTextDocument list
## sm_nws.txt   2      PlainTextDocument list
## sm_twt.txt   2      PlainTextDocument list
# Show the character count of each cleaned document.
print(x = inspect(x = data))
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 321363
## 
## [[2]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 17374
## 
## [[3]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 219186
## 
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3

Create structured data from the processed text

# Term-document matrix: one row per term, one column per document.
dtm <- TermDocumentMatrix(x = data)
# Dense copy, used for the row sums computed later.
dt_matrix <- as.matrix(dtm)
# Number of distinct terms and number of documents.
print(dim(dt_matrix))
## [1] 17933     3

Exploratory Data Analysis

Visualizations and Statistics after Cleaning the Data Set

1 Gram

# Total frequency of every term across the documents, highest first.
tmp <- rowSums(dt_matrix)
tmp <- sort(tmp, decreasing = TRUE)
# Keep the 100 most frequent terms and put them in a words/freq table.
top <- tmp[seq_len(min(100L, length(tmp)))]
words <- names(top)
df <- data.frame(words = words, freq = top)
# Show the ten most frequent single words.
print(df[seq_len(min(10L, nrow(df))), , drop = FALSE])
##      words freq
## just  just  563
## like  like  509
## one    one  476
## will  will  465
## can    can  410
## get    get  391
## good  good  360
## time  time  351
## dont  dont  327
## now    now  314

2 Gram

# Tokenizer that yields only two-word sequences (n-gram size fixed at 2).
bigrams <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
# Term-document matrix whose terms are bigrams.
dt_matrix_two <- TermDocumentMatrix(data, control = list(tokenize = bigrams))
word_matrix_two <- as.data.frame(as.matrix(dt_matrix_two))
# Total count of each bigram across the documents, most frequent first.
v2 <- rowSums(word_matrix_two)
v2 <- sort(v2, decreasing = TRUE)
d2 <- data.frame(word = names(v2), freq = v2)
# Show the ten most frequent bigrams.
print(head(d2, n = 10))
##                            word freq
## cant wait             cant wait   42
## right now             right now   41
## im going               im going   36
## last night           last night   34
## dont know             dont know   33
## looking forward looking forward   28
## first time           first time   25
## make sure             make sure   25
## dont want             dont want   24
## im sure                 im sure   23

3 Gram

# Tokenizer that yields only three-word sequences (n-gram size fixed at 3).
trigrams <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}
# Term-document matrix whose terms are trigrams.
dt_matrix_three <- TermDocumentMatrix(data, control = list(tokenize = trigrams))
word_matrix_three <- as.data.frame(as.matrix(dt_matrix_three))
# Total count of each trigram across the documents, most frequent first.
v3 <- rowSums(word_matrix_three)
v3 <- sort(v3, decreasing = TRUE)
d3 <- data.frame(word = names(v3), freq = v3)
# Show the ten most frequent trigrams.
print(head(d3, n = 10))
##                                          word freq
## happy mothers day           happy mothers day   11
## renault laguna rt           renault laguna rt    8
## cant wait see                   cant wait see    6
## couple years ago             couple years ago    6
## dakota indian artifact dakota indian artifact    5
## first time ever               first time ever    5
## happy new year                 happy new year    5
## let us know                       let us know    5
## north dakota indian       north dakota indian    5
## amazon services llc       amazon services llc    4

Visualizations

Word clouds

Word cloud for 1-gram

# Nine-step "Blues" palette without its two lightest shades.
pal <- brewer.pal(n = 9, name = "Blues")[-(1:2)]
# Cloud of the single words that occur at least 50 times.
wordcloud(
  words = df$words,
  freq = df$freq,
  min.freq = 50,
  scale = c(3, .5),
  rot.per = 0.5,
  random.order = TRUE,
  colors = pal
)

Word cloud for 2-gram

# Nine-step "PuBuGn" palette without its two lightest shades.
pal <- brewer.pal(n = 9, name = "PuBuGn")[-(1:2)]
# Cloud of the bigrams that occur at least 10 times.
wordcloud(
  words = d2$word,
  freq = d2$freq,
  min.freq = 10,
  scale = c(2, .5),
  rot.per = 0.5,
  random.order = TRUE,
  colors = pal
)

Word cloud for 3-gram

# Nine-step "BuGn" palette without its two lightest shades.
pal <- brewer.pal(n = 9, name = "BuGn")[-(1:2)]
# Cloud of the trigrams that occur at least 3 times.
wordcloud(
  words = d3$word,
  freq = d3$freq,
  min.freq = 3,
  scale = c(2, .5),
  rot.per = 0.5,
  random.order = TRUE,
  colors = pal
)

Bar plots for word frequencies

Bar plot for 1-gram

# Bar plot of the (up to) 100 most frequent single words.
# head() stops at the end of the vector, whereas tmp[1:100] would pad the plot
# with NA bars when fewer than 100 terms exist.
barplot(head(tmp, 100), xlab = "term", ylab = "frequency", col = heat.colors(50), las = 1)

Bar plots for word frequencies

Bar plot for 2-gram

# Bar plot of the (up to) 100 most frequent bigrams.
# head() stops at the end of the vector, whereas v2[1:100] would pad the plot
# with NA bars when fewer than 100 bigrams exist.
barplot(head(v2, 100), xlab = "term", ylab = "frequency", col = heat.colors(50))

Bar plots for word frequencies

Bar plot for 3-gram

# Bar plot of the (up to) 100 most frequent trigrams.
# head() stops at the end of the vector, whereas v3[1:100] would pad the plot
# with NA bars when fewer than 100 trigrams exist.
barplot(head(v3, 100), xlab = "term", ylab = "frequency", col = heat.colors(50))

Later Steps

  1. Build the model on the full data set and build 2-gram and 3-grams
  2. Optimize the algorithms for execution time and memory
  3. Build a data product (shiny app) that predicts the words as you type.