This is the week 2 milestone report for the Data Science Capstone project, an N-gram predictor. For this report, a subset of the data was created by randomly picking lines from the text corpus. Data cleaning operations were performed and a basic exploratory data analysis was done in an attempt to understand the data. The steps taken to accomplish this are as follows.
library(tm)
library(ggplot2)
library(SnowballC)
library(ctv)
library(textclean)
library(wordcloud)
library(RColorBrewer)
library(stringi)
library(textclean)
library(RWeka)
The large data set has already been reduced to a smaller one by picking 1000 random lines from the original data set.
# Build a corpus from the files found in subset/, tagged as English text.
data <- VCorpus(DirSource("subset/"), readerControl = list(language='en'))
#print(summary(data))
# inspect() already prints the corpus and returns it invisibly; wrapping it in
# print() echoed the corpus header a second time at the end of the output.
inspect(data)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 2306705
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 2051558
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 695735
##
Fetch a list of profane words so that they can be filtered out of the corpus.
# Download the profanity word list, one entry per line.
con <- url('http://www.cs.cmu.edu/~biglou/resources/bad-words.txt', 'r')
# Close the connection even when the read fails, so a network error does not
# leave an open connection behind.
profaneWords <- tryCatch(readLines(con), finally = close(con))
The cleaning pipeline applies the following transformations, in this order:
1. Convert text to lower case
2. Remove URLs
3. Remove email addresses
4. Remove numbers
5. Remove apostrophes (replace them with nothing)
6. Remove punctuation
7. Remove non-ASCII characters
8. Remove English stop words
9. Remove profane words
10. Strip extra white space
The corpus contains many words from foreign languages. They need to be removed if we are to build a model for English text. The approach used to filter out foreign-language words is to remove the non-ASCII characters from the corpus. Along with this, URLs and email addresses are also removed.
# Remove http, https and ftp URLs (plus one preceding space, if any) from x.
# x: character vector. Returns a character vector of the same length.
# The URL is taken to end at the next whitespace character. The previous
# pattern used a greedy ".*" up to the last ".letters" on the line, which also
# deleted ordinary text that followed the URL.
removeURL <- function(x) {
  gsub(" ?(f|ht)tps?://[^[:space:]]+", "", x)
}
# Remove email addresses from x wherever they occur in the text.
# x: character vector. Returns a character vector of the same length.
# The previous pattern was anchored with ^ and $, so it only matched strings
# that consisted of nothing but an address, and its ".-_" was parsed as a
# character range rather than three literal characters. The hyphen is now
# placed last in each bracket expression so it is literal.
removeEmail <- function(x) {
  gsub("[[:alnum:]._-]+@[[:alnum:].-]+", "", x)
}
# Delete every straight apostrophe from x, joining the surrounding letters.
# x: character vector. Returns a character vector of the same length.
removeapos <- function(x) {
  gsub(pattern = "'", replacement = "", x = x)
}
# Convert x from latin1 to ASCII, substituting an empty string for anything
# that cannot be converted.
# x: character vector. Returns a character vector of the same length.
removeNonEnglish <- function(x) {
  iconv(x, from = "latin1", to = "ASCII", sub = "")
}
# Lower-case every element of x.
# x: character vector. Returns a character vector of the same length.
# Not referenced by the cleaning steps visible in this file, which call
# tolower() directly.
tolowercase <- function(x) {
  casefold(x, upper = FALSE)
}
# Content-level cleaning: each function is wrapped with content_transformer()
# and applied to the corpus strictly in the order listed.
data <- Reduce(
  function(corpus, step) tm_map(corpus, content_transformer(step)),
  list(tolower, removeURL, removeEmail, removeNumbers, removeapos,
       removePunctuation, removeNonEnglish),
  data
)
# Word-level filtering: English stop words first, then the profanity list.
# NOTE(review): apostrophes are stripped before this point, so stop words that
# contain an apostrophe presumably no longer match (the top-term output lists
# "dont") -- confirm whether this ordering is intended.
data <- Reduce(
  function(corpus, unwanted) tm_map(corpus, removeWords, unwanted),
  list(stopwords("en"), profaneWords),
  data
)
# Collapse the runs of white space left behind by the removals above.
data <- tm_map(data, stripWhitespace)
Size of the corpus after cleaning the text
# Per-document summary of the cleaned corpus.
print(summary(data))
## Length Class Mode
## sm_blogs.txt 2 PlainTextDocument list
## sm_nws.txt 2 PlainTextDocument list
## sm_twt.txt 2 PlainTextDocument list
# inspect() already prints the corpus and returns it invisibly; wrapping it in
# print() echoed the corpus header a second time at the end of the output.
inspect(data)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 321363
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 17374
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 219186
##
# Term-document matrix of the cleaned corpus, converted to a dense matrix,
# followed by its dimensions (rows, then columns).
dtm <- TermDocumentMatrix(data)
dt_matrix <- as.matrix(dtm)
print(c(nrow(dt_matrix), ncol(dt_matrix)))
## [1] 17933 3
# Row sums of the term-document matrix, sorted from largest to smallest.
tmp <- sort(rowSums(dt_matrix), decreasing = TRUE)
# Keep at most the first 100 entries and tabulate them as word/frequency pairs.
top <- tmp[seq_len(min(100, length(tmp)))]
words <- names(top)
df <- data.frame(words = words, freq = top)
# Show the first ten rows of the table.
print(head(df, n = 10))
## words freq
## just just 563
## like like 509
## one one 476
## will will 465
## can can 410
## get get 391
## good good 360
## time time 351
## dont dont 327
## now now 314
# Tokenizer that emits n-grams of exactly two words.
bigrams <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
# Term-document matrix built with the bigram tokenizer instead of the default.
dt_matrix_two <- TermDocumentMatrix(
  data,
  control = list(tokenize = bigrams)
)
word_matrix_two <- as.data.frame(as.matrix(dt_matrix_two))
# Row sums per bigram, largest first, tabulated as word/frequency pairs.
v2 <- sort(rowSums(word_matrix_two), decreasing = TRUE)
d2 <- data.frame(word = names(v2), freq = v2)
print(head(d2, n = 10))
## word freq
## cant wait cant wait 42
## right now right now 41
## im going im going 36
## last night last night 34
## dont know dont know 33
## looking forward looking forward 28
## first time first time 25
## make sure make sure 25
## dont want dont want 24
## im sure im sure 23
# Tokenizer that emits n-grams of exactly three words.
trigrams <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}
# Term-document matrix built with the trigram tokenizer instead of the default.
dt_matrix_three <- TermDocumentMatrix(
  data,
  control = list(tokenize = trigrams)
)
word_matrix_three <- as.data.frame(as.matrix(dt_matrix_three))
# Row sums per trigram, largest first, tabulated as word/frequency pairs.
v3 <- sort(rowSums(word_matrix_three), decreasing = TRUE)
d3 <- data.frame(word = names(v3), freq = v3)
print(head(d3, n = 10))
## word freq
## happy mothers day happy mothers day 11
## renault laguna rt renault laguna rt 8
## cant wait see cant wait see 6
## couple years ago couple years ago 6
## dakota indian artifact dakota indian artifact 5
## first time ever first time ever 5
## happy new year happy new year 5
## let us know let us know 5
## north dakota indian north dakota indian 5
## amazon services llc amazon services llc 4
# Word cloud of the unigram table (terms with frequency of at least 50),
# coloured with the 9-colour "Blues" palette minus its first two entries.
pal <- tail(brewer.pal(9, "Blues"), -2)
wordcloud(
  df$words,
  df$freq,
  min.freq = 50,
  scale = c(3, 0.5),
  rot.per = 0.5,
  random.order = TRUE,
  colors = pal
)
#### Word cloud for 2-grams
# Bigrams with frequency of at least 10, coloured with the 9-colour "PuBuGn"
# palette minus its first two entries.
pal <- tail(brewer.pal(9, "PuBuGn"), -2)
wordcloud(
  d2$word,
  d2$freq,
  min.freq = 10,
  scale = c(2, 0.5),
  rot.per = 0.5,
  random.order = TRUE,
  colors = pal
)
#### Word cloud for 3-grams
# Trigrams with frequency of at least 3, coloured with the 9-colour "BuGn"
# palette minus its first two entries.
pal <- tail(brewer.pal(9, "BuGn"), -2)
wordcloud(
  d3$word,
  d3$freq,
  min.freq = 3,
  scale = c(2, 0.5),
  rot.per = 0.5,
  random.order = TRUE,
  colors = pal
)
# Frequency bar charts for the first 100 entries of the sorted unigram,
# bigram and trigram frequency vectors (each is sorted in decreasing order
# where it is built).
# head() is used instead of x[1:100] so that a vector holding fewer than 100
# terms is plotted as-is rather than being padded with NA entries.
barplot(head(tmp, 100), xlab="term", ylab = "frequency", col=heat.colors(50), las=1)
barplot(head(v2, 100), xlab="term", ylab = "frequency", col=heat.colors(50))
barplot(head(v3, 100), xlab="term", ylab = "frequency", col=heat.colors(50))