Task 2 - Exploratory Data Analysis

The first step in building a predictive model for text is understanding the distribution of, and relationships among, the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships observed in the data and to prepare for building the first linguistic models.

Loading Required Libraries

library(ggplot2)
library(NLP)
library(tm)
library(ngram)
library(RColorBrewer)
library(corpus)
library(stringi)
library(wordcloud)
library(RWeka)
options(mc.cores=1)#Use a single core; the RWeka tokenizers can fail when tm runs in parallel

Sizes of the given data sets

#File sizes in megabytes (bytes/1024/1024)
bdata<-(file.info("C:/Users/nirma/Documents/Coursera/Data Science/Data Science Capstone/final/en_US/en_US.blogs.txt")$size)/1024/1024
tdata<-(file.info("C:/Users/nirma/Documents/Coursera/Data Science/Data Science Capstone/final/en_US/en_US.twitter.txt")$size)/1024/1024
ndata<-(file.info("C:/Users/nirma/Documents/Coursera/Data Science/Data Science Capstone/final/en_US/en_US.news.txt")$size)/1024/1024

sprintf("The en_US.blogs.txt file is: %s Megabytes", bdata)
## [1] "The en_US.blogs.txt file is: 200.424207687378 Megabytes"
sprintf("The en_US.twitter.txt file is: %s Megabytes", tdata)
## [1] "The en_US.twitter.txt file is: 159.364068984985 Megabytes"
sprintf("The en_US.news.txt file is: %s Megabytes", ndata)
## [1] "The en_US.news.txt file is: 196.277512550354 Megabytes"

Number of lines in the given data sets

#Counting lines by reading each file in 64 KB binary chunks and counting newline (0x0A) bytes
twitterdata<-file("C:/Users/nirma/Documents/Coursera/Data Science/Data Science Capstone/final/en_US/en_US.twitter.txt", open="rb")
total_lines<- 0L 
while(length(chunk<-readBin(twitterdata,"raw", 65536))>0){
  total_lines<-total_lines+sum(chunk==as.raw(10L))
}
close(twitterdata)

blogsdata<-file("C:/Users/nirma/Documents/Coursera/Data Science/Data Science Capstone/final/en_US/en_US.blogs.txt", open="rb")
total_lines0<-0L
while(length(chunk<-readBin(blogsdata,"raw", 65536))>0){
  total_lines0<-total_lines0+sum(chunk==as.raw(10L))
}
close(blogsdata)

newsdata<-file("C:/Users/nirma/Documents/Coursera/Data Science/Data Science Capstone/final/en_US/en_US.news.txt", open="rb")
total_lines1<-0L
while(length(chunk<-readBin(newsdata,"raw", 65536))>0){
  total_lines1<-total_lines1+sum(chunk==as.raw(10L))
}
close(newsdata)

sprintf("The en_US.twitter.txt file has: %s Lines", total_lines)
## [1] "The en_US.twitter.txt file has: 2360148 Lines"
sprintf("The en_US.blogs.txt file has: %s Lines", total_lines0)
## [1] "The en_US.blogs.txt file has: 899288 Lines"
sprintf("The en_US.news.txt file has: %s Lines", total_lines1)
## [1] "The en_US.news.txt file has: 1010242 Lines"

Measuring the length of the longest line seen in any of the three en_US data sets

blogsdata<-file("C:/Users/nirma/Documents/Coursera/Data Science/Data Science Capstone/final/en_US/en_US.blogs.txt", open="rb")
twitterdata<-file("C:/Users/nirma/Documents/Coursera/Data Science/Data Science Capstone/final/en_US/en_US.twitter.txt", open="rb")
newsdata<-file("C:/Users/nirma/Documents/Coursera/Data Science/Data Science Capstone/final/en_US/en_US.news.txt", open="rb")
#Reading lines and measuring line lengths; element 6 of summary() is the maximum
blogsdata_lines<-readLines(blogsdata, warn=FALSE, encoding="UTF-8")
close(blogsdata)
blogsdata_L<-summary(nchar(blogsdata_lines))[6]
blogsdata_L
##  Max. 
## 40833
twitterdata_lines<-readLines(twitterdata, warn=FALSE, encoding="UTF-8")
close(twitterdata)
twitterdata_L<-summary(nchar(twitterdata_lines))[6]
twitterdata_L
## Max. 
##  140
newsdata_lines<-readLines(newsdata, warn=FALSE, encoding="UTF-8")
close(newsdata)
newsdata_L<-summary(nchar(newsdata_lines))[6]
newsdata_L
##  Max. 
## 11384

Tasks to accomplish

Taking random samples of the data for easier analysis

set.seed(0916)
#Sampling 5% of the lines from each source
sample_blog<-sample(blogsdata_lines, length(blogsdata_lines)*0.05)
sample_twitter<-sample(twitterdata_lines, length(twitterdata_lines)*0.05)
sample_news<-sample(newsdata_lines, length(newsdata_lines)*0.05)

#Combining the samples and checking total length
sample_together<-c(sample_blog, sample_twitter, sample_news)
sample_together<-iconv(sample_together, "UTF-8", "ASCII", sub="")#Drops non-ASCII characters
length(sample_together)
## [1] 213483
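
A quick way to size up the 5% samples before building the corpus is to count lines and approximate words per source. The sketch below uses stri_count_words() from stringi (loaded above); the sample_summary name is only illustrative.

#Approximate line and word counts of each 5% sample
sample_summary <- data.frame(
  source = c("blogs", "twitter", "news"),
  lines  = c(length(sample_blog), length(sample_twitter), length(sample_news)),
  words  = c(sum(stri_count_words(sample_blog)),
             sum(stri_count_words(sample_twitter)),
             sum(stri_count_words(sample_news)))
)
sample_summary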

Exploratory Analysis - Checking the sample

corpus<-VCorpus(VectorSource(sample_together))
corpus<-tm_map(corpus, content_transformer(stripWhitespace))#Removes multiple white spaces between words
corpus<-tm_map(corpus, content_transformer(tolower))#Converts texts or tokens to lower(or upper) case 
corpus<-tm_map(corpus, content_transformer(removePunctuation))#Removes punctuation marks
corpus<-tm_map(corpus, content_transformer(removeNumbers))#Removes Numbers
#The documents created by VectorSource are already PlainTextDocuments, so no explicit conversion is needed
corpus<-tm_map(corpus, removeWords, stopwords("english"))#Removes common English stop words

  1. Perform a thorough exploratory analysis of the data, understanding the distribution of words and relationship between the words in the corpora.

Getting Rid of Punctuation Marks and Functional Words and Creating an N-Gram Database for the Given Data Sets

#Checking the most frequent one-, two-, three-, and four-word combinations in the combined data set
# A. UniGram
unigram<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
tdm_uni<-TermDocumentMatrix(corpus,control=list(tokenize=unigram))
mostFreq_uni<-findFreqTerms(tdm_uni, lowfreq=40)

# B. BiGrams
bigram<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
tdm_bi<-TermDocumentMatrix(corpus,control=list(tokenize=bigram))
mostFreq_bi<-findFreqTerms(tdm_bi, lowfreq=30)

# C. TriGrams
trigram<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
tdm_tri<-TermDocumentMatrix(corpus,control=list(tokenize=trigram))
mostFreq_tri<-findFreqTerms(tdm_tri, lowfreq=20)

# D. QuadGrams
quadgram<-function(x) NGramTokenizer(x,Weka_control(min=4,max=4))
tdm_quad<-TermDocumentMatrix(corpus,control=list(tokenize=quadgram))
mostFreq_quad<-findFreqTerms(tdm_quad, lowfreq=15)
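
findFreqTerms() returns only the terms whose overall frequency meets the lowfreq bound, not their counts; a quick peek at what was picked up:

head(mostFreq_uni, 20)#Most frequent single words
head(mostFreq_bi, 20)#Most frequent two-word combinations
head(mostFreq_tri, 20)#Most frequent three-word combinations
head(mostFreq_quad, 20)#Most frequent four-word combinations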

A.2. Creating a Bar Diagram
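
The plot below uses a data frame freq_uni with word and frequency columns that is not constructed in this excerpt. A minimal sketch of how it could be built from tdm_uni above, using row_sums() from the slam package (tm's sparse-matrix back end) so the full term-document matrix never has to be densified:

library(slam)#row_sums() works directly on a TermDocumentMatrix
uni_counts<-sort(row_sums(tdm_uni), decreasing=TRUE)#Total count of each unigram across the sample
freq_uni<-data.frame(word=names(uni_counts), frequency=uni_counts, row.names=NULL)
head(freq_uni, 10)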

ggplot(freq_uni[1:30,], aes(factor(word, levels=unique(word)), frequency))+
  geom_bar(stat="identity")+
  theme(axis.text.x=element_text(angle=85))+
  xlab("High Frequency Word List-30")+
  ylab("Frequency")

  2. Understand frequencies of words and word pairs - build figures and tables to understand variation in the frequencies of words and word pairs in the data.

Creating Frequency and Order of the Selected Bi-grams

bi_blog_freq <- rowSums(as.matrix(BiGramBlogs))
bi_blog_order <-
bi_news_freq <-
bi_news_order <-
bi_twitter_freq <-
bi_twitter_order <-

Creating Word Cloud of Bi-grams of the Selected Sample Sizes
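
The per-source bi-gram matrices (BiGramBlogs and its news and Twitter counterparts) are not built in this excerpt and the remaining assignments above are left incomplete, so the following is only a minimal sketch of how the frequency/order step and the bi-gram word cloud could look for the blogs sample, assuming BiGramBlogs is a bi-gram TermDocumentMatrix built on a blogs-only corpus in the same way tdm_bi was built on the combined sample; the news and Twitter samples would follow the same pattern.

#Assumes BiGramBlogs is a bi-gram TermDocumentMatrix for the blogs sample (not shown above)
bi_blog_freq<-rowSums(as.matrix(BiGramBlogs))#For a large matrix, slam::row_sums(BiGramBlogs) avoids densifying
bi_blog_order<-bi_blog_freq[order(bi_blog_freq, decreasing=TRUE)]#Sorted, most frequent first

#Word cloud of the most frequent bi-grams (wordcloud and RColorBrewer are loaded above)
wordcloud(words=names(bi_blog_order), freq=bi_blog_order, max.words=50,
          random.order=FALSE, colors=brewer.pal(8, "Dark2"), scale=c(3, 0.5))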