Introduction

The goal of this report is to describe the major features I have identified in the data and to briefly summarize my plans for building the prediction algorithm and the Shiny app for the Capstone Project.

Loading the Files and the Packages

library(dplyr)
library(ggplot2)
library(stringr)
library(stringi)
library(tm) 
library(NLP)
library(qdap)
library(RWeka)
library(ngram)

blog <- file("blog.txt", "r")
lines1<-readLines(blog)
close(blog)

twitter <- file("en_US.twitter.txt", "r") 
lines2<-readLines(twitter, skipNul = TRUE)
close(twitter)

news<- file("en_US.news.txt", "rb")
lines3<-readLines(news)
close(news)

lenblog<-length(lines1)
lentwitter<-length(lines2)
lennews<-length(lines3)

# word counts: number of whitespace-separated tokens per line
wblog<-sum(str_count(lines1,"\\S+"))
wtwitter<-sum(str_count(lines2,"\\S+"))
wnews<-sum(str_count(lines3,"\\S+"))

# file sizes in megabytes (file.info() returns NA when the path is not found)
size_blogs <- file.info("en_US/en_US.blogs.txt")$size / 1024^2
size_news <- file.info("en_US/en_US.news.txt")$size  / 1024^2
size_twitter <- file.info("en_US/en_US.twitter.txt")$size / 1024^2

tabled_data<-data.frame(file=c("en_US.blogs.txt", "en_US.twitter.txt", "en_US.news.txt"),
                  lines.count=c(lenblog,lentwitter,lennews),
                  word.count=c(wblog,wtwitter,wnews),
                  size=c(size_blogs,size_twitter,size_news)) # size in MB, ordered to match the file column
tabled_data
##                file lines.count word.count size
## 1   en_US.blogs.txt       77259   15683765   NA
## 2 en_US.twitter.txt     2360148  162385035   NA
## 3    en_US.news.txt     1010242  203791405   NA

Due to my laptop's memory limits, the analysis below is performed on a small random sample (0.5%) of each file rather than the full dataset.

set.seed(12345)
# convert to ASCII, dropping non-ASCII characters
blogs <-iconv(lines1,"latin1","ASCII",sub="")
twitter <-iconv(lines2,"latin1","ASCII",sub="")
news <-iconv(lines3,"latin1","ASCII",sub="")

# take a 0.5% random sample from each source
sample_data <-c(sample(blogs,length(blogs)*0.005),
                sample(news,length(news)*0.005),
                sample(twitter,length(twitter)*0.005))

Cleaning of the data

The cleaning process consists of the following steps:

- convert to lowercase (example: "Can" -> "can")
- replace contractions (example: "doesn't" -> "does not"); a sketch of this step follows the code below
- remove punctuation
- remove numbers and all non-letter characters
- handle apostrophes ("'")
- remove common stop words (example: "a", "in", "and")
- remove unnecessary whitespace

corpus <- VCorpus(VectorSource(sample_data))
corpus1 <- tm_map(corpus,removePunctuation)             # remove punctuation
corpus2 <- tm_map(corpus1,stripWhitespace)              # collapse extra whitespace
corpus3 <- tm_map(corpus2,content_transformer(tolower)) # convert to lowercase

corpus4 <- tm_map(corpus3,removeNumbers)                # remove numbers
corpus5 <- tm_map(corpus4,PlainTextDocument)
# remove English stop words (a, as, at, so, etc.)
corpus6 <- tm_map(corpus5,removeWords,stopwords("english")) 
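
The contraction and apostrophe handling listed above is not part of the tm transformations; a minimal sketch, assuming qdap::replace_contraction is applied to the raw sample before the corpus is built (sample_clean is an illustrative name, not used elsewhere in this report):

# hypothetical extra step: expand contractions in the raw sample first
sample_clean <- replace_contraction(sample_data)   # e.g. "doesn't" -> "does not"
corpus <- VCorpus(VectorSource(sample_clean))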

Tokenization

In NLP, an n-gram is a contiguous sequence of n items from a given sequence of text or speech. Unigrams are single words, bigrams are two-word combinations, and trigrams are three-word combinations.

The following functions are used to extract 1-grams, 2-grams, and 3-grams from the text corpus using RWeka.

one<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1)) # unigrams
two<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2)) # bigrams
thr<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3)) # trigrams
one_table<-TermDocumentMatrix(corpus6,control=list(tokenize=one))
two_table<-TermDocumentMatrix(corpus6,control=list(tokenize=two))
thr_table<-TermDocumentMatrix(corpus6,control=list(tokenize=thr))
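
As a quick illustration of what these tokenizers return, the bigram tokenizer can be applied to a short made-up sentence (the example text and expected output are illustrative only):

two("thanks for the follow")
# roughly: "thanks for" "for the" "the follow"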

# Find frequent terms in each of the three matrices (lowfreq is the minimum
# number of occurrences a term must have to be kept) and build data frames of their frequencies.

one_corpus<-findFreqTerms(one_table,lowfreq=1000)
two_corpus<-findFreqTerms(two_table,lowfreq=80)
thr_corpus<-findFreqTerms(thr_table,lowfreq=10)

one_corpus_num<-rowSums(as.matrix(one_table[one_corpus,]))
one_corpus_table<-data.frame(Word=names(one_corpus_num),frequency=one_corpus_num)
one_corpus_sort<-one_corpus_table[order(-one_corpus_table$frequency),]
head(one_corpus_sort)
##      Word frequency
## said said      1453
## will will      1040
## just just      1037
two_corpus_num<-rowSums(as.matrix(two_table[two_corpus,]))
two_corpus_table<-data.frame(Word=names(two_corpus_num),frequency=two_corpus_num)
two_corpus_sort<-two_corpus_table[order(-two_corpus_table$frequency),]
head(two_corpus_sort)
##                Word frequency
## right now right now       101
## last year last year        80
thr_corpus_num<-rowSums(as.matrix(thr_table[thr_corpus,]))
thr_corpus_table<-data.frame(Word=names(thr_corpus_num),frequency=thr_corpus_num)
thr_corpus_sort<-thr_corpus_table[order(-thr_corpus_table$frequency),]
head(thr_corpus_sort)
##                                Word frequency
## cant wait see         cant wait see        16
## happy mothers day happy mothers day        16
## let us know             let us know        11

Exploratory Analysis

We now look at the results of the analysis as frequency plots, to check whether the processed data are reasonable and can be used to fit a prediction model.

one_g<-ggplot(one_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
one_g<-one_g+geom_bar(stat="identity")
one_g<-one_g+labs(title="Unigrams",x="Words",y="Frequency")
one_g<-one_g+theme(axis.text.x=element_text(angle=90))
one_g

two_g<-ggplot(two_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
two_g<-two_g+geom_bar(stat="identity")
two_g<-two_g+labs(title="Bigrams",x="Words",y="Frequency")
two_g<-two_g+theme(axis.text.x=element_text(angle=90))
two_g

thr_g<-ggplot(thr_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
thr_g<-thr_g+geom_bar(stat="identity")
thr_g<-thr_g+labs(title="Trigrams",x="Words",y="Frequency")
thr_g<-thr_g+theme(axis.text.x=element_text(angle=90))
thr_g

Prediction Algorithm for the app

1. The prediction model will use these n-gram frequencies. While the user is typing, the app should suggest words that match all or part of the string typed so far, ordered by descending frequency.

2. When the user types a space (meaning the current word is finished), the app should respond with the most likely next words, again ordered by descending frequency.
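
A minimal sketch of how this back-off lookup could work against the frequency tables built above; predict_next() and its matching rules are illustrative assumptions, not the final app implementation:

# hypothetical back-off lookup over the n-gram frequency tables built above
predict_next <- function(phrase, n = 3) {
  phrase <- tolower(str_trim(phrase))
  words <- unlist(strsplit(phrase, "\\s+"))
  last2 <- paste(tail(words, 2), collapse = " ")
  last1 <- tail(words, 1)

  # try trigrams whose first two words match the last two typed words
  hits <- thr_corpus_sort[startsWith(as.character(thr_corpus_sort$Word),
                                     paste0(last2, " ")), ]
  # back off to bigrams whose first word matches the last typed word
  if (nrow(hits) == 0) {
    hits <- two_corpus_sort[startsWith(as.character(two_corpus_sort$Word),
                                       paste0(last1, " ")), ]
  }
  # back off to the most frequent unigrams if nothing matched
  if (nrow(hits) == 0) hits <- one_corpus_sort

  # tables are already sorted by descending frequency; return the last word
  # of the top matches as the suggested next words
  head(word(as.character(hits$Word), -1), n)
}

predict_next("cant wait")   # e.g. "see", ...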