This document is the week 2 milestone report for the Johns Hopkins Data Science Specialization Capstone Project. It describes the main features of the data identified so far and briefly summarizes the plan for creating the prediction algorithm and Shiny app in a way that is understandable to a non-data-scientist manager.

Introduction

The main structure for managing documents is a so-called corpus: a large collection of texts, that is, a body of written or spoken material upon which a linguistic analysis is based. The plural form of corpus is corpora.
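
As a small illustration (a minimal sketch using the tm package, which this report relies on throughout), a corpus can be built from an ordinary character vector:

library(tm)
# Two toy documents wrapped into a corpus; the real corpora below are built
# the same way from sampled lines of the Twitter, blogs, and news files.
docs <- c("The cow jumps over the moon.", "I think I can.")
toy_corpus <- VCorpus(VectorSource(docs))
inspect(toy_corpus)   # prints a summary of the two documents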

Text Mining on the Corpus

Load the files and explore them.

con <- file("C:/DataScienceProgram/Capstone/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", "r")
twitter<- readLines(con,-1, skipNul= TRUE)
close(con)

con <- file("C:/DataScienceProgram/Capstone/Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "r")
blogs<- readLines(con,-1, skipNul= TRUE)
close(con)

con <- file("C:/DataScienceProgram/Capstone/Coursera-SwiftKey/final/en_US/en_US.news.txt", "r")
news<- readLines(con,-1, skipNul= TRUE)
## Warning in readLines(con, -1, skipNul = TRUE): incomplete final line
## found on 'C:/DataScienceProgram/Capstone/Coursera-SwiftKey/final/en_US/
## en_US.news.txt'
close(con)
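
The warning occurs because en_US.news.txt contains an embedded control character that ends a text-mode read early on some platforms. A common workaround, sketched below as an optional alternative, is to open the connection in binary mode; the counts reported below reflect the text-mode read above.

# Optional workaround (sketch): binary mode keeps the embedded control
# character from truncating the file and avoids the warning.
con <- file("C:/DataScienceProgram/Capstone/Coursera-SwiftKey/final/en_US/en_US.news.txt", "rb")
news <- readLines(con, -1, skipNul = TRUE)
close(con)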

filesdata <- data.frame(
  fileName = c("en_US.twitter.txt", "en_US.blogs.txt", "en_US.news.txt"),
  NumberOfLines = c(length(twitter), length(blogs), length(news)),
  NumberOfWords = c(sum(str_count(twitter, "\\S+")),
                    sum(str_count(blogs, "\\S+")),
                    sum(str_count(news, "\\S+")))
)

kable(filesdata, format = "markdown")   # kable accepts the data frame directly
|fileName          | NumberOfLines| NumberOfWords|
|:-----------------|-------------:|-------------:|
|en_US.twitter.txt |       2360148|      30373832|
|en_US.blogs.txt   |        899288|      37334441|
|en_US.news.txt    |         77259|       2643972|

Sampling and Transformation

The files are large, so working with a random sample is enough to give us the information we need, such as the most frequent words and n-grams. To clean the sampled text we:

- remove numbers
- remove English stop words
- strip extra white space
- convert to lower case
- remove punctuation

set.seed(3)
samplingsize <- 5000

SampleTrans <- function(x) {
    x <- sample(x, samplingsize)   # draw a random sample of lines
    vs <- VectorSource(x)
    x <- VCorpus(vs, readerControl = list(reader = readPlain, language = "en"))
    x <- tm_map(x, removeNumbers)
    # Stop words are removed before lower-casing, so capitalized forms such
    # as "I" and "The" survive and show up in the n-gram tables below.
    x <- tm_map(x, removeWords, stopwords("english"))
    x <- tm_map(x, stripWhitespace)
    x <- tm_map(x, content_transformer(tolower))
    x <- tm_map(x, removePunctuation, preserve_intra_word_dashes = TRUE)
    x                              # return the cleaned corpus
}
twitter_sample <- SampleTrans(twitter)
blogs_sample <- SampleTrans(blogs)
news_sample <- SampleTrans(news)

Tokenize

top_N_token <- function(corpus, n) {
  # term-document matrix restricted to words of 3-10 characters
  tokens <- TermDocumentMatrix(corpus, control = list(wordLengths = c(3, 10)))
  fm <- rowSums(as.matrix(tokens))
  tokens_freq <- data.frame(word = names(fm), freq = fm)
  tokens_freq <- tokens_freq[order(-tokens_freq$freq), ]
  tokens_freq[1:n, ]               # the n most frequent tokens
}

twitter_tokens_top <- top_N_token(twitter_sample, 10)
blog_tokens_top <- top_N_token(blogs_sample, 10)
news_tokens_top <- top_N_token(news_sample, 10)
 
 
g.twitter.top <- ggplot(twitter_tokens_top, aes(x = word, y = freq)) +
  geom_bar(stat = "identity") + labs(title = "Most Frequent: Twitter")
print(g.twitter.top)

g.blog.top <- ggplot(blog_tokens_top, aes(x = word, y = freq)) +
  geom_bar(stat = "identity") + labs(title = "Most Frequent: Blogs")
print(g.blog.top)

g.news.top <- ggplot(news_tokens_top, aes(x = word, y = freq)) +
  geom_bar(stat = "identity") + labs(title = "Most Frequent: News")
print(g.news.top)

Prediction Algorithm Plan

N-grams can serve as the basis for the prediction algorithm. Google and Microsoft have developed web-scale n-gram models that are used in a variety of tasks such as spelling correction, word breaking, and text summarization.

N-grams

N-grams are extensively used in text mining and natural language processing tasks. An n-gram is a set of co-occurring words within a given window; when computing the n-grams you typically move one word forward (although you can move several words forward in more advanced scenarios). For example, for the sentence “The cow jumps over the moon” with N = 2 (known as bigrams), the n-grams are: “the cow”, “cow jumps”, “jumps over”, “over the”, “the moon”.
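
As a quick illustration (a minimal base-R sketch, separate from the tm/RWeka pipeline below), those bigrams can be produced by pairing each word with its successor:

# Base-R illustration of bigram construction (not part of the pipeline).
words <- strsplit(tolower("The cow jumps over the moon"), "\\s+")[[1]]
paste(head(words, -1), tail(words, -1))
## [1] "the cow"    "cow jumps"  "jumps over" "over the"   "the moon"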

BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))


bigrams <- function(x) {
  tdm <- TermDocumentMatrix(x, control = list(tokenize = BigramTokenizer))
  fm <- rowSums(as.matrix(tdm))
  ngram <- data.frame(ngram = names(fm), freq = fm)
  ngram[order(-ngram$freq), ]
}
trigrams <- function(x) {
  tdm <- TermDocumentMatrix(x, control = list(tokenize = TrigramTokenizer))
  fm <- rowSums(as.matrix(tdm))
  ngram <- data.frame(ngram = names(fm), freq = fm)
  ngram[order(-ngram$freq), ]
}

blogs_a2 <- bigrams(blogs_sample);     blogs_a3 <- trigrams(blogs_sample)
news_a2 <- bigrams(news_sample);       news_a3 <- trigrams(news_sample)
twitter_a2 <- bigrams(twitter_sample); twitter_a3 <- trigrams(twitter_sample)

twitter_bigrams_top <- twitter_a2[1:10, ]
blog_bigrams_top <- blogs_a2[1:10, ]
news_bigrams_top <- news_a2[1:10, ]
 
# twitter 2-grams
twitter_bigrams_top
##               ngram freq
## i think     i think   57
## i love       i love   56
## right now right now   39
## i just       i just   38
## i need       i need   37
## i know       i know   36
## i want       i want   32
## i can         i can   28
## i get         i get   27
## i hope       i hope   27
# blogs 2-grams
blog_bigrams_top
##               ngram freq
## i think     i think  129
## i will       i will  104
## i know       i know  102
## i can         i can   91
## i love       i love   72
## i really   i really   69
## i just       i just   67
## i want       i want   63
## i don’t       i don’t   62
## time i       time i   57
# news 2-grams
news_bigrams_top
##                       ngram freq
## i think             i think   59
## new york           new york   59
## last year         last year   54
## st louis           st louis   54
## high school     high school   44
## new jersey       new jersey   42
## years ago         years ago   35
## m pm                   m pm   31
## first time       first time   30
## san francisco san francisco   29
 twitter_a3[1:10,]
##                               ngram freq
## i wish i                   i wish i    9
## happy mothers day happy mothers day    8
## yes yes yes             yes yes yes    8
## i just got               i just got    7
## i just want             i just want    7
## let us know             let us know    6
## cinco de mayo         cinco de mayo    5
## i guess i                 i guess i    5
## i think i                 i think i    5
## cant wait see         cant wait see    4
 blogs_a3[1:10, ]
##                           ngram freq
## i think i             i think i   22
## i know i               i know i   16
## first time i       first time i   13
## i don’t think   i don’t think   11
## i can see             i can see    9
## i thought i         i thought i    9
## i don’t know     i don’t know    8
## i feel like         i feel like    8
## i knew i               i knew i    8
## last night i       last night i    8
 news_a3[1:10, ]
##                                           ngram freq
## st louis county                 st louis county   12
## new york city                     new york city    8
## gov chris christie           gov chris christie    7
## two years ago                     two years ago    7
## chief financial officer chief financial officer    6
## first time since               first time since    6
## i feel like                         i feel like    6
## new york times                   new york times    6
## past two years                   past two years    6
## the associated press       the associated press    6
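
These frequency tables lead directly to the planned prediction algorithm: given the last words a user has typed, look up the most frequent n-gram that begins with them and suggest its final word, backing off from trigrams to bigrams when no match exists. Below is a minimal sketch of that idea using the trigram tables built above; predict_next is a hypothetical helper, not the final model, which would add smoothing and deeper backoff before being wrapped in the Shiny app.

# Sketch only: return the most likely next word after the two given words,
# by ranking trigrams that start with them (hypothetical helper).
predict_next <- function(trigram_freq, w1, w2) {
  prefix <- paste0(w1, " ", w2, " ")
  matches <- trigram_freq[startsWith(as.character(trigram_freq$ngram), prefix), ]
  if (nrow(matches) == 0) return(NA_character_)   # caller would back off to bigrams
  best <- as.character(matches$ngram[which.max(matches$freq)])
  tail(strsplit(best, " ")[[1]], 1)               # last word of the best trigram
}

predict_next(twitter_a3, "i", "wish")             # e.g. "i", from "i wish i" above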