First I set the path to the data set and downloaded the three English text files provided:

I used readLines() to read the three files into character vectors. Each element of the resulting character vectors is a blog entry, a news story, or a tweet.

Let's perform an exploratory analysis of the training data sets.

Let's perform a basic summary of en_US.blogs.txt.

path1 <- "E:/SS/Coursera Data Science Specialization/Data Science Capstone/Week 1/en_US/en_US.blogs.txt"
conn1 <- file(path1, open="rb")
en_US.blogs <- readLines(conn1)
close(conn1)
len <- length(en_US.blogs)
cat("There are", len, "observations in en_US.blogs")
## There are 899288 observations in en_US.blogs
size <- format(round(file.info(path1)$size / 1024 / 1024, 2), nsmall = 2)
cat("The size of en_US.blogs.txt is", size, "MB")
## The size of en_US.blogs.txt is 200.42 MB
nchars <- nchar(en_US.blogs)
maxchars <- which.max(nchars)
cat("Line", maxchars, "has the maximum number of characters")
## Line 483415 has the maximum number of characters
max_char <- nchars[maxchars]
cat("The longest line contains", max_char, "characters")
## The longest line contains 40835 characters
num_words <- sum(sapply(strsplit(en_US.blogs, "\\s+"), length))
cat("The number of words in en_US.blogs.txt is", num_words)
## The number of words in en_US.blogs.txt is 37334441

Let's perform a basic summary of en_US.news.txt.

path2 <- "E:/SS/Coursera Data Science Specialization/Data Science Capstone/Week 1/en_US/en_US.news.txt"
conn2 <- file(path2, open="rb")
en_US.news <- readLines(conn2)
close(conn2)
len1 <- length(en_US.news)
cat("There are", len1, "observations in en_US.news")
## There are 1010242 observations in en_US.news
size1 <- format(round(file.info(path2)$size / 1024 / 1024, 2), nsmall = 2)
cat("The size of en_US.news.txt is", size1, "MB")
## The size of en_US.news.txt is 196.28 MB
nchars1 <- nchar(en_US.news)
maxchars1 <- which.max(nchars1)
cat("Line", maxchars1, "has the maximum number of characters")
## Line 123628 has the maximum number of characters
max_char1 <- nchars1[maxchars1]
cat("The longest line contains", max_char1, "characters")
## The longest line contains 11384 characters
num_words1 <- sum(sapply(strsplit(en_US.news, "\\s+"), length))
cat("The number of words in en_US.news.txt is", num_words1)
## The number of words in en_US.news.txt is 34372598

Let's perform a basic summary of en_US.twitter.txt.

path3 <- "E:/SS/Coursera Data Science Specialization/Data Science Capstone/Week 1/en_US/en_US.twitter.txt"
conn3 <- file(path3, open="rb")
suppressWarnings(en_US.twitter <- readLines(conn3))
close(conn3)
len2 <- length(en_US.twitter)
cat("There are", len2, "observations in en_US.twitter")
## There are 2360148 observations in en_US.twitter
size2 <- format(round(file.info(path3)$size / 1024 / 1024, 2), nsmall = 2)
cat("The size of en_US.twitter.txt is", size2, "MB")
## The size of en_US.twitter.txt is 159.36 MB
nchars2 <- nchar(en_US.twitter)
maxchars2 <- which.max(nchars2)
cat("Line", maxchars2, "has the maximum number of characters")
## Line 1484357 has the maximum number of characters
max_char2 <- nchars2[maxchars2]
cat("The longest line contains", max_char2, "characters")
## The longest line contains 213 characters
num_words2 <- sum(sapply(strsplit(en_US.twitter, "\\s+"), length))
cat("The number of words in en_US.twitter.txt is", num_words2)
## The number of words in en_US.twitter.txt is 30373792

Building n-grams

I delved into text mining algorithms and found the n-gram model to be effective. Since the objects en_US.blogs, en_US.news, and en_US.twitter are voluminous, I sampled 10% of the combined data to develop a corpus.

# Create a sample of the combined corpus, find ngrams and show top 10
library(text2vec)
## Warning: package 'text2vec' was built under R version 3.3.2
set.seed(1295)
# Take a 10% sample of the combined corpus to make it more manageable
enCorpus <- c(en_US.blogs,en_US.news,en_US.twitter)
samp <- sample(length(enCorpus), ceiling(length(enCorpus) * .10))
enCorpus <- enCorpus[samp]

I will use the text2vec and tm packages to analyse the text.

First I will use the text2vec package to tokenize the data into n-grams; later I will use the tm package to build word clouds, so that we can see the utility of both packages.

Here, I use the text2vec package to tokenize the data:

  • It will convert entries to lower case

  • Then it will split them into separate words

  • Punctuation and whitespace are dropped in the process

# Create tokens
library(text2vec)
library(magrittr)

tokens <- enCorpus %>% tolower() %>%  word_tokenizer()
it <- itoken(tokens)

Here, I will build 5-gram, 4-gram, 3-gram, 2-gram, and 1-gram models. From each, I keep only the n-grams that appear at least 20 times, then select the top 10.

Top 10 5-grams

# Getting 5-grams and terms_counts, min count = 20
vocab5gram <- create_vocabulary(it, ngram = c(5L, 5L)) %>%  prune_vocabulary(term_count_min = 20)
# Sort vocab5gram  in decreasing order
vocab5gram$vocab <- vocab5gram$vocab[order(vocab5gram$vocab$terms_counts,decreasing = TRUE)]
vocab5gram$vocab[1:10]  # Top 10 5-grams

                    terms  terms_counts  doc_counts
 1:     at_the_end_of_the           356         350
 2:                                 302          15
 3:     can_t_wait_to_see           282         282
 4:       i_don_t_want_to           231         222
 5:      it_s_going_to_be           222         220
 6:       i_can_t_wait_to           202         202
 7:  in_the_middle_of_the           187         186
 8:     i_don_t_know_what           163         159
 9: for_the_first_time_in           159         158
10:       s_going_to_be_a           138         138

Top 10 4-grams

# Getting 4-grams and terms_counts, min count = 20
vocab4gram <- create_vocabulary(it, ngram = c(4L, 4L)) %>%  prune_vocabulary(term_count_min = 20)
# Sort vocab4gram  in decreasing order
vocab4gram$vocab <- vocab4gram$vocab[order(vocab4gram$vocab$terms_counts,decreasing = TRUE)]
vocab4gram$vocab[1:10]  # Top 10 4-grams

The table below shows the top 10 4-grams along with the number of times they appear in the sampled corpus.

                    terms  terms_counts  doc_counts
 1:          i_don_t_know           862         844
 2:         can_t_wait_to           859         859
 3:          i_m_going_to           799         776
 4:        the_end_of_the           769         753
 5: thanks_for_the_follow           654         654
 6:    for_the_first_time           648         639
 7:       the_rest_of_the           646         639
 8:         at_the_end_of           637         626
 9:         i_don_t_think           590         584
10:      at_the_same_time           534         527

Top 10 3-grams

# Getting 3-grams and terms_counts, min count = 20
vocab3gram <- create_vocabulary(it, ngram = c(3L, 3L)) %>%  prune_vocabulary(term_count_min = 20)
# Sort vocab3gram  in decreasing order
vocab3gram$vocab <- vocab3gram$vocab[order(vocab3gram$vocab$terms_counts,decreasing = TRUE)]
vocab3gram$vocab[1:10]  # Top 10 3-grams

The table below shows the top 10 3-grams along with the number of times they appear in the sampled corpus.

             terms  terms_counts  doc_counts
 1:        i_don_t          4449        4205
 2:     one_of_the          3507        3407
 3:       a_lot_of          3036        2843
 4: thanks_for_the          2393        2390
 5:        i_can_t          2128        2089
 6:         it_s_a          2056        2013
 7:        to_be_a          1842        1808
 8:    going_to_be          1782        1711
 9:        i_m_not          1706        1658
10:     can_t_wait          1643        1636

Top 10 2-grams

# Getting 2-grams and terms_counts, min count = 20
vocab2gram <- create_vocabulary(it, ngram = c(2L, 2L)) %>%  prune_vocabulary(term_count_min = 20)
# Sort vocab2gram  in decreasing order
vocab2gram$vocab <- vocab2gram$vocab[order(vocab2gram$vocab$terms_counts,decreasing = TRUE)]
vocab2gram$vocab[1:10]  # Top 10 2-grams

The table below shows the top 10 2-grams along with the number of times they appear in the sampled corpus.

      terms  terms_counts  doc_counts
 1:  of_the         43190       35523
 2:  in_the         40813       35269
 3:  to_the         21282       19476
 4: for_the         20001       18827
 5:  on_the         19505       17969
 6:     i_m         18278       16472
 7:    it_s         18205       16217
 8:   to_be         16333       14924
 9:  at_the         14278       13470
10:   don_t         13010       12041

Top 10 1-grams

library(tm)
## Loading required package: NLP
# Getting 1-grams and terms_counts, min count = 20
vocab1gram <- create_vocabulary(it, ngram = c(1L, 1L),stopwords= c(stopwords("english"),letters)) %>%  prune_vocabulary(term_count_min = 20)
# Sort vocab1gram  in decreasing order
vocab1gram$vocab <- vocab1gram$vocab[order(vocab1gram$vocab$terms_counts,decreasing = TRUE)]
vocab1gram$vocab[1:10]  # Top 10 1-grams

The table below shows the top 10 1-grams along with the number of times they appear in the sampled corpus.

    terms  terms_counts  doc_counts
 1:               50931       25557
 2:  will         31688       26295
 3:   can         31323       27773
 4:  just         30643       28370
 5:  said         30286       27560
 6:   one         30202       26384
 7:  like         27385       24649
 8:   get         22799       20945
 9:  time         22461       20334
10:   new         19762       17660

Creating Word Cloud with cleaned data

First we sample 10% of the data from each of the three data sets and strip non-ASCII characters. Then we build and clean a corpus for each sample with the text mining package (tm).

# Define a function to remove non-ASCII (non-English) characters
clean_data <- function(x) {
  # iconv() is vectorized, so no explicit loop is needed
  iconv(x, "UTF-8", "ASCII", sub = "")
}

# Creating sample

samp_en_US.blogs    <- sample(en_US.blogs,   ceiling(length(en_US.blogs) * .10))
samp_en_US.news     <- sample(en_US.news,    ceiling(length(en_US.news) * .10))
samp_en_US.twitter  <- sample(en_US.twitter, ceiling(length(en_US.twitter) * .10))

# Remove non-English characters from the samples
sampleBlogs   <- clean_data(samp_en_US.blogs)
sampleNews    <- clean_data(samp_en_US.news)
sampleTwitter <- clean_data(samp_en_US.twitter)

# Save merged corpus samples

sampleData <- c(sampleBlogs, sampleNews, sampleTwitter)
writeLines(sampleData, "./sampleData.txt")

# From sampleBlogs, create a corpus
    # Create corpus 
    corpora <- Corpus(VectorSource(sampleBlogs))
    # Cleaning corpus 
    corpora <- tm_map(corpora, tolower)
    # Eliminate punctuation characters
    corpora <- tm_map(corpora, removePunctuation)
    # Eliminate numbers
    corpora <- tm_map(corpora, removeNumbers)
    # Strip Whitespace
    corpora <- tm_map(corpora, stripWhitespace)
    
    # Eliminate bad words
    # Load a pre-built list of bad words
    badwords <- readLines("E:/SS/Coursera Data Science Specialization/Data Science Capstone/Week 2/badwords.txt",n=1323)
    corpora <- tm_map(corpora, removeWords, badwords)
    # Eliminate English stop words
    corpora <- tm_map(corpora, removeWords, stopwords("english"))
    # Perform stemming
    corpora <- tm_map(corpora, stemDocument)
    # Create plain text format
    corpora <- tm_map(corpora, PlainTextDocument)
    # Calculate document term frequency for corpus
    dtMatrix <- DocumentTermMatrix(corpora, control=list(wordLengths=c(0,Inf)))
    #str(dtMatrix)

Plot of Sampled Blogs Corpus with Word Cloud

library(wordcloud); library(slam)
## Loading required package: RColorBrewer
# Set random seed for reproducibility
set.seed(1264)

Headings= c("Word Cloud Of US English Blogs",
            "Word Cloud Of US English News", 
            "Word Cloud Of US English Twitter")

# From corpus and DTM, plot word cloud (max 100 words)
par(mar=c(3,2,1,2))
wordcloud(words = colnames(dtMatrix), freq = slam::col_sums(dtMatrix), 
        scale = c(3,1), max.words = 100, random.order = FALSE, rot.per = 0.45, 
        use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
title(Headings[1], cex.main = 1.5)

Creating a corpus from sampleNews and building its word cloud

# From sampleNews, create a corpus
    # Create corpus 
    corpora <- Corpus(VectorSource(sampleNews))
    # Cleaning corpus 
    corpora <- tm_map(corpora, tolower)
    # Eliminate punctuation characters
    corpora <- tm_map(corpora, removePunctuation)
    # Eliminate numbers
    corpora <- tm_map(corpora, removeNumbers)
    # Strip Whitespace
    corpora <- tm_map(corpora, stripWhitespace)
    
    # Eliminate bad words
    # Load a pre-built list of bad words
    badwords <- readLines("E:/SS/Coursera Data Science Specialization/Data Science Capstone/Week 2/badwords.txt",n=1323)
    corpora <- tm_map(corpora, removeWords, badwords)
    # Eliminate English stop words
    corpora <- tm_map(corpora, removeWords, stopwords("english"))
    # Perform stemming
    corpora <- tm_map(corpora, stemDocument)
    # Create plain text format
    corpora <- tm_map(corpora, PlainTextDocument)
    # Calculate document term frequency for corpus
    dtMatrix <- DocumentTermMatrix(corpora, control=list(wordLengths=c(0,Inf)))
    #str(dtMatrix)

Plot of Sampled News Corpus with Word Cloud

library(wordcloud); library(slam)
# Set random seed for reproducibility
set.seed(1264)

Headings= c("Word Cloud Of US English Blogs",
            "Word Cloud Of US English News", 
            "Word Cloud Of US English Twitter")

# From corpus and DTM, plot word cloud (max 100 words)
par(mar=c(3,2,1,2))

wordcloud(words = colnames(dtMatrix), freq = slam::col_sums(dtMatrix), 
        scale = c(3,1), max.words = 100, random.order = FALSE, rot.per = 0.45, 
        use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
title(Headings[2], cex.main = 1.5)

Creating a corpus from sampleTwitter and building its word cloud

# From sampleTwitter, create a corpus
    # Create corpus 
    corpora <- Corpus(VectorSource(sampleTwitter))
    # Cleaning corpus 
    corpora <- tm_map(corpora, tolower)
    # Eliminate punctuation characters
    corpora <- tm_map(corpora, removePunctuation)
    # Eliminate numbers
    corpora <- tm_map(corpora, removeNumbers)
    # Strip Whitespace
    corpora <- tm_map(corpora, stripWhitespace)
    
    # Eliminate bad words
    # Load a pre-built list of bad words
    badwords <- readLines("E:/SS/Coursera Data Science Specialization/Data Science Capstone/Week 2/badwords.txt",n=1323)
    corpora <- tm_map(corpora, removeWords, badwords)
    # Eliminate English stop words
    corpora <- tm_map(corpora, removeWords, stopwords("english"))
    # Perform stemming
    corpora <- tm_map(corpora, stemDocument)
    # Create plain text format
    corpora <- tm_map(corpora, PlainTextDocument)
    # Calculate document term frequency for corpus
    dtMatrix <- DocumentTermMatrix(corpora, control=list(wordLengths=c(0,Inf)))
    #str(dtMatrix)

Plot of Sampled Twitter Corpus with Word Cloud

library(wordcloud); library(slam)
# Set random seed for reproducibility
set.seed(1264)

Headings= c("Word Cloud Of US English Blogs",
            "Word Cloud Of US English News", 
            "Word Cloud Of US English Twitter")

# From corpus and DTM, plot word cloud (max 100 words)
par(mar=c(3,2,1,2))
wordcloud(words = colnames(dtMatrix), freq = slam::col_sums(dtMatrix), 
        scale = c(3,1), max.words = 100, random.order = FALSE, rot.per = 0.45, 
        use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
title(Headings[3], cex.main = 1.5)

Building a prediction algorithm and Shiny app

I will experiment with different models.

Since I have developed 1-gram, 2-gram, 3-gram, and 4-gram tables, I will use those to predict the next word.

A decision tree built from the left-hand words of each n-gram can be used to predict the right-most word; a simplified lookup-table version of this idea is sketched below.
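
As a rough illustration of that idea (a sketch only, not the final model), each pruned n-gram table built above can be split into a context prefix and the word that follows it. The column names terms and terms_counts follow the text2vec output shown earlier, and the helper name build_lookup is hypothetical.

# A minimal sketch, assuming the vocab tables above with columns "terms" and
# "terms_counts": split each n-gram into a prefix and its final word, keeping
# the most frequent continuation per prefix.
library(data.table)

build_lookup <- function(vocab_dt) {   # hypothetical helper name
  parts  <- strsplit(vocab_dt$terms, "_", fixed = TRUE)
  prefix <- sapply(parts, function(w) paste(head(w, -1), collapse = "_"))
  nextw  <- sapply(parts, function(w) tail(w, 1))
  dt <- data.table(prefix = prefix, next_word = nextw,
                   count = vocab_dt$terms_counts)
  # Keep only the highest-count continuation for each prefix
  dt[order(-count)][!duplicated(prefix)]
}

lookup4 <- build_lookup(vocab4gram$vocab)   # 3-word prefix -> next word
lookup3 <- build_lookup(vocab3gram$vocab)   # 2-word prefix -> next word
lookup2 <- build_lookup(vocab2gram$vocab)   # 1-word prefix -> next word
#lookup4[prefix == "the_end_of"]            # e.g. should suggest "the"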

Back-off and interpolated smoothing models may be useful for the n-gram model.

They are needed when the input does not match anything seen in the training data.

Kneser-Ney smoothing may also be used. A very simple back-off sketch is shown below.
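
As a very simple stand-in for those smoothing schemes, a "stupid back-off" can be sketched: try the longest matching prefix first and back off to shorter ones, ending with the most frequent unigram. The lookup tables (lookup4, lookup3, lookup2) and their prefix/next_word columns are the hypothetical ones from the sketch above, not the final algorithm.

# Minimal stupid back-off sketch; lookups is a named list ordered from the
# longest context to the shortest, e.g. list("3" = lookup4, "2" = lookup3,
# "1" = lookup2), and top_unigram is a single fall-back word.
predict_next <- function(phrase, lookups, top_unigram = "the") {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  for (n_chr in names(lookups)) {
    n <- as.integer(n_chr)
    if (length(words) < n) next                    # not enough context for this table
    key <- paste(tail(words, n), collapse = "_")
    tbl <- lookups[[n_chr]]
    hit <- tbl[tbl$prefix == key, ]
    if (nrow(hit) > 0) return(hit$next_word[1])    # found a continuation: stop here
  }
  top_unigram                                      # nothing matched: back off fully
}

#predict_next("at the end of", list("3" = lookup4, "2" = lookup3, "1" = lookup2))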

The layout of the Shiny app will be (a minimal skeleton follows the list):

  • Text describing the app and directions for its usage

  • A text input box for the user to enter a word or phrase

  • A Submit button to send the input to the prediction algorithm

  • A display area for the predicted next word
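
A minimal skeleton of that layout might look like the sketch below. The object names (ui, server) and the call to the predict_next() sketch above are assumptions, not the final app.

# Minimal Shiny skeleton matching the layout described above (a sketch only)
library(shiny)

ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  p("Type a word or phrase and press Submit to see the predicted next word."),
  textInput("phrase", "Enter a word or phrase:"),
  actionButton("submit", "Submit"),
  h4("Predicted next word:"),
  textOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    input$submit                 # re-run only when Submit is pressed
    isolate(predict_next(input$phrase,
                         list("3" = lookup4, "2" = lookup3, "1" = lookup2)))
  })
}

#shinyApp(ui = ui, server = server)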