Synopsis

The capstone project focuses on applying data science techniques to natural language processing. The key task is to create an R Shiny application that predicts the next word given a preceding word or sequence of words. SwiftKey is the corporate sponsor for this project.

Objective / Motivation

  1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
  2. Create a basic report of summary statistics about the data sets.
  3. Report any interesting findings that you amassed so far.
  4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Review criteria

  1. Does the link lead to an HTML page describing the exploratory analysis of the training data set?
  2. Has the data scientist done basic summaries of the three files? Word counts, line counts and basic data tables?
  3. Has the data scientist made basic plots, such as histograms to illustrate features of the data?
  4. Was the report written in a brief, concise style, in a way that a non-data scientist manager could appreciate?

Load Packages

# Loading required libraries
library(tm)
## Loading required package: NLP
library(ngram)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(quanteda)
## Package version: 1.5.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
## 
##     View

Reading and Loading the Text Files

# Read the three English-language source files
En_Twit_text <- readLines("./data/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
En_US_blogs_text <- readLines("./data/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
En_US_NEWS_text <- readLines("./data/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
# Randomly sample ~1.5% of the lines in each file for exploration and model building

twitter_sample <- sample(En_Twit_text, length(En_Twit_text) * 0.015, replace = FALSE)
news_sample    <- sample(En_US_NEWS_text, length(En_US_NEWS_text) * 0.015, replace = FALSE)
blogs_sample   <- sample(En_US_blogs_text, length(En_US_blogs_text) * 0.015, replace = FALSE)

# Combine the samples into a single corpus for text mining and pre-processing
sample_files <- c(twitter_sample, news_sample, blogs_sample)
files <- Corpus(VectorSource(sample_files))

# Tag each sampled line with its source (order must match the concatenation above)
df.nwords.all <- data.frame(nword = c(twitter_sample, news_sample, blogs_sample),
                            type  = c(rep("twitter", length(twitter_sample)),
                                      rep("news", length(news_sample)),
                                      rep("blog", length(blogs_sample))))

Exploratory data analysis

# Build a single-document tm corpus from a character vector
make_Corpus <- function(test_file) {
    gen_corp <- paste(test_file, collapse = " ")
    gen_corp <- VectorSource(gen_corp)
    Corpus(gen_corp)
}
    
# Clean the corpus: strip unusual separator characters, numbers, case,
# English stop words, punctuation, and extra whitespace
clean_corp <- function(corp_data) {
    WordSeparators <- "[[:punct:]]|\u00ad|\u0091|\u0092|\u0093|\u0094|\u0095|\u0096|\u0097|\u0098|\u00a6"
    toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

    corp_data <- tm_map(corp_data, toSpace, WordSeparators)
    corp_data <- tm_map(corp_data, removeNumbers)
    corp_data <- tm_map(corp_data, content_transformer(tolower))
    corp_data <- tm_map(corp_data, removeWords, stopwords("english"))
    corp_data <- tm_map(corp_data, removePunctuation)
    corp_data <- tm_map(corp_data, stripWhitespace)
    corp_data <- tm_map(corp_data, PlainTextDocument)
    return(corp_data)
}

# Count word frequencies via a document-term matrix, sorted in decreasing order
high_freq_words <- function(corp_data) {
    term_sparse <- DocumentTermMatrix(corp_data)
    term_matrix <- as.matrix(term_sparse)   # convert the sparse document-term matrix into a dense matrix
    freq_words <- colSums(term_matrix)
    freq_words <- as.data.frame(sort(freq_words, decreasing = TRUE))
    freq_words$word <- rownames(freq_words)
    colnames(freq_words) <- c("Frequency", "word")
    return(freq_words)
}

Bar Charts of High-Frequency Words

## en_US.news.txt High frequency words 
    
    US_news_corpus <- make_Corpus(news_sample)
    US_news_corpus <- clean_corp(US_news_corpus)
## Warning in tm_map.SimpleCorpus(corp_data, toSpace, WordSeparators):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords,
## stopwords("english")): transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, PlainTextDocument):
## transformation drops documents
    US_news_most_used_word <- high_freq_words(US_news_corpus)
    US_news_most_used_word1<- US_news_most_used_word[1:15,]

    p<-ggplot(data=US_news_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
                    fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity") 
    p + xlab("Word") +labs(title = "Most Frequent words : US News") +theme(legend.title=element_blank()) + coord_flip()

## en_US.blogs.txt High frequency words 
    US_blogs_corpus <- make_Corpus(blogs_sample)
    US_blogs_corpus <- clean_corp(US_blogs_corpus)
## Warning in tm_map.SimpleCorpus(corp_data, toSpace, WordSeparators):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords,
## stopwords("english")): transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, PlainTextDocument):
## transformation drops documents
    US_blogs_most_used_word <- high_freq_words(US_blogs_corpus)
    US_blogs_most_used_word1<- US_blogs_most_used_word[1:15,]

    p<-ggplot(data=US_blogs_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
                    fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity") 
    p + xlab("Word") +labs(title = "Most Frequent words : US blogs") +theme(legend.title=element_blank()) + coord_flip()

## en_US.twitter.txt High frequency words 
    twitter_corpus <- make_Corpus(twitter_sample)
    twitter_corpus <- clean_corp(twitter_corpus)
## Warning in tm_map.SimpleCorpus(corp_data, toSpace, WordSeparators):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords,
## stopwords("english")): transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, PlainTextDocument):
## transformation drops documents
    twitter_most_used_word <- high_freq_words(twitter_corpus)
    twitter_most_used_word1<- twitter_most_used_word[1:15,]
    
    p<-ggplot(data=twitter_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
                    fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity") 
    p + xlab("Word") +labs(title = "Most Frequent words : Twitter") +theme(legend.title=element_blank()) + coord_flip()

Generating the Word Cloud

## US News Word Cloud
    wordcloud(US_news_most_used_word$word[1:100], US_news_most_used_word$Frequency[1:100],
              colors=brewer.pal(8, "Dark2"))

## US Blogs Word Cloud
    wordcloud(US_blogs_most_used_word$word[1:100], US_blogs_most_used_word$Frequency[1:100],
              colors=brewer.pal(8, "Dark2"))    

## US Twitter Word Cloud
    wordcloud(twitter_most_used_word$word[1:100], twitter_most_used_word$Frequency[1:100],
              colors=brewer.pal(8, "Dark2"))

Word Analysis

For the analysis of the text documents we need to create bag-of-words matrices with unigrams, bigrams, and trigrams. These n-gram models improve the predictive power of the analysis.
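
As a quick illustration of what these n-gram tokens look like, here is a toy example on a made-up sentence (not part of the sampled data):

# Toy example: quanteda tokens joined into bigrams and trigrams
toy <- tokens("thanks for the follow")
tokens_ngrams(toy, n = 2)   # "thanks_for" "for_the" "the_follow"
tokens_ngrams(toy, n = 3)   # "thanks_for_the" "for_the_follow"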

## news High frequency words    
    US_News_tokens<- tokens(news_sample,what ="word", remove_numbers = TRUE, 
                            remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
    US_News_tokens <- tokens_tolower(US_News_tokens)
    US_News_tokens <- tokens_select(US_News_tokens, stopwords(),selection ="remove")

    US_News_unigram <- tokens_ngrams(US_News_tokens, n=1)  ## unigram
    US_News_unigram.dfm <- dfm(US_News_unigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)    

    US_News_bigram <- tokens_ngrams(US_News_tokens, n=2)  ## bigram
    US_News_bigram.dfm <- dfm(US_News_bigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)
    
    US_News_trigram <- tokens_ngrams(US_News_tokens, n=3)  ## trigram
    US_News_trigram.dfm <- dfm(US_News_trigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)
    topfeatures(US_News_unigram.dfm, 20)  # 20 top US News Unigram words
##   said    one    new   also    two    can   year   just  first   time 
##   3793   1249   1035    925    859    831    831    810    801    783 
## people   last  state  years   like    get school   make   city    now 
##    736    728    726    690    689    614    565    560    559    526
    topfeatures(US_News_trigram.dfm, 20)  # 20 top US News Trigram words
##         president_barack_obama                  two_years_ago 
##                             28                             25 
##             gov_chris_christie                  new_york_city 
##                             24                             20 
##                 four_years_ago     county_prosecutor's_office 
##                             15                             14 
##       national_weather_service                three_years_ago 
##                             11                             10 
##                   world_war_ii             u.s_district_court 
##                              9                              9 
##               last_three_years                st_louis_county 
##                              9                              9 
##                 past_two_years                cents_per_share 
##                              9                              9 
##                 new_york_times securities_exchange_commission 
##                              8                              8 
##           superior_court_judge              u.s_supreme_court 
##                              8                              8 
##        chief_executive_officer             u.s_district_judge 
##                              7                              7
## blog High frequency words
    US_blogs_tokens<- tokens(blogs_sample,what ="word", remove_numbers = TRUE, 
                            remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
    US_blogs_tokens <- tokens_tolower(US_blogs_tokens)
    US_blogs_tokens <- tokens_select(US_blogs_tokens, stopwords(),selection ="remove")

    US_blogs_unigram <- tokens_ngrams(US_blogs_tokens, n=1)  ## unigram
    US_blogs_unigram.dfm <- dfm(US_blogs_unigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)    

    US_blogs_bigram <- tokens_ngrams(US_blogs_tokens, n=2)  ## bigram
    US_blogs_bigram.dfm <- dfm(US_blogs_bigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)
    
    US_blogs_trigram <- tokens_ngrams(US_blogs_tokens, n=3)  ## trigram
    US_blogs_trigram.dfm <- dfm(US_blogs_trigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)
    topfeatures(US_blogs_unigram.dfm, 20)  # 20 top US blogs Unigram words
##    one    can   just   like   time    get people   know    now   back 
##   1875   1482   1471   1416   1291   1055    957    939    879    837 
##   also     us   even   make    new really   well   much  first    see 
##    830    811    797    780    764    759    751    751    747    729
    topfeatures(US_blogs_bigram.dfm, 20)  # 20 top US blogs Bigram words
##      new_york       can_see     right_now     last_year     years_ago 
##            80            78            75            75            73 
##     make_sure    first_time    little_bit     one_thing     feel_like 
##            64            61            54            54            54 
##   even_though       one_day    last_night     last_week   many_people 
##            52            52            52            51            49 
##   high_school     long_time     every_day united_states      let_know 
##            47            45            45            39            39
    topfeatures(US_blogs_trigram.dfm, 20)  # 20 top US blogs Trigram words
##     couple_weeks_ago       new_york_times        new_york_city 
##                   11                   10                    9 
##  look_forward_seeing       happy_new_year         world_war_ii 
##                    8                    7                    7 
##       love_love_love incorporated_item_pp    mummy_mummy_mummy 
##                    6                    6                    6 
##      spend_much_time      just_little_bit        two_weeks_ago 
##                    5                    5                    5 
##       spend_lot_time      dream_come_true  one_favorite_things 
##                    5                    5                    5 
##       four_years_ago ghost_towns_oklahoma      three_years_old 
##                    5                    5                    4 
##        long_time_now       last_two_years 
##                    4                    4
## twitter Ngram words 
    twitter_tokens<- tokens(twitter_sample,what ="word", remove_numbers = TRUE, 
                            remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
    twitter_tokens <- tokens_tolower(twitter_tokens)
    twitter_tokens <- tokens_select(twitter_tokens, stopwords(),selection ="remove")

    twitter_unigram <- tokens_ngrams(twitter_tokens, n=1)  ## unigram
    twitter_unigram.dfm <- dfm(twitter_unigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)    

    twitter_bigram <- tokens_ngrams(twitter_tokens, n=2)  ## bigram
    twitter_bigram.dfm <- dfm(twitter_bigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)
    
    twitter_trigram <- tokens_ngrams(twitter_tokens, n=3)  ## trigram
    twitter_trigram.dfm <- dfm(twitter_trigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)
    topfeatures(twitter_unigram.dfm, 20)  # 20 top Unigram words
##   just   like    get   love   good thanks     rt    day    can   know 
##   2297   1824   1630   1601   1546   1434   1425   1344   1341   1259 
##    now    one      u   time     go  great  today    lol    new    see 
##   1205   1193   1181   1121   1108   1103   1080   1043   1030    958
    topfeatures(twitter_bigram.dfm, 20)  # 20 top Bigram words
##       right_now      last_night  happy_birthday looking_forward 
##             237             171             128             126 
##       feel_like        just_got    good_morning       good_luck 
##             106             105             102             101 
##   thanks_follow     follow_back      looks_like        let_know 
##              99              86              85              83 
##         can_get       thanks_rt     thanks_much       next_week 
##              80              72              72              70 
##       make_sure      first_time       great_day   please_follow 
##              67              66              64              62
    topfeatures(twitter_trigram.dfm, 20)  # 20 top  Trigram words
##         happy_new_year     happy_mother's_day            let_us_know 
##                     30                     30                     28 
##      happy_mothers_day          cinco_de_mayo        show_last_night 
##                     16                     15                     13 
## looking_forward_seeing          cant_wait_see          just_got_back 
##                     13                     13                     11 
##    ralph_waldo_emerson       st_patrick's_day     please_follow_back 
##                     11                     11                     10 
##          just_got_done         keep_good_work    thanks_following_us 
##                     10                     10                      9 
##   looking_forward_next       just_finished_mi        finished_mi_run 
##                      9                      9                      9 
##         ever_ever_ever               go_go_go 
##                      9                      8

Interesting findings so far

I have gone through multiple articles and YouTube videos on text mining and learned a great deal, in particular about the quanteda library: how a text data set expands when represented as bags of words and different n-grams. The quanteda library appears well suited to this kind of text analytics and seems much faster than the tm package.
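
As a rough, informal check of that speed difference (not a formal benchmark), one could time building a document-feature matrix with quanteda against building a DocumentTermMatrix with tm on the same sample:

# Informal timing comparison on the sampled tweets (illustrative only)
system.time(q_dfm <- dfm(tokens(twitter_sample, remove_punct = TRUE)))
system.time(tm_dtm <- DocumentTermMatrix(Corpus(VectorSource(twitter_sample))))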

Get feedback on your plans for creating a prediction algorithm and Shiny app.

Plan of Approach:

  • Tokenization and bag-of-words features with multiple n-grams (unigrams, bigrams, trigrams).
  • Due to limited hardware and computational resources, the analysis was done on a small random sample; the same sampled data will be used to build the Shiny app. A minimal sketch of the planned next-word lookup is shown after this list.
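
Below is a minimal sketch of the planned next-word lookup: a simple backoff over the n-gram frequency tables built in the Word Analysis section. The helper names (ngram_freq, predict_next) are hypothetical and not part of any package, and the final Shiny app may use smoothed probabilities rather than raw counts.

# Minimal sketch of a backoff-style next-word lookup built from the quanteda
# n-gram dfms created above. Column names of those dfms are underscore-joined
# tokens, e.g. "happy_new_year".
ngram_freq <- function(ngram_dfm) {
    sort(colSums(ngram_dfm), decreasing = TRUE)   # named frequency vector
}

tri_freq <- ngram_freq(twitter_trigram.dfm)
bi_freq  <- ngram_freq(twitter_bigram.dfm)
uni_freq <- ngram_freq(twitter_unigram.dfm)

# Suggest candidate next words for a two-word input, backing off from
# trigrams to bigrams to the most frequent unigrams.
predict_next <- function(w1, w2, n = 3) {
    tri_hits <- tri_freq[grepl(paste0("^", w1, "_", w2, "_"), names(tri_freq))]
    if (length(tri_hits) > 0) return(head(sub(".*_", "", names(tri_hits)), n))
    bi_hits <- bi_freq[grepl(paste0("^", w2, "_"), names(bi_freq))]
    if (length(bi_hits) > 0) return(head(sub(".*_", "", names(bi_hits)), n))
    head(names(uni_freq), n)
}

predict_next("happy", "new")   # "year" should appear among the top candidates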

Feedback:

  • Looking forward to feedback and suggestions to improve the analysis.