Background

This is a milestone report for the Week 2 peer-graded assignment of the Data Science Capstone course from Coursera. The objectives of this document are as follows:

1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings that you amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

This report will also serve as a basis for the next assignment report, so it should be as clear and concise as possible. Its content is structured into five sections following the objectives above.

Fetching and Loading the Training Dataset

This is the training dataset to get started with; it will be the basis for most of the capstone. The data must be downloaded from the link below, not from external websites.

https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

Load Packages

library(quanteda)
## Package version: 3.3.1
## Unicode version: 15.1
## ICU version: 74.1
## Parallel computing: 2 of 2 threads used.
## See https://quanteda.io for tutorials and examples.
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following objects are masked from 'package:quanteda':
## 
##     meta, meta<-
## 
## Attaching package: 'tm'
## The following object is masked from 'package:quanteda':
## 
##     stopwords
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tokenizers)
library(stringi)
library(wordcloud)
## Loading required package: RColorBrewer

Download and import dataset
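
The archive is not part of this report, so it has to be fetched and unpacked once before the text files can be read. A minimal sketch, assuming the Coursera zip layout in which the English files sit under final/en_US/ and are extracted next to this report:

    ## Download the archive once and unpack the English files (skipped if already present)
    zip_url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
    zip_file <- "Coursera-SwiftKey.zip"
    if (!file.exists("en_US.news.txt")) {
        download.file(zip_url, zip_file, mode = "wb")
        unzip(zip_file, junkpaths = TRUE,
              files = c("final/en_US/en_US.news.txt",
                        "final/en_US/en_US.blogs.txt",
                        "final/en_US/en_US.twitter.txt"))
    }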

    ## Read each corpus file line by line, skipping embedded NUL characters
    con <- file("en_US.news.txt", open="r")
    En_US_NEWS_text <- readLines(con, skipNul = TRUE); close(con)
## Warning in readLines(con, skipNul = TRUE): incomplete final line found on
## 'en_US.news.txt'
    con <- file("en_US.blogs.txt", open="r")
    En_US_blogs_text <- readLines(con, skipNul = TRUE); close(con) 

    con <- file("en_US.twitter.txt", open="r")
    En_Twit_text <- readLines(con, skipNul = TRUE); close(con)
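
The "incomplete final line" warning above is a hint that the text-mode read may have stopped early: en_US.news.txt is known to contain an embedded SUB (0x1A) control character that terminates text-mode connections on some platforms. A common workaround, assuming the files are UTF-8 encoded, is to read the connection in binary mode, which avoids both the warning and the early stop:

    ## Reading in binary mode avoids the incomplete-final-line warning
    con <- file("en_US.news.txt", open = "rb")
    En_US_NEWS_text <- readLines(con, encoding = "UTF-8", skipNul = TRUE); close(con)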

Summary Statistics

file_stat<- function(text_file, lines) {
    f_size <- file.info(text_file)[1]/1024^2        ## file size in MB
    nchars <- lapply(lines, nchar)                  ## characters per line
    maxchars <- which.max(nchars)                   ## line number of the longest line (not its length)
    word_count <- sum(sapply(strsplit(lines, "\\s+"), length))   ## rough word count, splitting on whitespace
    return(c(text_file, format(round(as.double(f_size), 2), nsmall=2), length(lines), maxchars, word_count))
}

    En_US_news_stat<- file_stat("en_US.news.txt", En_US_NEWS_text)
    En_US_blogs_stat <- file_stat("en_US.blogs.txt", En_US_blogs_text)
    En_Twit_text_stat<- file_stat("en_US.twitter.txt", En_Twit_text)

    test_summary <- c(En_US_news_stat, En_US_blogs_stat,En_Twit_text_stat)

    df <- data.frame(matrix(unlist(test_summary), nrow=3, byrow=T))
    colnames(df) <- c("Text_file", "Size(MB)", "Line_Count", "Longest_Line_No", "Words_Count")
    df
##           Text_file Size(MB) Line_Count Longest_Line_No Words_Count
## 1    en_US.news.txt   196.28      77259           14556     2643969
## 2   en_US.blogs.txt   200.42     899288          483415    37334131
## 3 en_US.twitter.txt   159.36    2360148         1105776    30373583
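
The Longest_Line_No column records the position of the longest line in each file, because which.max() returns an index rather than a length. If the actual maximum line length is of interest, it can be computed directly, for example with stringi (already loaded):

    ## Length, in characters, of the longest line in each file
    sapply(list(news = En_US_NEWS_text, blogs = En_US_blogs_text, twitter = En_Twit_text),
           function(x) max(stri_length(x)))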

Exploratory data analysis:

Below are the functions used to build the test-data corpus, clean it, and extract the high-frequency words.

make_Corpus<- function(test_file) {
    gen_corp<- paste(test_file, collapse=" ")   ## collapse all lines into a single document
    gen_corp <- VectorSource(gen_corp)
    gen_corp <- Corpus(gen_corp)                ## build a tm corpus holding that one document
}
    
clean_corp <- function(corp_data) {
    corp_data <- tm_map(corp_data, removeNumbers)                      ## drop digits
    corp_data <- tm_map(corp_data, content_transformer(tolower))       ## lower-case everything
    corp_data <- tm_map(corp_data, removeWords, stopwords("english"))  ## remove English stop words
    corp_data <- tm_map(corp_data, removePunctuation)                  ## drop punctuation
    corp_data <- tm_map(corp_data, stripWhitespace)                    ## collapse repeated whitespace
    return (corp_data)
}

high_freq_words <- function (corp_data) {
    term_sparse <- DocumentTermMatrix(corp_data)
    term_matrix <- as.matrix(term_sparse)   ## convert our term-document-matrix into a normal matrix
    freq_words <- colSums(term_matrix)
    freq_words <- as.data.frame(sort(freq_words, decreasing=TRUE))
    freq_words$word <- rownames(freq_words)
    colnames(freq_words) <- c("Frequency","word")
    return (freq_words)
}

Bar Charts of High-Frequency Words:

This section applies the functions above to a random 10% sample of each file and plots the fifteen most frequent words.
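
Because the charts are built from random samples, the exact counts will vary from run to run; setting a seed (any value) before the sampling steps makes the figures reproducible:

    set.seed(1234)   ## arbitrary seed so the samples drawn below are reproducible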

## en_US.news.txt High frequency words 
    En_US_NEWS_text1<-sample(En_US_NEWS_text, round(0.1*length(En_US_NEWS_text)), replace = F)
    US_news_corpus <- make_Corpus(En_US_NEWS_text1)
    US_news_corpus <- clean_corp(US_news_corpus)
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
    US_news_most_used_word <- high_freq_words(US_news_corpus)
    US_news_most_used_word1<- US_news_most_used_word[1:15,]

    p<-ggplot(data=US_news_most_used_word1,aes(x=reorder(word,Frequency),y=Frequency ,fill=as.numeric(reorder(word,-Frequency))))+ geom_bar(stat="identity") 
    p + xlab("Word") +labs(title = "Most Frequent Words in US News") +theme(legend.position = "none") + coord_flip()

## en_US.blogs.txt High frequency words 
    En_US_blogs_text1<-sample(En_US_blogs_text, round(0.1*length(En_US_blogs_text)), replace = F)
    US_blogs_corpus <- make_Corpus(En_US_blogs_text1)
    US_blogs_corpus <- clean_corp(US_blogs_corpus)
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
    US_blogs_most_used_word <- high_freq_words(US_blogs_corpus)
    US_blogs_most_used_word1<- US_blogs_most_used_word[1:15,]

    p<-ggplot(data=US_blogs_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
                    fill=as.numeric(reorder(word,-Frequency))))+ geom_bar(stat="identity") 
    p + xlab("Word") +labs(title = "Most Frequent Words in US blogs") +theme(legend.position = "none") + coord_flip()

## en_US.twitter.txt High frequency words 
    En_Twit_text1<-sample(En_Twit_text, round(0.1*length(En_Twit_text)), replace = F)
    twitter_corpus <- make_Corpus(En_Twit_text1)
    twitter_corpus <- clean_corp(twitter_corpus)
## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation
## drops documents
    twitter_most_used_word <- high_freq_words(twitter_corpus)
    twitter_most_used_word1<- twitter_most_used_word[1:15,]

    p<-ggplot(data=twitter_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
                    fill=as.numeric(reorder(word,-Frequency))))+ geom_bar(stat="identity") 
    p + xlab("Word") +labs(title = "Most Frequent Words in Twitter") +theme(legend.position = "none") + coord_flip()

Using Wordcloud to Visualize Frequent Words

## US News Word Cloud
    wordcloud(US_news_most_used_word$word[1:100], US_news_most_used_word$Frequency[1:100],
              colors=brewer.pal(8, "Dark2"))

## US Blogs Word Cloud
    wordcloud(US_blogs_most_used_word$word[1:100], US_blogs_most_used_word$Frequency[1:100],
              colors=brewer.pal(8, "Dark2"))

## Twitter Word Cloud
    wordcloud(twitter_most_used_word$word[1:100], twitter_most_used_word$Frequency[1:100],
              colors=brewer.pal(8, "Dark2"))
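
Subsetting to the first 100 rows works because the frequency tables are already sorted. Alternatively, wordcloud() can do the capping itself through its max.words and random.order arguments, for example:

    ## Alternative call: let wordcloud() pick the top 100 words itself
    wordcloud(words = US_news_most_used_word$word, freq = US_news_most_used_word$Frequency,
              max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))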

Word Analysis:

For the word analysis of the text documents we create bag-of-words matrices from unigrams, bigrams, and trigrams. These n-gram models improve the predictive power of the analysis.
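
The dfm() calls below emit deprecation warnings because stop-word removal via the remove argument has moved to dfm_remove() in recent quanteda versions. The results are unaffected; a warning-free equivalent for the unigram case (the same pattern applies to the bigram and trigram matrices) would be:

    ## Warning-free equivalent: build the dfm first, then drop stop words explicitly
    US_News_unigram.dfm <- dfm_remove(dfm(US_News_unigram, tolower = TRUE),
                                      pattern = stopwords("english"))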

## en_US.news.txt High frequency words    
    En_US_NEWS_text1<-sample(En_US_NEWS_text, round(0.01*length(En_US_NEWS_text)), replace = F)
    US_News_tokens<- quanteda::tokens(En_US_NEWS_text1,what ="word", remove_numbers = TRUE, 
                            remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
    US_News_tokens <- tokens_tolower(US_News_tokens)
    US_News_tokens <- tokens_select(US_News_tokens, stopwords(),selection ="remove")

    US_News_unigram <- tokens_ngrams(US_News_tokens, n=1)  ## unigram
    US_News_unigram.dfm <- dfm(US_News_unigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)    
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
    US_News_bigram <- tokens_ngrams(US_News_tokens, n=2)  ## bigram
    US_News_bigram.dfm <- dfm(US_News_bigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.

## Warning: 'remove' is deprecated; use dfm_remove() instead
    US_News_trigram <- tokens_ngrams(US_News_tokens, n=3)  ## trigram
    US_News_trigram.dfm <- dfm(US_News_trigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.

## Warning: 'remove' is deprecated; use dfm_remove() instead
    topfeatures(US_News_unigram.dfm, 10)  # 10 top US News Unigram words
##   said   will    one    can   like   also people  state   year    new 
##    173     96     76     51     49     46     43     41     40     40
   topfeatures(US_News_bigram.dfm, 10)  # 10 top US News Bigram words
##      los_angeles         st_louis        right_now      high_school 
##               11               10                8                8 
##        last_year        years_ago         new_york associated_press 
##                7                6                6                5 
##       looks_like  school_district 
##                4                4
 topfeatures(US_News_trigram.dfm, 10)  # 10 top US News Trigram words
##       cents_per_gallon crested_butte_mountain     first_quarter_year 
##                      3                      3                      2 
##        st_louis_county        rain_barrel_use      barrel_use_tubing 
##                      2                      2                      2 
##  butte_mountain_guides     los_angeles_police  returned_state_police 
##                      2                      2                      2 
##  treasurer_andy_dillon 
##                      2
## en_US.blog.txt High frequency words
    En_US_blogs_text1<-sample(En_US_blogs_text, round(0.02*length(En_US_blogs_text)), replace = F)
    US_blogs_tokens<-quanteda::tokens(En_US_blogs_text1,what ="word", remove_numbers = TRUE, 
                            remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
    US_blogs_tokens <- tokens_tolower(US_blogs_tokens)
    US_blogs_tokens <- tokens_select(US_blogs_tokens, stopwords(),selection ="remove")

    US_blogs_unigram <- tokens_ngrams(US_blogs_tokens, n=1)  ## unigram
    US_blogs_unigram.dfm <- dfm(US_blogs_unigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)    
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
    US_blogs_bigram <- tokens_ngrams(US_blogs_tokens, n=2)  ## bigram
    US_blogs_bigram.dfm <- dfm(US_blogs_bigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.

## Warning: 'remove' is deprecated; use dfm_remove() instead
    US_blogs_trigram <- tokens_ngrams(US_blogs_tokens, n=3)  ## trigram
    US_blogs_trigram.dfm <- dfm(US_blogs_trigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.

## Warning: 'remove' is deprecated; use dfm_remove() instead
    topfeatures(US_blogs_unigram.dfm, 20)  # 20 top US blogs Unigram words
##    one   will    can   just   like   time    get   know people    now    new 
##   2561   2256   1970   1934   1866   1834   1416   1260   1206   1182   1095 
##    day   also  first   even   make     us  think   much really 
##   1086   1079   1064   1063   1030   1025   1012   1012   1009
topfeatures(US_blogs_bigram.dfm, 20)  # 20 top US blogs Bigram words
##  first_time   right_now   years_ago even_though   feel_like    new_york 
##         100          96          96          86          85          82 
##  don’t_know     can_see   last_year   make_sure   every_day   last_week 
##          82          78          77          74          71          69 
##     one_day high_school   one_thing  last_night don’t_think  every_time 
##          67          63          62          58          57          54 
##     can_get many_people 
##          53          52
topfeatures(US_blogs_trigram.dfm, 20)  # 20 top US blogs Trigram words
##                fake_fake_fake            bmw_service_center 
##                            33                            22 
##     service_center_california       south_carolina_mortgage 
##                            22                            13 
## carolina_mortgage_refinancing          north_dakota_maquest 
##                            13                            12 
##                new_york_times                 new_york_city 
##                            10                            10 
##           amazon_services_llc           services_llc_amazon 
##                            10                            10 
##                 llc_amazon_eu              couple_weeks_ago 
##                            10                             9 
##                  world_war_ii                     horse_c_k 
##                             8                             8 
##             cedar_rapids_iowa               please_let_know 
##                             8                             7 
##               spend_much_time              please_feel_free 
##                             7                             7 
##              dhs_cedar_rapids                just_make_sure 
##                             7                             6
## en_US.twitter.txt Ngram words 
    En_Twit_text1<-sample(En_Twit_text, round(0.02*length(En_Twit_text)), replace = F)
    twitter_tokens<- quanteda::tokens(En_Twit_text1,what ="word", remove_numbers = TRUE, 
                            remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
    twitter_tokens <- tokens_tolower(twitter_tokens)
    twitter_tokens <- tokens_select(twitter_tokens, stopwords(),selection ="remove")

    twitter_unigram <- tokens_ngrams(twitter_tokens, n=1)  ## unigram
    twitter_unigram.dfm <- dfm(twitter_unigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)    
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.
## Warning: 'remove' is deprecated; use dfm_remove() instead
    twitter_bigram <- tokens_ngrams(twitter_tokens, n=2)  ## bigram
    twitter_bigram.dfm <- dfm(twitter_bigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.

## Warning: 'remove' is deprecated; use dfm_remove() instead
    twitter_trigram <- tokens_ngrams(twitter_tokens, n=3)  ## trigram
    twitter_trigram.dfm <- dfm(twitter_trigram, tolower =TRUE, remove = stopwords("english"), 
                              remove_punct = TRUE)
## Warning: '...' should not be used for tokens() arguments; use 'tokens()' first.

## Warning: 'remove' is deprecated; use dfm_remove() instead
    topfeatures(twitter_unigram.dfm, 10)  # 10 top Unigram words
##   just   like    get   love   good   will thanks    can    day     rt 
##   3056   2390   2287   2263   1953   1893   1880   1822   1737   1713
topfeatures(twitter_bigram.dfm, 10)  # 10 top Bigram words
##       right_now      last_night looking_forward  happy_birthday    good_morning 
##             335             227             166             159             155 
##   thanks_follow       feel_like        just_got       good_luck     follow_back 
##             147             132             131             124             113
topfeatures(twitter_trigram.dfm, 10)  # 10 top Trigram words
##            let_us_know      happy_mothers_day         happy_new_year 
##                     38                     33                     29 
##          cinco_de_mayo     happy_mother's_day         love_love_love 
##                     24                     23                     20 
##            yes_yes_yes looking_forward_seeing     please_follow_back 
##                     18                     17                     15 
##         cake_cake_cake 
##                     14

Conclusion:

The final deliverable of the capstone project is a predictive algorithm deployed as a Shiny app. The app should take a phrase (multiple words) in a text box as input and output a prediction of the next word. Priorities for the final project include:

- Tokenization and bag-of-words matrices with multiple n-grams.
- Using a small sample (roughly 1 to 5%) of the data.
- Running the machine-learning step to develop the predictive model.
- Exploring options to improve the accuracy and speed of execution.
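
As a rough illustration of the n-gram lookup idea (not the final model), the sketch below suggests the next word by searching the bigram frequencies computed earlier for bigrams that start with the last word of the input phrase; predict_next_word is a hypothetical helper name.

    ## Hypothetical sketch: next-word suggestions from an existing bigram dfm
    predict_next_word <- function(phrase, bigram_dfm, n = 3) {
        last_word <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 1)
        freqs <- topfeatures(bigram_dfm, n = nfeat(bigram_dfm))           ## all bigrams, sorted by count
        hits  <- freqs[startsWith(names(freqs), paste0(last_word, "_"))]  ## bigrams starting with that word
        if (length(hits) == 0) return(character(0))
        sub(".*_", "", names(head(hits, n)))                              ## strip the prefix, keep the next words
    }
    predict_next_word("happy", twitter_bigram.dfm)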