Capstone: Milestone Report

Instructions

The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager. You should make use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to: 1. Demonstrate that you’ve downloaded the data and have successfully loaded it in. 2. Create a basic report of summary statistics about the data sets. 3. Report any interesting findings that you amassed so far. 4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Review Criteria

Does the link lead to an HTML page describing the exploratory analysis of the training data set?
Has the data scientist done basic summaries of the three files? Word counts, line counts and basic data tables?
Has the data scientist made basic plots, such as histograms to illustrate features of the data?
Was the report written in a brief, concise style, in a way that a non-data scientist manager could appreciate?

Dataset Source

Download the dataset from the following link and unzip the files in the current working directory - (https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip)

Loading Library

library(tm)

## Loading required package: NLP

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library(quanteda)

## Package version: 2.0.1

## Parallel computing: 2 of 4 threads used.

## See https://quanteda.io for tutorials and examples.

## 
## Attaching package: 'quanteda'

## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords

## The following objects are masked from 'package:NLP':
## 
##     meta, meta<-

## The following object is masked from 'package:utils':
## 
##     View

library(wordcloud)

## Loading required package: RColorBrewer

library(stats)

Loading dataset into R

data_blogs <- readLines("en_US.blogs.txt",encoding = "UTF-8", skipNul = TRUE)
data_news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
data_twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

# Summarise one text file together with its already-loaded contents.
#
# Args:
#   text_file: path of the file on disk; used for the size lookup and as the
#              label in the first field of the result.
#   lines:     character vector holding the file's lines (e.g. from readLines()).
#
# Returns:
#   A character vector of five fields, in order: file name, size in MB
#   (two decimals), number of lines, length in characters of the longest
#   line, and total count of whitespace-separated words.
file_stat <- function(text_file, lines) {
    f_size <- file.info(text_file)$size / 1024^2
    # nchar() is vectorised; allowNA = TRUE yields NA instead of an error for
    # strings that are invalid in their declared encoding.
    line_lengths <- nchar(lines, type = "chars", allowNA = TRUE)
    # The longest line is the maximum length itself. which.max() would give the
    # position of that line in `lines`, not its length.
    maxchars <- if (any(!is.na(line_lengths))) max(line_lengths, na.rm = TRUE) else 0
    word_count <- sum(lengths(strsplit(lines, "\\s+")))
    c(text_file, format(round(as.double(f_size), 2), nsmall = 2),
      length(lines), maxchars, word_count)
}

    # Collect the five summary fields for each of the three source files.
    blogs_statistic <- file_stat("en_US.blogs.txt", data_blogs)
    news_statistic <- file_stat("en_US.news.txt", data_news)
    twitter_statistic <- file_stat("en_US.twitter.txt", data_twitter)

    statistic_summary <- c(blogs_statistic, news_statistic, twitter_statistic)

    # Lay the concatenated fields out as one row per file. TRUE is spelled out
    # because the shorthand `T` is an ordinary variable that can be reassigned.
    df <- data.frame(matrix(statistic_summary, nrow = 3, byrow = TRUE))
    colnames(df) <- c("Text_file", "Size(MB)", "Line_Count", "Max Line Length", "Words_Count")
    print(df)

##           Text_file Size(MB) Line_Count Max Line Length Words_Count
## 1   en_US.blogs.txt   200.42     899288          483415    37334149
## 2    en_US.news.txt   196.28    1010242          123628    34372814
## 3 en_US.twitter.txt   159.36    2360148              26    30373605

Exploratory Data Analysis

Here I define functions to build a corpus from the sampled text, clean the corpus, and extract the high-frequency words.

# Build a tm corpus from a character vector of text lines.
# All lines are joined with single spaces into one string, so the resulting
# corpus holds a single document. The corpus is returned invisibly.
make_Corpus <- function(test_file) {
    merged_text <- paste(test_file, collapse = " ")
    invisible(Corpus(VectorSource(merged_text)))
}

    
# Normalise a tm corpus for word counting.
# The transformations run in this order: remove numbers, lower-case the text,
# remove English stop words, remove punctuation, strip extra whitespace.
clean_corp <- function(corp_data) {
    without_digits <- tm_map(corp_data, removeNumbers)
    lower_cased <- tm_map(without_digits, content_transformer(tolower))
    without_stopwords <- tm_map(lower_cased, removeWords, stopwords("english"))
    without_punct <- tm_map(without_stopwords, removePunctuation)
    tm_map(without_punct, stripWhitespace)
}

# Rank the terms of a corpus by how often they occur.
#
# Returns a data frame sorted by decreasing count with two columns:
# `Frequency` (total occurrences) and `word` (the term, also used as the
# row name).
high_freq_words <- function(corp_data) {
    dtm <- DocumentTermMatrix(corp_data)
    # Densify the document-term matrix so the per-term totals can be summed.
    totals <- colSums(as.matrix(dtm))
    ranked <- as.data.frame(sort(totals, decreasing = TRUE))
    names(ranked) <- "Frequency"
    ranked[["word"]] <- rownames(ranked)
    ranked
}

Bar Chart of High Frequency Words

This section explores several text-mining commands and extracts the high-frequency words from each data set.

## en_US.news.txt: high-frequency words
    # Fix the RNG seed so the 10% sample, and every figure derived from it,
    # is the same on each run of the report.
    set.seed(1234)
    En_US_NEWS_text1 <- sample(data_news, round(0.1 * length(data_news)), replace = FALSE)
    # Build a single-document corpus from the sample, then clean it.
    US_news_corpus <- make_Corpus(En_US_NEWS_text1)
    US_news_corpus <- clean_corp(US_news_corpus)

## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents

## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents

## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents

## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents

## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation drops
## documents

    US_news_most_used_word <- high_freq_words(US_news_corpus)
    # head() returns at most 15 rows; `[1:15, ]` would pad the table with NA
    # rows if fewer than 15 distinct words were available.
    US_news_most_used_word1 <- head(US_news_most_used_word, 15)

    # Horizontal bar chart: bars are ordered by frequency and, after
    # coord_flip(), the most frequent word ends up at the top.
    p <- ggplot(data = US_news_most_used_word1,
                aes(x = reorder(word, Frequency), y = Frequency,
                    fill = factor(reorder(word, -Frequency)))) +
        geom_bar(stat = "identity")
    p + xlab("Word") + labs(title = "Most Frequent words : US News") +
        theme(legend.title = element_blank()) + coord_flip()

## en_US.blogs.txt: high-frequency words
    # Fix the RNG seed so the 10% sample, and every figure derived from it,
    # is the same on each run of the report.
    set.seed(1234)
    En_US_blogs_text1 <- sample(data_blogs, round(0.1 * length(data_blogs)), replace = FALSE)
    # Build a single-document corpus from the sample, then clean it.
    US_blogs_corpus <- make_Corpus(En_US_blogs_text1)
    US_blogs_corpus <- clean_corp(US_blogs_corpus)

## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents

## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents

## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents

## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents

## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation drops
## documents

    US_blogs_most_used_word <- high_freq_words(US_blogs_corpus)
    # head() returns at most 15 rows; `[1:15, ]` would pad the table with NA
    # rows if fewer than 15 distinct words were available.
    US_blogs_most_used_word1 <- head(US_blogs_most_used_word, 15)

    # Horizontal bar chart: bars are ordered by frequency and, after
    # coord_flip(), the most frequent word ends up at the top.
    p <- ggplot(data = US_blogs_most_used_word1,
                aes(x = reorder(word, Frequency), y = Frequency,
                    fill = factor(reorder(word, -Frequency)))) +
        geom_bar(stat = "identity")
    p + xlab("Word") + labs(title = "Most Frequent words : US blogs") +
        theme(legend.title = element_blank()) + coord_flip()

## en_US.twitter.txt: high-frequency words
    # Fix the RNG seed so the 10% sample, and every figure derived from it,
    # is the same on each run of the report.
    set.seed(1234)
    En_Twit_text1 <- sample(data_twitter, round(0.1 * length(data_twitter)), replace = FALSE)
    # Build a single-document corpus from the sample, then clean it.
    twitter_corpus <- make_Corpus(En_Twit_text1)
    twitter_corpus <- clean_corp(twitter_corpus)

## Warning in tm_map.SimpleCorpus(corp_data, removeNumbers): transformation drops
## documents

## Warning in tm_map.SimpleCorpus(corp_data, content_transformer(tolower)):
## transformation drops documents

## Warning in tm_map.SimpleCorpus(corp_data, removeWords, stopwords("english")):
## transformation drops documents

## Warning in tm_map.SimpleCorpus(corp_data, removePunctuation): transformation
## drops documents

## Warning in tm_map.SimpleCorpus(corp_data, stripWhitespace): transformation drops
## documents

    twitter_most_used_word <- high_freq_words(twitter_corpus)
    # head() returns at most 15 rows; `[1:15, ]` would pad the table with NA
    # rows if fewer than 15 distinct words were available.
    twitter_most_used_word1 <- head(twitter_most_used_word, 15)

    # Horizontal bar chart: bars are ordered by frequency and, after
    # coord_flip(), the most frequent word ends up at the top.
    p <- ggplot(data = twitter_most_used_word1,
                aes(x = reorder(word, Frequency), y = Frequency,
                    fill = factor(reorder(word, -Frequency)))) +
        geom_bar(stat = "identity")
    p + xlab("Word") + labs(title = "Most Frequent words : Twitter") +
        theme(legend.title = element_blank()) + coord_flip()

Generating Word Clouds

A word cloud is a compact visual representation in which each word is sized according to its frequency.

## US News word cloud
    # head() takes at most the 100 most frequent words; `[1:100]` would pad
    # with NA entries if a table held fewer than 100 words.
    wordcloud(head(US_news_most_used_word$word, 100),
              head(US_news_most_used_word$Frequency, 100),
              colors = brewer.pal(8, "Dark2"))

## US Blogs word cloud
    wordcloud(head(US_blogs_most_used_word$word, 100),
              head(US_blogs_most_used_word$Frequency, 100),
              colors = brewer.pal(8, "Dark2"))

## US Twitter word cloud
    wordcloud(head(twitter_most_used_word$word, 100),
              head(twitter_most_used_word$Frequency, 100),
              colors = brewer.pal(8, "Dark2"))

Word Analysis

To analyze the text documents we need to build bag-of-words matrices of unigrams, bigrams, and trigrams. These n-gram models improve the predictive power of the analysis.

## en_US.news.txt: n-gram frequencies
    # Fix the RNG seed so the 1% sample and the tables below are reproducible.
    set.seed(1234)
    En_US_NEWS_text1 <- sample(data_news, round(0.01 * length(data_news)), replace = FALSE)
    # Tokenise into words, dropping numbers, punctuation, separators and
    # symbols, then lower-case and remove English stop words.
    US_News_tokens <- tokens(En_US_NEWS_text1, what = "word", remove_numbers = TRUE,
                             remove_punct = TRUE, remove_separators = TRUE,
                             remove_symbols = TRUE)
    US_News_tokens <- tokens_tolower(US_News_tokens)
    US_News_tokens <- tokens_select(US_News_tokens, stopwords("english"), selection = "remove")

    # The tokens are already lower-cased and stripped of stop words and
    # punctuation, so dfm() only has to count them. Repeating those options in
    # dfm() has no effect here: an n-gram joined by "_" never equals a stop word.
    US_News_unigram <- tokens_ngrams(US_News_tokens, n = 1)  ## unigram
    US_News_unigram.dfm <- dfm(US_News_unigram)

    US_News_bigram <- tokens_ngrams(US_News_tokens, n = 2)  ## bigram
    US_News_bigram.dfm <- dfm(US_News_bigram)

    US_News_trigram <- tokens_ngrams(US_News_tokens, n = 3)  ## trigram
    US_News_trigram.dfm <- dfm(US_News_trigram)
    topfeatures(US_News_unigram.dfm, 20)  # top 20 US News unigrams

##   said    one    new    can    two   also   year   just   time  state   last 
##   2474    822    746    610    575    571    548    533    504    503    496 
##  years   like  first    get people    now   city  three school 
##    493    477    459    449    440    358    356    355    343

  topfeatures(US_News_bigram.dfm, n = 20)  # top 20 US News bigrams

##       new_york      last_year    high_school       st_louis      last_week 
##            119            117             88             84             81 
##     new_jersey      years_ago      two_years  united_states    health_care 
##             75             71             54             53             51 
##    los_angeles     little_bit     first_time      right_now vice_president 
##             49             38             36             35             34 
##  san_francisco     five_years    police_said    even_though officials_said 
##             33             31             30             29             28

 topfeatures(US_News_trigram.dfm, n = 20)  # top 20 US News trigrams

##             new_york_city    president_barack_obama             two_years_ago 
##                        17                        13                        13 
##           st_louis_county            new_york_times           three_years_ago 
##                        13                         8                         8 
##          first_time_since            last_two_years   county_sheriff's_office 
##                         7                         7                         7 
##           points_per_game           cents_per_share           health_care_law 
##                         7                         7                         6 
##      dow_jones_industrial  jones_industrial_average     told_associated_press 
##                         6                         6                         6 
##      high_school_students attorney_general's_office               rock_n_roll 
##                         6                         5                         5 
##  president_barack_obama's            late_last_year 
##                         5                         5

## en_US.blogs.txt: n-gram frequencies
    # Fix the RNG seed so the 2% sample and the tables below are reproducible.
    set.seed(1234)
    En_US_blogs_text1 <- sample(data_blogs, round(0.02 * length(data_blogs)), replace = FALSE)
    # Tokenise into words, dropping numbers, punctuation, separators and
    # symbols, then lower-case and remove English stop words.
    US_blogs_tokens <- tokens(En_US_blogs_text1, what = "word", remove_numbers = TRUE,
                              remove_punct = TRUE, remove_separators = TRUE,
                              remove_symbols = TRUE)
    US_blogs_tokens <- tokens_tolower(US_blogs_tokens)
    US_blogs_tokens <- tokens_select(US_blogs_tokens, stopwords("english"), selection = "remove")

    # The tokens are already lower-cased and stripped of stop words and
    # punctuation, so dfm() only has to count them. Repeating those options in
    # dfm() has no effect here: an n-gram joined by "_" never equals a stop word.
    US_blogs_unigram <- tokens_ngrams(US_blogs_tokens, n = 1)  ## unigram
    US_blogs_unigram.dfm <- dfm(US_blogs_unigram)

    US_blogs_bigram <- tokens_ngrams(US_blogs_tokens, n = 2)  ## bigram
    US_blogs_bigram.dfm <- dfm(US_blogs_bigram)

    US_blogs_trigram <- tokens_ngrams(US_blogs_tokens, n = 3)  ## trigram
    US_blogs_trigram.dfm <- dfm(US_blogs_trigram)
    topfeatures(US_blogs_unigram.dfm, 20)  # top 20 US blogs unigrams

##    one    can   just   like   time    get people   know    now   also    day 
##   2642   2009   2002   1970   1682   1434   1234   1199   1169   1114   1058 
##    new   even   make  first     us   good    see   much  think 
##   1048   1045   1024   1019   1015   1006    985    964    960

 topfeatures(US_blogs_bigram.dfm, n = 20)  # top 20 US blogs bigrams

##   right_now    new_york   years_ago even_though   every_day   feel_like 
##         103         101          96          95          92          91 
##     can_see   last_week  first_time   make_sure     one_day   last_year 
##          82          79          77          77          74          70 
##  little_bit many_people   just_like  last_night   one_thing     can_get 
##          69          63          62          61          60          60 
##   long_time high_school 
##          53          52

topfeatures(US_blogs_trigram.dfm, n = 20)  # top 20 US blogs trigrams

##          new_york_city         new_york_times       couple_weeks_ago 
##                     22                     12                     10 
##          level_mp_cost   preheat_oven_degrees           world_war_ii 
##                      9                      8                      8 
##    amazon_services_llc    services_llc_amazon          llc_amazon_eu 
##                      8                      8                      8 
##           new_york_n.y    paintball_gun_cases      north_dakota_wine 
##                      8                      8                      7 
## dakota_wine_restaurant         spent_lot_time            let_us_know 
##                      7                      7                      7 
##               bmix_j_k       please_feel_free        please_let_know 
##                      7                      6                      6 
##       mp_cost_duration  cost_duration_seconds 
##                      6                      6

## en_US.twitter.txt: n-gram frequencies
    # Fix the RNG seed so the 2% sample and the tables below are reproducible.
    set.seed(1234)
    En_Twit_text1 <- sample(data_twitter, round(0.02 * length(data_twitter)), replace = FALSE)
    # Tokenise into words, dropping numbers, punctuation, separators and
    # symbols, then lower-case and remove English stop words.
    twitter_tokens <- tokens(En_Twit_text1, what = "word", remove_numbers = TRUE,
                             remove_punct = TRUE, remove_separators = TRUE,
                             remove_symbols = TRUE)
    twitter_tokens <- tokens_tolower(twitter_tokens)
    twitter_tokens <- tokens_select(twitter_tokens, stopwords("english"), selection = "remove")

    # The tokens are already lower-cased and stripped of stop words and
    # punctuation, so dfm() only has to count them. Repeating those options in
    # dfm() has no effect here: an n-gram joined by "_" never equals a stop word.
    twitter_unigram <- tokens_ngrams(twitter_tokens, n = 1)  ## unigram
    twitter_unigram.dfm <- dfm(twitter_unigram)

    twitter_bigram <- tokens_ngrams(twitter_tokens, n = 2)  ## bigram
    twitter_bigram.dfm <- dfm(twitter_bigram)

    twitter_trigram <- tokens_ngrams(twitter_tokens, n = 3)  ## trigram
    twitter_trigram.dfm <- dfm(twitter_trigram)
    topfeatures(twitter_unigram.dfm, 20)  # top 20 Twitter unigrams

##   just   like   love    get   good    can     rt thanks    day    now   know 
##   3024   2511   2149   2134   2004   1835   1813   1799   1765   1634   1621 
##    one      u  great  today   time     go    lol    new    see 
##   1604   1539   1516   1512   1462   1427   1393   1361   1335

    topfeatures(twitter_bigram.dfm, n = 20)  # top 20 Twitter bigrams

##       right_now      last_night looking_forward  happy_birthday    good_morning 
##             333             228             178             160             150 
##        just_got       feel_like   thanks_follow      looks_like       good_luck 
##             142             136             133             124             121 
##         can_get        let_know     follow_back       next_week       make_sure 
##             121             119             112              85              83 
##        just_saw     sounds_like         join_us       just_like       thanks_rt 
##              79              77              77              75              74

topfeatures(twitter_trigram.dfm, n = 20)  # top 20 Twitter trigrams

##            let_us_know      happy_mothers_day     happy_mother's_day 
##                     50                     40                     35 
##         happy_new_year            omg_omg_omg looking_forward_seeing 
##                     33                     24                     17 
##         jobs_jobs_jobs   please_please_please    ralph_waldo_emerson 
##                     15                     14                     13 
##          cinco_de_mayo       make_feel_better            come_see_us 
##                     13                     12                     12 
##         cake_cake_cake           come_join_us    hope_everyone_great 
##                     12                     11                     11 
##          just_got_back         hope_great_day         love_love_love 
##                     11                     11                     10 
##     follow_back_please           let_know_can 
##                     10                     10

Future Works

The goal is to create a predictive model that predicts the most probable words to follow an input from the user. This model will be evaluated and deployed as a Shiny application.