This milestone report is submitted for the Week 2 assignment of the Coursera Data Science Capstone.
I loaded the following libraries for the exploratory data analysis:
library(ggplot2)      # plotting
library(R.utils)      # file utilities
library(quanteda)     # tokenization and n-grams
library(RColorBrewer) # word cloud colours
library(plyr)         # data ordering
library(dplyr)        # data manipulation
library(magrittr)     # pipe operator
library(stringr)      # string helpers
library(stringi)      # fast string operations
library(tm)           # corpus creation and cleaning
library(rJava)        # Java bridge required by RWeka
library(RWekajars)    # Weka jar files
library(RWeka)        # n-gram tokenizers
library(SnowballC)    # stemming
library(wordcloud)    # word clouds
The dataset used for the analysis is available at this URL: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
For demonstration purposes, we use the English-language text data.
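If the archive has not been downloaded yet, one way to fetch and unpack it is sketched below (an optional step, not part of the run above; it assumes the zip extracts into the final/ directory used next):
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!dir.exists("final")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")   # unpacks the final/<locale> folders
}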
setwd("./final/en_US")
con <- file("en_US.news.txt", open="r")
En_US_NEWS_text <- readLines(con); close(con)
con <- file("en_US.blogs.txt", open="r")
En_US_blogs_text <- readLines(con); close(con)
con <- file("en_US.twitter.txt", open="r")
En_Twit_text <- readLines(con); close(con)
list.files("./final/en_US")
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
The following helper computes summary statistics (size, line count, longest line, word count) for each text file:
setwd("./final/en_US")
file_stat <- function(text_file, lines) {
  f_size <- file.info(text_file)$size / 1024^2                 # file size in MB
  nchars <- sapply(lines, nchar)                               # characters per line
  longest_line <- which.max(nchars)                            # line number of the longest line
  word_count <- sum(sapply(strsplit(lines, "\\s+"), length))   # whitespace-delimited words
  return(c(text_file, format(round(as.double(f_size), 2), nsmall=2),
           length(lines), longest_line, word_count))
}
En_US_news_stat<- file_stat("en_US.news.txt", En_US_NEWS_text)
En_US_blogs_stat <- file_stat("en_US.blogs.txt", En_US_blogs_text)
En_Twit_text_stat<- file_stat("en_US.twitter.txt", En_Twit_text)
test_summary <- c(En_US_news_stat, En_US_blogs_stat, En_Twit_text_stat)
df <- data.frame(matrix(unlist(test_summary), nrow=3, byrow=T))
colnames(df) <- c("Text_file", "Size(MB)", "Line_Count", "Longest_Line_Index", "Words_Count")
print(df)
##           Text_file Size(MB) Line_Count Longest_Line_Index Words_Count
## 1    en_US.news.txt   196.28      77259              14556     2643972
## 2   en_US.blogs.txt   200.42     899288             483415    37334441
## 3 en_US.twitter.txt   159.36    2360148            1484357    30373792
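As a quick cross-check of the word counts, stringi (already loaded) can count words per line without the strsplit() loop; its ICU-based word breaks differ slightly from a whitespace split, so the totals should be close but not identical:
sum(stri_count_words(En_US_blogs_text))   # expected to be close to the Words_Count above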
Next, we create a corpus from the sampled text, clean it, and count the most frequent words:
make_Corpus <- function(test_file) {
  gen_corp <- paste(test_file, collapse=" ")   # collapse lines into one document
  gen_corp <- VectorSource(gen_corp)
  gen_corp <- Corpus(gen_corp)
  return(gen_corp)
}
clean_corp <- function(corp_data) {
corp_data <- tm_map(corp_data, removeNumbers)
corp_data <- tm_map(corp_data, content_transformer(tolower))
corp_data <- tm_map(corp_data, removeWords, stopwords("english"))
corp_data <- tm_map(corp_data, removePunctuation)
corp_data <- tm_map(corp_data, stripWhitespace)
return (corp_data)
}
high_freq_words <- function (corp_data) {
term_sparse <- DocumentTermMatrix(corp_data)
term_matrix <- as.matrix(term_sparse) ## convert our term-document-matrix into a normal matrix
freq_words <- colSums(term_matrix)
freq_words <- as.data.frame(sort(freq_words, decreasing=TRUE))
freq_words$word <- rownames(freq_words)
colnames(freq_words) <- c("Frequency","word")
return (freq_words)
}
Next, we plot bar charts of the most frequent words, starting with a 10% sample of the news data:
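Because sample() draws a random subset, fixing a seed first keeps the charts reproducible across runs (the seed value below is arbitrary, not from the original run):
set.seed(1234)   # arbitrary seed chosen only for reproducibility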
En_US_NEWS_text1<-sample(En_US_NEWS_text, round(0.1*length(En_US_NEWS_text)), replace = F)
US_news_corpus <- make_Corpus(En_US_NEWS_text1)
US_news_corpus <- clean_corp(US_news_corpus)
US_news_most_used_word <- high_freq_words(US_news_corpus)
US_news_most_used_word1<- US_news_most_used_word[1:15,]
p<-ggplot(data=US_news_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Word") +labs(title = "Most Frequent words : US News") +theme(legend.title=element_blank()) + coord_flip()
## en_US.blogs.txt High frequency words
En_US_blogs_text1<-sample(En_US_blogs_text, round(0.1*length(En_US_blogs_text)), replace = F)
US_blogs_corpus <- make_Corpus(En_US_blogs_text1)
US_blogs_corpus <- clean_corp(US_blogs_corpus)
US_blogs_most_used_word <- high_freq_words(US_blogs_corpus)
US_blogs_most_used_word1<- US_blogs_most_used_word[1:15,]
p<-ggplot(data=US_blogs_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Word") +labs(title = "Most Frequent words : US blogs") +theme(legend.title=element_blank()) + coord_flip()
## en_US.twitter.txt High frequency words
En_Twit_text1<-sample(En_Twit_text, round(0.1*length(En_Twit_text)), replace = F)
twitter_corpus <- make_Corpus(En_Twit_text1)
twitter_corpus <- clean_corp(twitter_corpus)
twitter_most_used_word <- high_freq_words(twitter_corpus)
twitter_most_used_word1<- twitter_most_used_word[1:15,]
p<-ggplot(data=twitter_most_used_word1, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Word") +labs(title = "Most Frequent words : Twitter") +theme(legend.title=element_blank()) + coord_flip()
## US News Word Cloud
wordcloud(US_news_most_used_word$word[1:100], US_news_most_used_word$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
## US Twitter Word Cloud
wordcloud(twitter_most_used_word$word[1:100], twitter_most_used_word$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(twitter_most_used_word$word[1:100],
## twitter_most_used_word$Frequency[1:100], : thanks could not be fit on page. It
## will not be plotted.
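The warning only means the largest word did not fit at the default size; narrowing the scale range usually resolves it (the values below are a guess, not from the original run):
wordcloud(twitter_most_used_word$word[1:100], twitter_most_used_word$Frequency[1:100],
          scale = c(3, 0.5), colors = brewer.pal(8, "Dark2"))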
## US Blogs Word Cloud
wordcloud(US_blogs_most_used_word$word[1:100], US_blogs_most_used_word$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
For the text analysis we build bag-of-words matrices of unigrams, bigrams, and trigrams. These n-gram models improve the predictive power of the analysis and will form the basis of the word-prediction model.
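To illustrate how such counts can drive prediction, here is a minimal sketch (predict_next is a hypothetical helper, not part of the pipeline below) that looks up the most frequent continuations of a word in a bigram document-feature matrix such as the US_News_bigram.dfm built next:
predict_next <- function(bigram_dfm, word, top_n = 3) {
  counts  <- colSums(bigram_dfm)                       # frequency of each bigram feature
  parts   <- strsplit(names(counts), "_", fixed = TRUE)
  firsts  <- vapply(parts, `[`, character(1), 1)       # first token of each bigram
  seconds <- vapply(parts, `[`, character(1), 2)       # candidate next word
  keep    <- firsts == word
  seconds[keep][order(counts[keep], decreasing = TRUE)][seq_len(min(top_n, sum(keep)))]
}
# e.g. predict_next(US_News_bigram.dfm, "new") would be expected to rank "york" highly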
## en_US.news.txt N-gram frequencies
En_US_NEWS_text1<-sample(En_US_NEWS_text, round(0.01*length(En_US_NEWS_text)), replace = F)
US_News_tokens<- tokens(En_US_NEWS_text1,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
US_News_tokens <- tokens_tolower(US_News_tokens)
US_News_tokens <- tokens_select(US_News_tokens, stopwords(),selection ="remove")
US_News_unigram <- tokens_ngrams(US_News_tokens, n=1) ## unigram
US_News_unigram.dfm <- dfm(US_News_unigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
US_News_bigram <- tokens_ngrams(US_News_tokens, n=2) ## bigram
US_News_bigram.dfm <- dfm(US_News_bigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
US_News_trigram <- tokens_ngrams(US_News_tokens, n=3) ## trigram
US_News_trigram.dfm <- dfm(US_News_trigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(US_News_unigram.dfm, 20) # 20 top US News Unigram words
## said â will one new s can also people year time
## 215 116 88 67 55 52 50 48 46 45 44
## two just like years last state get back much
## 43 42 38 38 38 37 34 32 30
topfeatures(US_News_bigram.dfm, 20) # 20 top US News Bigram words
## itâ_s new_york last_year last_week years_ago
## 11 10 9 9 8
## didnâ_t united_states â_said also_said three_years
## 8 6 6 6 6
## police_said can_make san_francisco san_diego next_year
## 6 5 5 5 5
## high_school supreme_court two_years years_later white_house
## 5 4 4 4 4
topfeatures(US_News_trigram.dfm, 20) # 20 top US News Trigram words
## third_fourth_lines capital_improvement_program
## 3 2
## want_take_votes take_votes_controversial
## 2 2
## votes_controversial_issues three_years_ago
## 2 2
## port_authority_police points_per_game
## 2 2
## â_œiâ_ve four_children_ages
## 2 2
## circuit_court_appeals make_feel_like
## 2 2
## feel_like_youâ like_youâ_re
## 2 2
## youâ_re_floating re_floating_top
## 2 2
## metro_east_lutheran pulse_locker_room
## 2 2
## said_â_œwe school_activities_program
## 2 2
## en_US.blogs.txt N-gram frequencies
En_US_blogs_text1<-sample(En_US_blogs_text, round(0.02*length(En_US_blogs_text)), replace = F)
US_blogs_tokens<- tokens(En_US_blogs_text1,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
US_blogs_tokens <- tokens_tolower(US_blogs_tokens)
US_blogs_tokens <- tokens_select(US_blogs_tokens, stopwords(),selection ="remove")
US_blogs_unigram <- tokens_ngrams(US_blogs_tokens, n=1) ## unigram
US_blogs_unigram.dfm <- dfm(US_blogs_unigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
US_blogs_bigram <- tokens_ngrams(US_blogs_tokens, n=2) ## bigram
US_blogs_bigram.dfm <- dfm(US_blogs_bigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
US_blogs_trigram <- tokens_ngrams(US_blogs_tokens, n=3) ## trigram
US_blogs_trigram.dfm <- dfm(US_blogs_trigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(US_blogs_unigram.dfm, 20) # 20 top US blogs Unigram words
## â s one will like just can t time get people
## 4632 3440 2406 2205 1994 1989 1930 1897 1750 1395 1242
## now know also iâ back even first day new
## 1192 1183 1144 1091 1052 1025 1013 1005 1004
topfeatures(US_blogs_bigram.dfm, 20) # 20 top US blogs Bigram words
## itâ_s iâ_m donâ_t didnâ_t iâ_ve thatâ_s canâ_t doesnâ_t
## 789 568 547 278 276 252 181 155
## youâ_re thereâ_s â_œthe iâ_ll wasnâ_t iâ_d â_œi t_know
## 143 134 134 129 121 116 112 107
## â_â right_now couldnâ_t years_ago
## 103 101 98 98
topfeatures(US_blogs_trigram.dfm, 20) # 20 top US blogs Trigram words
## donâ_t_know iâ_m_sure donâ_t_want donâ_t_think itâ_s_just
## 77 55 39 38 29
## â_â_â didnâ_t_know itâ_s_like iâ_m_going didnâ_t_want
## 27 26 24 23 22
## â_œiâ_m â_œitâ_s new_york_city donâ_t_like know_itâ_s
## 22 21 19 19 18
## itâ_s_time iâ_ve_never now_iâ_m think_itâ_s youâ_re_going
## 17 17 17 17 17
## en_US.twitter.txt N-gram frequencies
En_Twit_text1<-sample(En_Twit_text, round(0.02*length(En_Twit_text)), replace = F)
twitter_tokens<- tokens(En_Twit_text1,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
twitter_tokens <- tokens_tolower(twitter_tokens)
twitter_tokens <- tokens_select(twitter_tokens, stopwords(),selection ="remove")
twitter_unigram <- tokens_ngrams(twitter_tokens, n=1) ## unigram
twitter_unigram.dfm <- dfm(twitter_unigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
twitter_bigram <- tokens_ngrams(twitter_tokens, n=2) ## bigram
twitter_bigram.dfm <- dfm(twitter_bigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
twitter_trigram <- tokens_ngrams(twitter_tokens, n=3) ## trigram
twitter_trigram.dfm <- dfm(twitter_trigram, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(twitter_unigram.dfm, 20) # 20 top Unigram words
## just like get love good will can day rt thanks now
## 3119 2446 2252 2143 2055 1947 1820 1779 1751 1733 1648
## one know time u great today go new lol
## 1627 1582 1579 1567 1529 1478 1454 1433 1353
topfeatures(twitter_bigram.dfm, 20) # 20 top Bigram words
## â_œ right_now last_night looking_forward good_morning
## 388 330 237 184 165
## feel_like happy_birthday just_got good_luck follow_back
## 161 161 158 128 126
## looks_like thanks_follow let_know can_get next_week
## 125 119 107 106 94
## make_sure great_day please_follow social_media just_saw
## 92 87 82 79 78
topfeatures(twitter_trigram.dfm, 20) # 20 top Trigram words
## happy_mother's_day let_us_know happy_mothers_day
## 41 36 29
## cinco_de_mayo happy_new_year rt_â_œ
## 26 25 24
## â_â_â ðÿ_ðÿ_ðÿ keep_good_work
## 22 19 17
## new_york_city just_got_back looking_forward_seeing
## 15 14 13
## never_gets_old come_see_us will_follow_back
## 13 12 12
## just_got_home thanks_following_us yes_yes_yes
## 12 11 11
## just_got_done thanks_everyone_came
## 10 10
Through this analysis I learned to use several libraries that were new to me, and I revisited skills from earlier courses in the specialization.
My next step is to implement a Shiny app for word prediction.
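One possible skeleton for that app is sketched below (hypothetical; it assumes a prediction helper like the predict_next sketch above and a precomputed bigram object):
library(shiny)
ui <- fluidPage(
  titlePanel("Next-word prediction (prototype)"),
  textInput("phrase", "Type a phrase:"),
  textOutput("suggestion")
)
server <- function(input, output) {
  output$suggestion <- renderText({
    words <- strsplit(tolower(input$phrase), "\\s+")[[1]]
    if (length(words) == 0) return("")
    paste(predict_next(US_blogs_bigram.dfm, tail(words, 1)), collapse = ", ")
  })
}
# shinyApp(ui, server)   # left commented so the report itself still knits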
Please share any suggestions for improving this exploratory analysis.