The goal here is to build your first simple model of the relationships between words. This is the first step in building a predictive text mining application: you will start by exploring simple models and then move on to more sophisticated modeling techniques.
Tasks to accomplish
Build basic n-gram model: using the exploratory analysis you performed, build a basic n-gram model for predicting the next word based on the previous 1, 2, or 3 words.
Build a model to handle unseen n-grams: in some cases people will want to type a combination of words that does not appear in the corpora. Build a model to handle cases where a particular n-gram isn't observed.
Questions to consider
How can you efficiently store an n-gram model (think Markov chains)?
How can you use the knowledge about word frequencies to make your model smaller and more efficient?
How many parameters do you need (i.e., how big is n in your n-gram model)?
Can you think of simple ways to "smooth" the probabilities (think about giving all n-grams a non-zero probability even if they aren't observed in the data)?
How do you evaluate whether your model is any good?
How can you use backoff models to estimate the probability of unobserved n-grams?
A small toy sketch of the storage and smoothing ideas follows below.
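Before the exploratory code, here is a minimal toy sketch of two of those ideas: storing a bigram model as a Markov-chain-style transition table, and add-one (Laplace) smoothing so that unseen word pairs still receive a non-zero probability. The two sentences and the bigram_prob() helper are illustrative assumptions only; they are not drawn from the corpora analysed below.
# Toy corpus; sentence boundaries are ignored for brevity
toy <- c("the cat sat on the mat", "the dog sat on the rug")
toy_words <- unlist(strsplit(toy, "\\s+"))
vocab <- unique(toy_words)
# Bigram counts stored as a transition table: rows = current word, columns = next word
bigrams <- table(head(toy_words, -1), tail(toy_words, -1))
# Add-one smoothed estimate of P(next word | previous word)
bigram_prob <- function(prev, nxt) {
  count <- if (prev %in% rownames(bigrams) && nxt %in% colnames(bigrams)) bigrams[prev, nxt] else 0
  prev_total <- if (prev %in% rownames(bigrams)) sum(bigrams[prev, ]) else 0
  (count + 1) / (prev_total + length(vocab))
}
bigram_prob("the", "cat")   # observed bigram
bigram_prob("cat", "dog")   # unseen bigram, still non-zero thanks to smoothing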
# Libraries required for the project
library(tm)
## Loading required package: NLP
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
#open connection to news data
connect <- file("en_US.news.txt", open="r")
News_Data <- readLines(connect)
## Warning in readLines(connect): incomplete final line found on
## 'en_US.news.txt'
close(connect)
#open connection to twitter data
connect <- file("en_US.twitter.txt", open="r")
Twitter_Data <- readLines(connect)
## Warning in readLines(connect): line 167155 appears to contain an embedded
## nul
## Warning in readLines(connect): line 268547 appears to contain an embedded
## nul
## Warning in readLines(connect): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(connect): line 1759032 appears to contain an embedded
## nul
close(connect)
#open connection to blog data
connect <- file("en_US.blogs.txt", open="r")
Blog_Data <- readLines(connect)
close(connect)
# Basic statistics for a text file: size in MB, number of lines,
# line number of the longest line, and total word count
get_stat <- function(text_file, lines) {
  f_size <- file.info(text_file)$size / 1024^2
  nchars <- nchar(lines)
  longest_line <- which.max(nchars)   # index of the longest line, not its length
  word_count <- sum(sapply(strsplit(lines, "\\s+"), length))
  return(c(text_file, format(round(f_size, 2), nsmall = 2),
           length(lines), longest_line, word_count))
}
News_Stat<- get_stat("en_US.news.txt", News_Data)
Twitter_Stat<- get_stat("en_US.twitter.txt", Twitter_Data)
Blog_Stat <- get_stat("en_US.blogs.txt", Blog_Data)
Summary_All <- c(News_Stat, Blog_Stat, Twitter_Stat)
df <- data.frame(matrix(unlist(Summary_All), nrow=3, byrow=T))
colnames(df) <- c("Text_file", "Size(MB)", "Line_Count", "Longest_Line_Index", "Words_Count")
print(df)
##           Text_file Size(MB) Line_Count Longest_Line_Index Words_Count
## 1    en_US.news.txt   196.28      77259              14556     2643972
## 2   en_US.blogs.txt   200.42     899288             483415    37334441
## 3 en_US.twitter.txt   159.36    2360148            1484357    30373792
# Collapse the sampled lines into a single document and wrap it in a tm corpus
Generate_Corpus <- function(test_file) {
  corpus_gen <- paste(test_file, collapse = " ")
  corpus_gen <- VectorSource(corpus_gen)
  corpus_gen <- Corpus(corpus_gen)
  return(corpus_gen)
}
# Standard cleaning: drop numbers, lower-case, remove English stop words,
# strip punctuation and extra whitespace
Clean_Corpus <- function(corpus_cln) {
  corpus_cln <- tm_map(corpus_cln, removeNumbers)
  corpus_cln <- tm_map(corpus_cln, content_transformer(tolower))
  corpus_cln <- tm_map(corpus_cln, removeWords, stopwords("english"))
  corpus_cln <- tm_map(corpus_cln, removePunctuation)
  corpus_cln <- tm_map(corpus_cln, stripWhitespace)
  return(corpus_cln)
}
# Build a document-term matrix and return word frequencies, most frequent first
Common_Words <- function(corpus_cln) {
  term_sparse <- DocumentTermMatrix(corpus_cln)
  term_matrix <- as.matrix(term_sparse)
  words_common <- colSums(term_matrix)
  words_common <- as.data.frame(sort(words_common, decreasing=TRUE))
  words_common$word <- rownames(words_common)
  colnames(words_common) <- c("Frequency", "word")
  return(words_common)
}
# Work with a 10% random sample of each source to keep memory use manageable
News_Data_Text <- sample(News_Data, round(0.1*length(News_Data)), replace = F)
News_Data_Corpus <- Generate_Corpus(News_Data_Text)
News_Data_Corpus <- Clean_Corpus(News_Data_Corpus)
## Warning in tm_map.SimpleCorpus(corpus_cln, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, removeWords,
## stopwords("english")): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, stripWhitespace): transformation
## drops documents
Most_Popular_Word_News <- Common_Words(News_Data_Corpus)
Most_Popular_Word_SubN<- Most_Popular_Word_News[1:15,]
p<-ggplot(data=Most_Popular_Word_SubN, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Words") +labs(title = "Most Popular News Words") +theme(legend.title=element_blank()) + coord_flip()
Twitter_Data_Text<-sample(Twitter_Data, round(0.1*length(Twitter_Data)), replace = F)
Twitter_Data_Corpus <- Generate_Corpus(Twitter_Data_Text)
Twitter_Data_Corpus <- Clean_Corpus(Twitter_Data_Corpus)
## Warning in tm_map.SimpleCorpus(corpus_cln, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, removeWords,
## stopwords("english")): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, stripWhitespace): transformation
## drops documents
Most_Popular_Word_Twit <- Common_Words(Twitter_Data_Corpus)
Most_Popular_Word_SubT<- Most_Popular_Word_Twit[1:15,]
p<-ggplot(data=Most_Popular_Word_SubT, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Words") +labs(title = "Most Popular News Words") +theme(legend.title=element_blank()) + coord_flip()
Blog_Data_Text <- sample(Blog_Data, round(0.1*length(Blog_Data)), replace = F)
Blog_Data_Corpus <- Generate_Corpus(Blog_Data_Text)
Blog_Data_Corpus <- Clean_Corpus(Blog_Data_Corpus)
## Warning in tm_map.SimpleCorpus(corpus_cln, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, removeWords,
## stopwords("english")): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus_cln, stripWhitespace): transformation
## drops documents
Most_Popular_Word_Blog <- Common_Words(Blog_Data_Corpus)
Most_Popular_Word_SubB<- Most_Popular_Word_Blog[1:15,]
p<-ggplot(data=Most_Popular_Word_SubB, aes(x=reorder(word,Frequency), y=Frequency,
fill=factor(reorder(word,-Frequency))))+ geom_bar(stat="identity")
p + xlab("Words") +labs(title = "Most Popular News Words") +theme(legend.title=element_blank()) + coord_flip()
wordcloud(Most_Popular_Word_News$word[1:100], Most_Popular_Word_News$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
wordcloud(Most_Popular_Word_Twit$word[1:100], Most_Popular_Word_Twit$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
wordcloud(Most_Popular_Word_Blog$word[1:100], Most_Popular_Word_Blog$Frequency[1:100],
colors=brewer.pal(8, "Dark2"))
library(quanteda)
## Package version: 1.4.3
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
News_Text_sub<-sample(News_Data, round(0.01*length(News_Data)), replace = F)
Token_News<- tokens(News_Text_sub,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
Token_News <- tokens_tolower(Token_News)
Token_News <- tokens_select(Token_News, stopwords(),selection ="remove")
Unigram_News <- tokens_ngrams(Token_News, n=1)
Unigram_News.dfm <- dfm(Unigram_News, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
Bigram_News <- tokens_ngrams(Token_News, n=2)
Bigram_News.dfm <- dfm(Bigram_News, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
Trigram_News <- tokens_ngrams(Token_News, n=3)
Trigram_News.dfm <- dfm(Trigram_News, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(Unigram_News.dfm, 20)
## said â one just can also new s people time
## 186 104 80 63 55 53 52 50 49 46
## two last first year state get game now years way
## 44 42 42 40 36 32 32 32 31 31
topfeatures(Bigram_News.dfm, 20)
## new_york last_week st_louis itâ_s didnâ_t
## 10 7 7 6 6
## even_though medical_center â_said thereâ_s â_œthe
## 6 6 5 5 5
## los_angeles united_states high_school last_season last_year
## 5 5 5 4 4
## can_get police_said two_years new_jersey â_œi
## 4 4 4 3 3
topfeatures(Trigram_News.dfm, 20)
## new_england_patriots womenâ_s_basketball
## 2 2
## delta_air_lines said_â_œi
## 2 2
## one_food_truck department_economic_development
## 2 2
## st_square_feet third_fourth_lines
## 2 2
## smoke_cedar_bark one_bin_ladenâ
## 2 2
## bin_ladenâ_s georgia_street_community
## 2 2
## donâ_t_want based_tac_air
## 2 2
## girls_athletics_programs generate_million_million
## 2 2
## congressional_term_limits â_œobviously_itâ
## 2 1
## œobviously_itâ_s itâ_s_lot
## 1 1
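One of the questions above asks how word frequencies can be used to make the model smaller. A quick, hedged check on this news sample: how many of the most frequent unigrams are needed to cover 50% and 90% of all word instances? The coverage() helper below is an illustrative assumption, not part of the assignment code.
# Unigram counts from the news sample, most frequent first
uni_counts <- sort(colSums(Unigram_News.dfm), decreasing = TRUE)
# Number of top-ranked words needed to reach a target share of all word instances
coverage <- function(counts, target) {
  which(cumsum(counts) / sum(counts) >= target)[1]
}
coverage(uni_counts, 0.5)   # words needed to cover 50% of word instances
coverage(uni_counts, 0.9)   # words needed to cover 90% of word instances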
library(quanteda)
Twitter_Text_sub<-sample(Twitter_Data, round(0.01*length(Twitter_Data)), replace = F)
Token_Twitter<- tokens(Twitter_Text_sub,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
Token_Twitter <- tokens_tolower(Token_Twitter)
Token_Twitter <- tokens_select(Token_Twitter, stopwords(),selection ="remove")
Unigram_Twitter <- tokens_ngrams(Token_Twitter, n=1)
Unigram_Twitter.dfm <- dfm(Unigram_Twitter, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
Bigram_Twitter <- tokens_ngrams(Token_Twitter, n=2)
Bigram_Twitter.dfm <- dfm(Bigram_Twitter, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
Trigram_Twitter <- tokens_ngrams(Token_Twitter, n=3)
Trigram_Twitter.dfm <- dfm(Trigram_Twitter, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(Unigram_Twitter.dfm, 20)
## just like get love good day rt can thanks one
## 1553 1223 1146 1046 1001 953 924 921 885 878
## know now great u go time today lol new see
## 834 821 813 812 786 742 741 670 661 644
topfeatures(Bigram_Twitter.dfm, 20)
## â_œ right_now last_night looking_forward
## 215 135 114 101
## happy_birthday good_morning thanks_follow just_got
## 89 83 78 77
## looks_like follow_back let_know can_get
## 70 62 61 60
## good_luck great_day please_follow sounds_like
## 54 52 51 50
## ðÿ_ðÿ feel_like next_week one_day
## 47 46 45 41
topfeatures(Trigram_Twitter.dfm, 20)
## happy_mothers_day î_î_î let_us_know
## 27 24 23
## ðÿ_ðÿ_ðÿ happy_mother's_day looking_forward_seeing
## 15 14 13
## follow_follow_follow â_â_â happy_new_year
## 12 11 11
## gt_gt_gt just_got_done rt_â_œ
## 11 10 10
## dreamed_dreamed_dreamed louis_louis_louis cake_cake_cake
## 9 8 8
## love_love_love cinco_de_mayo get_work_done
## 7 7 7
## thanks_following_us ralph_waldo_emerson
## 7 7
library(quanteda)
Blog_Text_sub<-sample(Blog_Data, round(0.01*length(Blog_Data)), replace = F)
Token_Blog<- tokens(Blog_Text_sub,what ="word", remove_numbers = TRUE,
remove_punct = TRUE, remove_separators = TRUE, remove_symbols =TRUE )
Token_Blog <- tokens_tolower(Token_Blog)
Token_Blog <- tokens_select(Token_Blog, stopwords(),selection ="remove")
Unigram_Blog <- tokens_ngrams(Token_Blog, n=1)
Unigram_Blog.dfm <- dfm(Unigram_Blog, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
Bigram_Blog <- tokens_ngrams(Token_Blog, n=2)
Bigram_Blog.dfm <- dfm(Bigram_Blog, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
Trigram_Blog <- tokens_ngrams(Token_Blog, n=3)
Trigram_Blog.dfm <- dfm(Trigram_Blog, tolower =TRUE, remove = stopwords("english"),
remove_punct = TRUE)
topfeatures(Unigram_Blog.dfm, 20)
## â s one can like just t time get people
## 2243 1656 1231 996 994 989 913 886 737 610
## new know now also back day iâ make good see
## 577 571 539 534 530 518 517 504 504 500
topfeatures(Bigram_Blog.dfm, 20)
## itâ_s donâ_t iâ_m iâ_ve didnâ_t thatâ_s canâ_t
## 392 266 264 138 123 115 90
## youâ_re doesnâ_t thereâ_s isnâ_t â_œthe years_ago iâ_ll
## 72 70 65 65 65 60 57
## new_york iâ_d feel_like wasnâ_t heâ_s right_now
## 56 55 52 51 49 49
topfeatures(Trigram_Blog.dfm, 20)
## donâ_t_know iâ_m_sure iâ_m_going
## 30 29 18
## italian_people_italy donâ_t_think donâ_t_need
## 17 14 13
## donâ_t_want itâ_s_just itâ_s_like
## 13 13 12
## didnâ_t_know donâ_t_really donâ_t_get
## 11 11 11
## ð_ð_ð iâ_m_just â_œiâ_m
## 10 10 10
## â_œitâ_s didnâ_t_want iâ_d_like
## 10 10 10
## iâ_ve_never donâ_t_go
## 10 8
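As a bridge to the prediction model, the sketch below shows one possible (untuned) backoff lookup built from the Twitter n-gram counts above: check the last two typed words against the trigram counts, fall back to the last word in the bigram counts, and finally to the most frequent unigram. The ngram_table() and predict_next() helpers are illustrative assumptions, as is the reliance on quanteda's default "_" concatenator between n-gram tokens.
# Turn an n-gram dfm into a prefix -> next-word count table
# (assumes "_" joins the tokens of each n-gram feature)
ngram_table <- function(dfm_obj) {
  counts <- colSums(dfm_obj)
  feats <- names(counts)
  data.frame(prefix = sub("_[^_]+$", "", feats),   # everything before the last "_"
             next_word = sub("^.*_", "", feats),   # token after the last "_"
             count = as.numeric(counts),
             stringsAsFactors = FALSE)
}
tri_tab <- ngram_table(Trigram_Twitter.dfm)
bi_tab  <- ngram_table(Bigram_Twitter.dfm)
uni_top <- names(topfeatures(Unigram_Twitter.dfm, 1))
# Simple backoff: trigram match first, then bigram, then the most frequent word
predict_next <- function(words) {
  words <- tolower(words)
  n <- length(words)
  if (n >= 2) {
    hits <- tri_tab[tri_tab$prefix == paste(words[(n - 1):n], collapse = "_"), ]
    if (nrow(hits) > 0) return(hits$next_word[which.max(hits$count)])
  }
  hits <- bi_tab[bi_tab$prefix == words[n], ]
  if (nrow(hits) > 0) return(hits$next_word[which.max(hits$count)])
  uni_top   # final fallback
}
predict_next(c("happy", "mothers"))   # trigram lookup if the prefix was observed
predict_next(c("right"))              # backs off to the bigram table
A natural next step, addressing the evaluation question above, would be to hold out part of each corpus and measure how often the predicted word matches the actual next word.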