Coursera Data Science Capstone Project

Introduction

In order to answer the questions of Quiz 2 to Week 3 will be used: >the grepl() command READING ALL DOCUMENTS from the three files from SwiftKey, >extract the documents that contains the group of words searched, >extract the words around the nuclear group of word searched >process this final extract to eliminate extra blanks, punctuation, words like preposition, articles >extract the word that results as complimente of the initial group of words. In this group of words we should find one of the options offered for each question in the quiz.

library(stringr)
library(stringi)
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
setwd("C:/Coursera/10_Data_Science_Capstone")

#Select ALL DOCUMENTS in Blogs
allBlogs <- readLines("en_US.blogs.txt")
#Select ALL DOCUMENTS in News
allNews <- readLines("en_US.news.txt")
#Select ALL DOCUMENTS in Twitters
allTwitter <- readLines("en_US.twitter.txt")

summary(allBlogs)
##    Length     Class      Mode 
##    899289 character character
summary(allNews)
##    Length     Class      Mode 
##   1010243 character character
summary(allTwitter)
##    Length     Class      Mode 
##   2360149 character character
tokenmaker <- function(x) {
        corpus <- Corpus(VectorSource(x))
        corpus <- tm_map(corpus, content_transformer(tolower))
        corpus <- tm_map(corpus, removePunctuation)
        corpus <- tm_map(corpus, stripWhitespace)
        corpus <- tm_map(corpus, removeWords, stopwords("english"))
        corpus <- tm_map(corpus, removeNumbers)
        corpus <- tm_map(corpus, PlainTextDocument)
#        corpus <- tm_map(corpus, stemDocument)
        corpus <- Corpus(VectorSource(corpus))
}  

wordcounter <- function(x) {
        dtm<-DocumentTermMatrix(x)
        dtm_matrix <- as.matrix(dtm)
        word_freq <- colSums(dtm_matrix)
        word_freq <- sort(word_freq, decreasing = TRUE)
        words <- names(word_freq)
        return(list(words, word_freq))
}  

NextWordIs <- function(x,y){
        BQuest<-grepl(x, allBlogs, ignore.case=TRUE)
        BDocs<-allBlogs[BQuest]
        textoachado<-'a'
        NextWordIs<-'a'
        i<-length(BDocs)
        if (i>0)
                {
                for (i in 1:i)
                {  textoachado[i]<- str_extract(BDocs[i], y)
                NextWordIs[i]<- stri_extract_last_words(textoachado[i]) 
                }
                }
        NQuest<-grepl(x, allNews, ignore.case=TRUE)
        NDocs<-allNews[NQuest]
        j=length(NDocs)
        if (j>0)
                {
                for (j in 1:j)
                {  textoachado[i+j]<- str_extract(NDocs[j], y)
                NextWordIs[i+j]<- stri_extract_last_words(textoachado[i+j]) 
                }
                }
        TQuest<-grepl(x, allTwitter, ignore.case=TRUE)
        TDocs<-allTwitter[TQuest]
        k=length(TDocs)
        if (k>0)
                {
                for (k in 1:k)
                {  textoachado[i+j+k]<- str_extract(TDocs[k], y)
                NextWordIs[i+j+k]<- stri_extract_last_words(textoachado[i+j+k]) 
                }
                }
        bundle<-as.data.frame(NextWordIs, stringsAsFactors=FALSE)
        summary (bundle)
        blogs_token <- tokenmaker(bundle)
        blogs_words <- wordcounter(blogs_token)
        summary(nchar(bundle))
        head(bundle)
        tdm_Blogs<-TermDocumentMatrix(blogs_token)
        m_Blogs<-as.matrix(tdm_Blogs)
        v_Blogs<-sort(rowSums(m_Blogs),decreasing=TRUE)
        d_Blogs<-data.frame(word=names(v_Blogs),freq=v_Blogs)
        head(v_Blogs, 100)    
        return(list(head(v_Blogs,100)))
}

Question 1 is absolute simple. At the end we will see that one word will emerge as the most probable sequence for the phrase.

—–Questão 1 —————

resultado_01<-NextWordIs("a case of ", "([Aa]+ +[Cc]ase+ +[Oo]f+ +[^ ]+ )" )
resultado_01
## [[1]]
##           beer       mistaken      character           list           wine 
##             21             12              6              5              5 
##          first         miller            one          water       criminal 
##              4              4              4              4              3 
##         making         severe        waiting        whether            bad 
##              3              3              3              3              2 
##          coors        crossed        defense        finding           food 
##              2              2              2              2              2 
##           four          great            jet        knowing       language 
##              2              2              2              2              2 
##         little           mind          pabst           poor            red 
##              2              2              2              2              2 
##         spring       thallium          trial         trying            100 
##              2              2              2              2              1 
##            118 96917510032654           acid           acne        amnesia 
##              1              1              1              1              1 
##         anchor        anxiety          armed       arrested            art 
##              1              1              1              1              1 
##     assembling         author         baboon        bananas          banks 
##              1              1              1              1              1 
##      belvedere         better          bible         boldly        bolting 
##              1              1              1              1              1 
##          books       boosting        bottles          bowed          boxer 
##              1              1              1              1              1 
##       breaking            bud      budweiser           bulk    butterflies 
##              1              1              1              1              1 
##      butternut          buyer     bwvaktboom      cargument         carpal 
##              1              1              1              1              1 
##        carrots       catching         caveat            cds     cellulitis 
##              1              1              1              1              1 
##       changing        chicken     chickenpox          chore       citation 
##              1              1              1              1              1 
##          class           cold          colic     collateral      collegial 
##              1              1              1              1              1 
##    conspicuous        content      continued           cool         corona 
##              1              1              1              1              1 
##        council        damaged  datetimestamp         degree           deja 
##              1              1              1              1              1 
##          delhi     democratic      dependent    description        dialing 
##              1              1              1              1              1

Question 1 answer: the word beer came as the most present word and make sense in the phrase.

—–Questão 2 —————

resultado_02<-NextWordIs("would mean the ", "([Ww]ould+ +[Mm]ean+ +[Tt]he+ +[^ ]+ )" )  
resultado_02
## [[1]]
##          world      character           list            end           loss 
##            169              6              5              4              3 
##       absolute         entire       language            100            118 
##              2              2              2              1              1 
##  3391780853271       accident        airport         angles         author 
##              1              1              1              1              1 
##        average          board           bull         called        central 
##              1              1              1              1              1 
##        century        content    cooperation         cworld  datetimestamp 
##              1              1              1              1              1 
##          death     demolition    description     difference   displacement 
##              1              1              1              1              1 
##           drug        heading           hour          isdst           mday 
##              1              1              1              1              1 
##           meta            min            mon            new         origin 
##              1              1              1              1              1 
##          owner         person        poorest         public        raiders 
##              1              1              1              1              1 
##            red        release         school            sec         sitter 
##              1              1              1              1              1 
##        society          state            sun           take transformation 
##              1              1              1              1              1 
##            use           wday         worldd           yday           year 
##              1              1              1              1              1

Question 2 answer: here too, one word came as the most present after “would mean the” and that is world, that make sense in the phrase.

—–Questão 3 —————

resultado_03<-NextWordIs("make me the ", "([Mm]ake+ +[Mm]e+ +[Tt]he+ +[^ ]+ )" )  
resultado_03
## [[1]]
##      happiest     character          list          best      language 
##            29             6             5             2             2 
##        worlds           100           118  280200958252       asshole 
##             2             1             1             1             1 
##        author           bad       biggest           bun        cfirst 
##             1             1             1             1             1 
##       content       coolest datetimestamp      daughter   description 
##             1             1             1             1             1 
##          face          girl       heading          hour         isdst 
##             1             1             1             1             1 
##        jackie         kiddy       manager          mday          meta 
##             1             1             1             1             1 
##           min           mon        morbid        mother        number 
##             1             1             1             1             1 
##        origin        parent  professional          role         scape 
##             1             1             1             1             1 
##     scapegoat           sec        target    undisputed      universe 
##             1             1             1             1             1 
##           way          wday        winner          yday          year 
##             1             1             1             1             1

Question 3 answer: here too, one word came as the most present after “make me the” and that is happiest, that make sense in the phrase.

—–Questão 4 —————

resultado_04<-NextWordIs("struggling ", "([Ss]truggling+ +[^ ]+ +[^ ]+ +[^ ]+ )" )  
resultado_04
## [[1]]
##       ends        bit        new       said       work       back 
##         30         16         10         10         10          9 
##     issues      years     afloat       debt       fact      first 
##          9          9          8          7          7          7 
##     health       jobs       keep       life      money       pace 
##          7          7          7          7          7          7 
##       time       will  character     enough       make      sense 
##          7          7          6          6          6          6 
##       list       many       part       runs     school      since 
##          5          5          5          5          5          5 
##      still     things     weight       cars      child      debts 
##          5          5          5          4          4          4 
##  difficult    disease   economic    economy       even      games 
##          4          4          4          4          4          4 
##       high   identity       last   mortgage       need        now 
##          4          4          4          4          4          4 
##    touring      abuse  addiction       baby        buy        can 
##          4          3          3          3          3          3 
##     common    company   countrys    current      dance        day 
##          3          3          3          3          3          3 
##     eating    effects   families      field    finally   finances 
##          3          3          3          3          3          3 
##       food      found       free    funding       game        get 
##          3          3          3          3          3          3 
##      great      horse       just       loss       mall       much 
##          3          3          3          3          3          3 
##        one particular       past        pay   payments   pressure 
##          3          3          3          3          3          3 
##   problems   question      rates      sales     second       self 
##          3          3          3          3          3          3 
## shortfalls       shot     social  something   striking   students 
##          3          3          3          3          3          3 
##      stuff      think      third      times 
##          3          3          3          3
resultado_04a<-NextWordIs("struggling ", "([Ss]truggling+ +[^ ]+ +[^ ]+ )" )  
resultado_04a
## [[1]]
##        find         get        make        keep         pay     survive 
##         104          87          70          55          32          28 
##        stay         bit      little     recover  understand    maintain 
##          24          18          18          15          15          14 
##         put      figure         now     compete         fit        like 
##          13          11          11          10          10          10 
##      market        come  depression        hold       raise      school 
##          10           9           9           9           9           9 
##        work     breathe        cope        high   character        much 
##           9           8           8           8           7           7 
##     provide       score       years      afford      breath     control 
##           7           7           7           6           6           6 
##      finish        just        meet       reach     schools         see 
##           6           6           6           6           6           6 
##   something      weight        year     balance       bring       carry 
##           6           6           6           5           5           5 
##       close        deal   establish        grow        life        list 
##           5           5           5           5           5           5 
##      mental        walk      accept   addiction      almost     another 
##           5           5           4           4           4           4 
##     attract        back       catch   companies     company     contain 
##           4           4           4           4           4           4 
##      create        days      decide        form        gain        hard 
##           4           4           4           4           4           4 
## infertility        live         lot        move    musician         new 
##           4           4           4           4           4           4 
##    overcome        pull      racing     rebuild      remain    remember 
##           4           4           4           4           4           4 
##      season     several       start      system      adjust      agency 
##           4           4           4           4           3           3 
##         air        best       break      budget      cancer      coming 
##           3           3           3           3           3           3 
##   countries         day    district     economy 
##           3           3           3           3
resultado_04b<-NextWordIs("struggling ", "([Ss]truggling+ +[^ ]+ )" )  
resultado_04b
## [[1]]
##       economy   financially      students      mightily       schools 
##            42            18            16            14            14 
##         right      families          just          city    homeowners 
##            12            11            11            10             8 
##       readers    struggling     americans       housing      internet 
##             8             8             7             7             7 
##        school         since        artist     character          even 
##             7             7             6             6             6 
##          like         teams       artists        cities     companies 
##             6             6             5             5             5 
##       company          list        middle   offensively       program 
##             5             5             5             5             5 
##        trying         urban        writer         actor         daily 
##             5             5             5             4             4 
##         horse neighborhoods           nyc       parents          team 
##             4             4             4             4             4 
##         towns  academically      business      campaign        casino 
##             4             3             3             3             3 
##      children     cleveland   communities     countries      district 
##             3             3             3             3             3 
##     districts         early  economically      european     franchise 
##             3             3             3             3             3 
##      industry      language        lately      learners       nations 
##             3             3             3             3             3 
##           now       offense     residents      teachers         today 
##             3             3             3             3             3 
##         world       yankees          year         young       actress 
##             3             3             3             3             2 
##         along        angels          bell         black       british 
##             2             2             2             2             2 
##       casinos         chain         class        closer   consistency 
##             2             2             2             2             2 
##         davis    department   desperately          dont      downtown 
##             2             2             2             2             2 
##          east     economies   electronics         every      finances 
##             2             2             2             2             2 
##          firm     francisco         going   independent          keep 
##             2             2             2             2             2 
## manufacturing   meadowlands        mother      operator        owings 
##             2             2             2             2             2

Question 4 answer: first big problem.. There is no sequence “struggling but the” in whole corpus. Work with strugglin and any other 3, 2 and 1 words in sequence, do not create any corpus with the words from Question 4. Includding three more files of News from Reuters and news from sports did not get better. So the word defense was selected only for the sense it makes in the phrase.

—–Questão 5 —————

resultado_05<-NextWordIs("date at the ", "([Dd]ate+ +[Aa]t+ +[Tt]he+ +[^ ]+ )" )  
resultado_05
## [[1]]
##     character           end          list          time      language 
##             6             5             5             5             2 
##    university           100           118 2796950340271           app 
##             2             1             1             1             1 
##           art        author       beverly        bottom        braves 
##             1             1             1             1             1 
##          cake        cheese          chip       content       cricket 
##             1             1             1             1             1 
##        cwrong datetimestamp   description      driskill       firesky 
##             1             1             1             1             1 
##          four       grocery       heading          hour         isdst 
##             1             1             1             1             1 
##          lake         magic          mday       medical          meta 
##             1             1             1             1             1 
##           min           mon         movie      national         naval 
##             1             1             1             1             1 
##          next        origin       palafox         power         prado 
##             1             1             1             1             1 
##   prestigious    republican           rex           sec          wday 
##             1             1             1             1             1 
##       whiskey          yday          year 
##             1             1             1

Question 5 answer: second problem… There is only one occurence of “date at the” in whole documents at corpus where the following word is at the list of question 5. It is the word grocery that occurs only once. But that was not the right answer. The right was beach, a logical choice.

—–Questão 6 —————

resultado_06<-NextWordIs("be on my ", "([Bb]e+ +[Oo]n+ +[Mm]y+ +[^ ]+ )" )  
resultado_06
## [[1]]
##              way             mind             list             show 
##               32               14               11                8 
##        character             best             game             ipod 
##                6                5                4                4 
##            radio             side             back       background 
##                4                4                3                3 
##            couch             team              top          youtube 
##                3                3                3                3 
##             dick         facebook             feet            first 
##                2                2                2                2 
##          friends         language             last             next 
##                2                2                2                2 
##            phone            porch           screen            terms 
##                2                2                2                2 
##              100              118    7492880821228            album 
##                1                1                1                1 
##           annual           author              bad             bday 
##                1                1                1                1 
##            belly             bike         birthday             blog 
##                1                1                1                1 
##            board            break              bus           career 
##                1                1                1                1 
##          channel         children        christmas              cna 
##                1                1                1                1 
##          college         computer     constituents          content 
##                1                1                1                1 
##              cuz             dads            daily    datetimestamp 
##                1                1                1                1 
##              day         deathbed      description         deserved 
##                1                1                1                1 
##     dissertation             dolo            dsixl             etsy 
##                1                1                1                1 
##          fantasy         favorite            final           flight 
##                1                1                1                1 
##            grave            grind            guard           guilty 
##                1                1                1                1 
##          heading          holiday             hour           icloud 
##                1                1                1                1 
##            ihome            isdst            knees              lap 
##                1                1                1                1 
##           laptop           linked          lostken             mday 
##                1                1                1                1 
## melodifestivalen             menu             meta              min 
##                1                1                1                1 
##           mobile             moms              mon            mouth 
##                1                1                1                1 
##              new         official           origin         personal 
##                1                1                1                1 
##             pone             pure              rag          remixes 
##                1                1                1                1

Question 6 answer: The word that came as the most present after “be on my” was way, that make sense in the phrase.

—–Questão 7 —————

resultado_07<-NextWordIs("quite some ", "([Qq]uite+ +[Ss]ome+ +[^ ]+ )" )  
resultado_07  
## [[1]]
##          time     character          list       company      distance 
##           321             6             5             2             2 
##      language        people           way           100           118 
##             2             2             2             1             1 
## 4715158939362        author       content         ctime datetimestamp 
##             1             1             1             1             1 
##   description        extras       freedom           fun          hair 
##             1             1             1             1             1 
##       heading          hour   improvement   interesting         isdst 
##             1             1             1             1             1 
##          mday          meta           min           mon        months 
##             1             1             1             1             1 
##          news        origin           sec      terrible        things 
##             1             1             1             1             1 
##     tradition          wday          yday          year         years 
##             1             1             1             1             1

Question 7 answer: The word that came as the most present after “quite some” was time, that make sense in the phrase.

—–Questão 8 —————

resultado_08<-NextWordIs("his little ", "([Hh]is+ +[Ll]ittle+ +[^ ]+ )" )  
resultado_08  
## [[1]]
##        girl         guy         boy         gem     brother        blog 
##          98          63          45          34          32          28 
##         one      sister      beauty         man     project         bit 
##          27          23          21          19          17          16 
##       cutie        baby         kid       piece       piggy       thing 
##          16          15          14          14          12          12 
##      corner       heart        head        lady      number        book 
##          11          11          10          10          10           9 
##    brothers         dog      league       light        game       house 
##           9           9           9           9           8           8 
##        list         old        blue      finger       place        town 
##           8           8           7           7           7           7 
##       white       angel        body         box   character        face 
##           7           6           6           6           6           6 
##      family        hand       hands      person        spot      tidbit 
##           6           6           6           6           6           6 
##       video         bag       break        card       ditty        dude 
##           6           5           5           5           5           5 
##       girls        legs        life        note        tiny         car 
##           5           5           5           5           5           4 
##       child        feet       fella        film     fingers     journey 
##           4           4           4           4           4           4 
##      nugget        ones       patch    princess       scene         son 
##           4           4           4           4           4           4 
##       story  sweetheart     sweetie       world      yellow        area 
##           4           4           4           4           4           3 
##         ass       belly        bird      bundle       chair    daughter 
##           3           3           3           3           3           3 
##    exercise        eyes        fact      friend        gift       girly 
##           3           3           3           3           3           3 
##       group        idea      island       kitty        mama     miracle 
##           3           3           3           3           3           3 
##      monkey        peek personality        pink 
##           3           3           3           3

Question 8 answer: Two words from the list for question 8 was founs as the most present after “quite some” that are fingers with 4 occurences and eyes with 3, both makes sense in the phrase and the correct was fingers.

—–Questão 9 —————

resultado_09<-NextWordIs("during the ", "([Dd]uring+ +[TT]he+ +[^ ]+ )" )  
resultado_09  
## [[1]]
##     character          list     amendment       biggest           day 
##             6             5             2             2             2 
##         great      language           100           118 4176108837128 
##             2             2             1             1             1 
##           act        author          bees   believetour           big 
##             1             1             1             1             1 
##      birthday        bloody         booth          born        buccos 
##             1             1             1             1             1 
##    centennial    chronicles           cna           cnn     communist 
##             1             1             1             1             1 
##       content datetimestamp   description         first       heading 
##             1             1             1             1             1 
##          hour     institute         isdst          last          lion 
##             1             1             1             1             1 
##        lonely        lovely          mday          meta           min 
##             1             1             1             1             1 
##           mon         movie      notebook        origin        oscars 
##             1             1             1             1             1 
##         palio         plain          rise          saga           sec 
##             1             1             1             1             1 
##          wday         width          yday          year 
##             1             1             1             1

Question 9 answer: The only word in the corpus that are at the list is bad with 4 occurences but there were a lot of other words so bad is not a principal component…

—–Questão 10 —————

resultado_10<-NextWordIs("must be ",  "([Mm]ust+ +[Bb]e+ +[^ ]+ )" )  
resultado_10  
## [[1]]
##        able        done        made   something       taken    approved 
##         130         125         100          89          77          69 
##        nice    received   following     getting      really         one 
##          60          57          56          53          51          50 
##        paid     willing   submitted        good        like       given 
##          49          45          43          41          41          38 
##       going        said    prepared      pretty        used     careful 
##          37          37          35          35          33          32 
##        hard        kept        true        held        seen accompanied 
##          32          32          31          30          30          28 
##   addressed       ready     present  considered       crazy       wrong 
##          28          27          26          25          23          23 
##   completed    followed         put       close     feeling       lived 
##          22          22          22          21          21          21 
##     removed     stopped        time         new        part    provided 
##          21          21          21          20          20          20 
##        born         met   destroyed    thinking       filed       noted 
##          18          18          17          17          16          16 
##       tired     allowed     brought        felt   protected        read 
##          16          15          15          15          15          15 
##     talking    watching   wondering       aware       based         cut 
##          15          15          15          14          14          14 
##        open  understood    balanced      better       bored      coming 
##          14          14          13          13          13          13 
##      placed  postmarked   purchased       quite    replaced     someone 
##          13          13          13          13          13          13 
##      turned        free       great        just        sent       spent 
##          13          12          12          12          12          12 
##       tough      viewed     working       dealt    involved   preserved 
##          12          12          12          11          11          11 
##   respected         set      strong      tested        told     treated 
##          11          11          11          11          11          11 
##      within     another   available      broken 
##          11          10          10          10

Question 10 answer: Like other questions, there is no sequence for “must be” with any of the four words offered in the question. So the word insane was selected only for the sense it makes in the phrase, by common sense.

Thanks for reading.