In order to answer the questions of Quiz 2 to Week 3 will be used: >the grepl() command READING ALL DOCUMENTS from the three files from SwiftKey, >extract the documents that contains the group of words searched, >extract the words around the nuclear group of word searched >process this final extract to eliminate extra blanks, punctuation, words like preposition, articles >extract the word that results as complimente of the initial group of words. In this group of words we should find one of the options offered for each question in the quiz.
library(stringr)
library(stringi)
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
setwd("C:/Coursera/10_Data_Science_Capstone")
#Select ALL DOCUMENTS in Blogs
allBlogs <- readLines("en_US.blogs.txt")
#Select ALL DOCUMENTS in News
allNews <- readLines("en_US.news.txt")
#Select ALL DOCUMENTS in Twitters
allTwitter <- readLines("en_US.twitter.txt")
summary(allBlogs)
## Length Class Mode
## 899289 character character
summary(allNews)
## Length Class Mode
## 1010243 character character
summary(allTwitter)
## Length Class Mode
## 2360149 character character
tokenmaker <- function(x) {
corpus <- Corpus(VectorSource(x))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, PlainTextDocument)
# corpus <- tm_map(corpus, stemDocument)
corpus <- Corpus(VectorSource(corpus))
}
wordcounter <- function(x) {
dtm<-DocumentTermMatrix(x)
dtm_matrix <- as.matrix(dtm)
word_freq <- colSums(dtm_matrix)
word_freq <- sort(word_freq, decreasing = TRUE)
words <- names(word_freq)
return(list(words, word_freq))
}
NextWordIs <- function(x,y){
BQuest<-grepl(x, allBlogs, ignore.case=TRUE)
BDocs<-allBlogs[BQuest]
textoachado<-'a'
NextWordIs<-'a'
i<-length(BDocs)
if (i>0)
{
for (i in 1:i)
{ textoachado[i]<- str_extract(BDocs[i], y)
NextWordIs[i]<- stri_extract_last_words(textoachado[i])
}
}
NQuest<-grepl(x, allNews, ignore.case=TRUE)
NDocs<-allNews[NQuest]
j=length(NDocs)
if (j>0)
{
for (j in 1:j)
{ textoachado[i+j]<- str_extract(NDocs[j], y)
NextWordIs[i+j]<- stri_extract_last_words(textoachado[i+j])
}
}
TQuest<-grepl(x, allTwitter, ignore.case=TRUE)
TDocs<-allTwitter[TQuest]
k=length(TDocs)
if (k>0)
{
for (k in 1:k)
{ textoachado[i+j+k]<- str_extract(TDocs[k], y)
NextWordIs[i+j+k]<- stri_extract_last_words(textoachado[i+j+k])
}
}
bundle<-as.data.frame(NextWordIs, stringsAsFactors=FALSE)
summary (bundle)
blogs_token <- tokenmaker(bundle)
blogs_words <- wordcounter(blogs_token)
summary(nchar(bundle))
head(bundle)
tdm_Blogs<-TermDocumentMatrix(blogs_token)
m_Blogs<-as.matrix(tdm_Blogs)
v_Blogs<-sort(rowSums(m_Blogs),decreasing=TRUE)
d_Blogs<-data.frame(word=names(v_Blogs),freq=v_Blogs)
head(v_Blogs, 100)
return(list(head(v_Blogs,100)))
}
resultado_01<-NextWordIs("a case of ", "([Aa]+ +[Cc]ase+ +[Oo]f+ +[^ ]+ )" )
resultado_01
## [[1]]
## beer mistaken character list wine
## 21 12 6 5 5
## first miller one water criminal
## 4 4 4 4 3
## making severe waiting whether bad
## 3 3 3 3 2
## coors crossed defense finding food
## 2 2 2 2 2
## four great jet knowing language
## 2 2 2 2 2
## little mind pabst poor red
## 2 2 2 2 2
## spring thallium trial trying 100
## 2 2 2 2 1
## 118 96917510032654 acid acne amnesia
## 1 1 1 1 1
## anchor anxiety armed arrested art
## 1 1 1 1 1
## assembling author baboon bananas banks
## 1 1 1 1 1
## belvedere better bible boldly bolting
## 1 1 1 1 1
## books boosting bottles bowed boxer
## 1 1 1 1 1
## breaking bud budweiser bulk butterflies
## 1 1 1 1 1
## butternut buyer bwvaktboom cargument carpal
## 1 1 1 1 1
## carrots catching caveat cds cellulitis
## 1 1 1 1 1
## changing chicken chickenpox chore citation
## 1 1 1 1 1
## class cold colic collateral collegial
## 1 1 1 1 1
## conspicuous content continued cool corona
## 1 1 1 1 1
## council damaged datetimestamp degree deja
## 1 1 1 1 1
## delhi democratic dependent description dialing
## 1 1 1 1 1
resultado_02<-NextWordIs("would mean the ", "([Ww]ould+ +[Mm]ean+ +[Tt]he+ +[^ ]+ )" )
resultado_02
## [[1]]
## world character list end loss
## 169 6 5 4 3
## absolute entire language 100 118
## 2 2 2 1 1
## 3391780853271 accident airport angles author
## 1 1 1 1 1
## average board bull called central
## 1 1 1 1 1
## century content cooperation cworld datetimestamp
## 1 1 1 1 1
## death demolition description difference displacement
## 1 1 1 1 1
## drug heading hour isdst mday
## 1 1 1 1 1
## meta min mon new origin
## 1 1 1 1 1
## owner person poorest public raiders
## 1 1 1 1 1
## red release school sec sitter
## 1 1 1 1 1
## society state sun take transformation
## 1 1 1 1 1
## use wday worldd yday year
## 1 1 1 1 1
resultado_03<-NextWordIs("make me the ", "([Mm]ake+ +[Mm]e+ +[Tt]he+ +[^ ]+ )" )
resultado_03
## [[1]]
## happiest character list best language
## 29 6 5 2 2
## worlds 100 118 280200958252 asshole
## 2 1 1 1 1
## author bad biggest bun cfirst
## 1 1 1 1 1
## content coolest datetimestamp daughter description
## 1 1 1 1 1
## face girl heading hour isdst
## 1 1 1 1 1
## jackie kiddy manager mday meta
## 1 1 1 1 1
## min mon morbid mother number
## 1 1 1 1 1
## origin parent professional role scape
## 1 1 1 1 1
## scapegoat sec target undisputed universe
## 1 1 1 1 1
## way wday winner yday year
## 1 1 1 1 1
resultado_04<-NextWordIs("struggling ", "([Ss]truggling+ +[^ ]+ +[^ ]+ +[^ ]+ )" )
resultado_04
## [[1]]
## ends bit new said work back
## 30 16 10 10 10 9
## issues years afloat debt fact first
## 9 9 8 7 7 7
## health jobs keep life money pace
## 7 7 7 7 7 7
## time will character enough make sense
## 7 7 6 6 6 6
## list many part runs school since
## 5 5 5 5 5 5
## still things weight cars child debts
## 5 5 5 4 4 4
## difficult disease economic economy even games
## 4 4 4 4 4 4
## high identity last mortgage need now
## 4 4 4 4 4 4
## touring abuse addiction baby buy can
## 4 3 3 3 3 3
## common company countrys current dance day
## 3 3 3 3 3 3
## eating effects families field finally finances
## 3 3 3 3 3 3
## food found free funding game get
## 3 3 3 3 3 3
## great horse just loss mall much
## 3 3 3 3 3 3
## one particular past pay payments pressure
## 3 3 3 3 3 3
## problems question rates sales second self
## 3 3 3 3 3 3
## shortfalls shot social something striking students
## 3 3 3 3 3 3
## stuff think third times
## 3 3 3 3
resultado_04a<-NextWordIs("struggling ", "([Ss]truggling+ +[^ ]+ +[^ ]+ )" )
resultado_04a
## [[1]]
## find get make keep pay survive
## 104 87 70 55 32 28
## stay bit little recover understand maintain
## 24 18 18 15 15 14
## put figure now compete fit like
## 13 11 11 10 10 10
## market come depression hold raise school
## 10 9 9 9 9 9
## work breathe cope high character much
## 9 8 8 8 7 7
## provide score years afford breath control
## 7 7 7 6 6 6
## finish just meet reach schools see
## 6 6 6 6 6 6
## something weight year balance bring carry
## 6 6 6 5 5 5
## close deal establish grow life list
## 5 5 5 5 5 5
## mental walk accept addiction almost another
## 5 5 4 4 4 4
## attract back catch companies company contain
## 4 4 4 4 4 4
## create days decide form gain hard
## 4 4 4 4 4 4
## infertility live lot move musician new
## 4 4 4 4 4 4
## overcome pull racing rebuild remain remember
## 4 4 4 4 4 4
## season several start system adjust agency
## 4 4 4 4 3 3
## air best break budget cancer coming
## 3 3 3 3 3 3
## countries day district economy
## 3 3 3 3
resultado_04b<-NextWordIs("struggling ", "([Ss]truggling+ +[^ ]+ )" )
resultado_04b
## [[1]]
## economy financially students mightily schools
## 42 18 16 14 14
## right families just city homeowners
## 12 11 11 10 8
## readers struggling americans housing internet
## 8 8 7 7 7
## school since artist character even
## 7 7 6 6 6
## like teams artists cities companies
## 6 6 5 5 5
## company list middle offensively program
## 5 5 5 5 5
## trying urban writer actor daily
## 5 5 5 4 4
## horse neighborhoods nyc parents team
## 4 4 4 4 4
## towns academically business campaign casino
## 4 3 3 3 3
## children cleveland communities countries district
## 3 3 3 3 3
## districts early economically european franchise
## 3 3 3 3 3
## industry language lately learners nations
## 3 3 3 3 3
## now offense residents teachers today
## 3 3 3 3 3
## world yankees year young actress
## 3 3 3 3 2
## along angels bell black british
## 2 2 2 2 2
## casinos chain class closer consistency
## 2 2 2 2 2
## davis department desperately dont downtown
## 2 2 2 2 2
## east economies electronics every finances
## 2 2 2 2 2
## firm francisco going independent keep
## 2 2 2 2 2
## manufacturing meadowlands mother operator owings
## 2 2 2 2 2
resultado_05<-NextWordIs("date at the ", "([Dd]ate+ +[Aa]t+ +[Tt]he+ +[^ ]+ )" )
resultado_05
## [[1]]
## character end list time language
## 6 5 5 5 2
## university 100 118 2796950340271 app
## 2 1 1 1 1
## art author beverly bottom braves
## 1 1 1 1 1
## cake cheese chip content cricket
## 1 1 1 1 1
## cwrong datetimestamp description driskill firesky
## 1 1 1 1 1
## four grocery heading hour isdst
## 1 1 1 1 1
## lake magic mday medical meta
## 1 1 1 1 1
## min mon movie national naval
## 1 1 1 1 1
## next origin palafox power prado
## 1 1 1 1 1
## prestigious republican rex sec wday
## 1 1 1 1 1
## whiskey yday year
## 1 1 1
resultado_06<-NextWordIs("be on my ", "([Bb]e+ +[Oo]n+ +[Mm]y+ +[^ ]+ )" )
resultado_06
## [[1]]
## way mind list show
## 32 14 11 8
## character best game ipod
## 6 5 4 4
## radio side back background
## 4 4 3 3
## couch team top youtube
## 3 3 3 3
## dick facebook feet first
## 2 2 2 2
## friends language last next
## 2 2 2 2
## phone porch screen terms
## 2 2 2 2
## 100 118 7492880821228 album
## 1 1 1 1
## annual author bad bday
## 1 1 1 1
## belly bike birthday blog
## 1 1 1 1
## board break bus career
## 1 1 1 1
## channel children christmas cna
## 1 1 1 1
## college computer constituents content
## 1 1 1 1
## cuz dads daily datetimestamp
## 1 1 1 1
## day deathbed description deserved
## 1 1 1 1
## dissertation dolo dsixl etsy
## 1 1 1 1
## fantasy favorite final flight
## 1 1 1 1
## grave grind guard guilty
## 1 1 1 1
## heading holiday hour icloud
## 1 1 1 1
## ihome isdst knees lap
## 1 1 1 1
## laptop linked lostken mday
## 1 1 1 1
## melodifestivalen menu meta min
## 1 1 1 1
## mobile moms mon mouth
## 1 1 1 1
## new official origin personal
## 1 1 1 1
## pone pure rag remixes
## 1 1 1 1
resultado_07<-NextWordIs("quite some ", "([Qq]uite+ +[Ss]ome+ +[^ ]+ )" )
resultado_07
## [[1]]
## time character list company distance
## 321 6 5 2 2
## language people way 100 118
## 2 2 2 1 1
## 4715158939362 author content ctime datetimestamp
## 1 1 1 1 1
## description extras freedom fun hair
## 1 1 1 1 1
## heading hour improvement interesting isdst
## 1 1 1 1 1
## mday meta min mon months
## 1 1 1 1 1
## news origin sec terrible things
## 1 1 1 1 1
## tradition wday yday year years
## 1 1 1 1 1
resultado_08<-NextWordIs("his little ", "([Hh]is+ +[Ll]ittle+ +[^ ]+ )" )
resultado_08
## [[1]]
## girl guy boy gem brother blog
## 98 63 45 34 32 28
## one sister beauty man project bit
## 27 23 21 19 17 16
## cutie baby kid piece piggy thing
## 16 15 14 14 12 12
## corner heart head lady number book
## 11 11 10 10 10 9
## brothers dog league light game house
## 9 9 9 9 8 8
## list old blue finger place town
## 8 8 7 7 7 7
## white angel body box character face
## 7 6 6 6 6 6
## family hand hands person spot tidbit
## 6 6 6 6 6 6
## video bag break card ditty dude
## 6 5 5 5 5 5
## girls legs life note tiny car
## 5 5 5 5 5 4
## child feet fella film fingers journey
## 4 4 4 4 4 4
## nugget ones patch princess scene son
## 4 4 4 4 4 4
## story sweetheart sweetie world yellow area
## 4 4 4 4 4 3
## ass belly bird bundle chair daughter
## 3 3 3 3 3 3
## exercise eyes fact friend gift girly
## 3 3 3 3 3 3
## group idea island kitty mama miracle
## 3 3 3 3 3 3
## monkey peek personality pink
## 3 3 3 3
resultado_09<-NextWordIs("during the ", "([Dd]uring+ +[TT]he+ +[^ ]+ )" )
resultado_09
## [[1]]
## character list amendment biggest day
## 6 5 2 2 2
## great language 100 118 4176108837128
## 2 2 1 1 1
## act author bees believetour big
## 1 1 1 1 1
## birthday bloody booth born buccos
## 1 1 1 1 1
## centennial chronicles cna cnn communist
## 1 1 1 1 1
## content datetimestamp description first heading
## 1 1 1 1 1
## hour institute isdst last lion
## 1 1 1 1 1
## lonely lovely mday meta min
## 1 1 1 1 1
## mon movie notebook origin oscars
## 1 1 1 1 1
## palio plain rise saga sec
## 1 1 1 1 1
## wday width yday year
## 1 1 1 1
resultado_10<-NextWordIs("must be ", "([Mm]ust+ +[Bb]e+ +[^ ]+ )" )
resultado_10
## [[1]]
## able done made something taken approved
## 130 125 100 89 77 69
## nice received following getting really one
## 60 57 56 53 51 50
## paid willing submitted good like given
## 49 45 43 41 41 38
## going said prepared pretty used careful
## 37 37 35 35 33 32
## hard kept true held seen accompanied
## 32 32 31 30 30 28
## addressed ready present considered crazy wrong
## 28 27 26 25 23 23
## completed followed put close feeling lived
## 22 22 22 21 21 21
## removed stopped time new part provided
## 21 21 21 20 20 20
## born met destroyed thinking filed noted
## 18 18 17 17 16 16
## tired allowed brought felt protected read
## 16 15 15 15 15 15
## talking watching wondering aware based cut
## 15 15 15 14 14 14
## open understood balanced better bored coming
## 14 14 13 13 13 13
## placed postmarked purchased quite replaced someone
## 13 13 13 13 13 13
## turned free great just sent spent
## 13 12 12 12 12 12
## tough viewed working dealt involved preserved
## 12 12 12 11 11 11
## respected set strong tested told treated
## 11 11 11 11 11 11
## within another available broken
## 11 10 10 10
Thanks for reading.