Data Source

The corpora are collected from publicly available sources by a web crawler. The crawler checks for language, so as to mainly get texts consisting of the desired language*. Each entry is tagged with its date of publication. Where user comments are included, they are tagged with the date of the main entry. Once the raw corpus has been collected, it is parsed further to remove duplicate entries and split into individual lines. Approximately 50% of each entry is then deleted. Because no entry can be fully recreated, the entries are effectively anonymised; the project is also a non-profit venture.

# Raw data file: fetch the course archive once and extract it when needed.
fl_zipnm <- "Coursera-SwiftKey.zip"
fl_zipurl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# NOTE(review): machine-specific working directory. All relative paths in this
# report (the archive and the extracted "final/" folder) are resolved against it.
setwd("D:/Data Science/10-Project Capstone")

# Download only when the archive is not already on disk.
fl_fresh <- !file.exists(fl_zipnm)
if (fl_fresh) {
    # mode = "wb" requests a binary transfer so the zip is written byte-exact.
    download.file(fl_zipurl, fl_zipnm, mode = "wb")
}

# Extract after a fresh download (as before) and also when the archive is
# present but the extracted "final/" folder is missing.
if (fl_fresh || !dir.exists("final")) {
    unzip(fl_zipnm)
}

# Read one corpus file into a character vector, one element per line.
#   fl_path  : path of the text file to read.
#   skip_nul : passed to readLines(skipNul = ); TRUE skips embedded nul bytes.
# The connection is opened in binary mode and is always closed on exit,
# including when readLines() fails.
fn_readcorpus <- function(fl_path, skip_nul = FALSE) {
    con <- file(fl_path, open = "rb")
    on.exit(close(con), add = TRUE)
    readLines(con, encoding = "UTF-8", skipNul = skip_nul)
}

blog <- fn_readcorpus("final/en_US/en_US.blogs.txt")

# Only the twitter file is read with skipNul = TRUE, as in the original code.
twitter <- fn_readcorpus("final/en_US/en_US.twitter.txt", skip_nul = TRUE)

news <- fn_readcorpus("final/en_US/en_US.news.txt")

Exploratory Analysis

After loading the three files (blog, twitter, news), we can look at some of the data:

# Length, class and mode of the blog vector (one element per line read).
summary(blog)
##    Length     Class      Mode 
##    899288 character character
# First two blog lines.
head(blog,2)
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan <U+0093>gods<U+0094>."
## [2] "We love you Mr. Brown."
# Length, class and mode of the twitter vector (one element per line read).
summary(twitter)
##    Length     Class      Mode 
##   2360148 character character
# First two twitter lines.
head(twitter,2)
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."  
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
# Length, class and mode of the news vector (one element per line read).
summary(news)
##    Length     Class      Mode 
##   1010242 character character
# First two news lines.
head(news,2)
## [1] "He wasn't home alone, apparently."                                                                                                                        
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."

We can also compute some summary statistics for them:

# General string statistics for each corpus; the "Lines" and "Chars" entries
# feed the first two columns of the summary table.
st_blog <- stri_stats_general(blog)
st_twitter <- stri_stats_general(twitter)
st_news <- stri_stats_general(news)

# Total number of words per corpus (blog, twitter, news).
wordcounts <- vapply(
    list(blog, twitter, news),
    function(txt) sum(stri_count_words(txt)),
    integer(1)
)

# Length in characters of the longest line per corpus.
maxchar <- vapply(
    list(blog, twitter, news),
    function(txt) max(nchar(txt)),
    integer(1)
)

# In-memory size of each character vector, converted from bytes to MB.
size <- vapply(
    list(blog, twitter, news),
    function(txt) as.numeric(object.size(txt)) / 1024 / 1024,
    numeric(1)
)

# One row per corpus, one column per statistic.
dt_stats <- data.frame(
    "Lines" = c(st_blog[["Lines"]], st_twitter[["Lines"]], st_news[["Lines"]]),
    "Chars" = c(st_blog[["Chars"]], st_twitter[["Chars"]], st_news[["Chars"]]),
    "Word Count" = wordcounts,
    "Max Char" = maxchar,
    "Size MB" = size,
    row.names = c("Blog", "Twitter", "News"),
    check.names = FALSE
)
dt_stats
##           Lines     Chars Word Count Max Char  Size MB
## Blog     899288 206824382   37546246    40833 231.3409
## Twitter 2360148 162096241   30093410      140 257.2081
## News    1010242 203223154   34762395    11384 230.3640

Data Cleaning

Given the amount of data in each file, working with a 40% training sample would require far more resources than my own computer can provide. That’s why I chose to take a random sample of 10,000 lines from each source (blog, twitter, news) and then merge the three samples into a single training data set.

# Fix the RNG seed so the same lines are drawn on every run; without it the
# training set (and every table and plot derived from it) changes per run.
# NOTE: the console output recorded below predates the seed, so a fresh run
# will print different lines.
set.seed(12345)

# Number of lines drawn, without replacement, from each source.
n_smpl <- 10000

tr_blog <- sample(blog, n_smpl)
tr_twitter <- sample(twitter, n_smpl)
tr_news <- sample(news, n_smpl)

# Single training vector: blog lines first, then twitter, then news.
dt_trnng <- c(tr_blog, tr_twitter, tr_news)
head(dt_trnng, 2)
## [1] "Chicken- 4 Lbs"                                                                                                                                                                                                                                                                      
## [2] "If this is really what you want, then it is worth all the work, the writing, the re-writing, the doubts and the fears. if you 'Believe' in your work and you are prepared to settle in for the long haul, then you will go through the eye of the needle and your work will be read."

Remove punctuation, numbers, other special characters and English stop words, and then build the corpus from which the term matrices are defined.

# NOTE(review): load() replaces any in-memory dt_trnng with the copy stored in
# dt_trnng.RData. Nothing in this section writes that file - confirm where it
# is saved.
load("dt_trnng.RData")

dt_trnng <- tolower(dt_trnng)

# Map typographic apostrophes to the ASCII one so that contractions in the text
# are spelled the same way as in the stop-word list.
dt_trnng <- gsub("[\u2018\u2019]", "'", dt_trnng)

# Remove stop words BEFORE stripping punctuation: list entries such as "don't"
# or "i'm" can only match while the apostrophes are still in the text.
dt_trnng <- removeWords(dt_trnng, stopwords("english"))

# Drop everything that is neither alphanumeric nor whitespace.
dt_trnng <- gsub("[^[:alnum:][:space:]]+", "", dt_trnng)

vc_trnng <- VCorpus(VectorSource(dt_trnng))
vc_trnng <- tm_map(vc_trnng, removeNumbers)

# Collapse whitespace last, because the removals above leave gaps behind.
vc_trnng <- tm_map(vc_trnng, stripWhitespace)

NGram Model

Now I try to build a basic n-gram model to predict the next word based on the previous 1, 2 or 3 words.

# Tokenizers: each one splits its input into n-grams of a single fixed length
# (min and max are set to the same value).
fn_unigram <- function(x) {
    NGramTokenizer(x, Weka_control(min = 1, max = 1))
}
fn_bigram <- function(x) {
    NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
fn_trigram <- function(x) {
    NGramTokenizer(x, Weka_control(min = 3, max = 3))
}

# One document-term matrix per n-gram length, all built from the cleaned corpus.
tm_unigram <- DocumentTermMatrix(
    vc_trnng,
    control = list(tokenize = fn_unigram)
)
tm_bigram <- DocumentTermMatrix(
    vc_trnng,
    control = list(tokenize = fn_bigram)
)
tm_trigram <- DocumentTermMatrix(
    vc_trnng,
    control = list(tokenize = fn_trigram)
)

# Preview the unigram matrix restricted to its first six term columns.
inspect(tm_unigram[,1:6])
## <<DocumentTermMatrix (documents: 30000, terms: 6)>>
## Non-/sparse entries: 7/179993
## Sparsity           : 100%
## Maximal term length: 8
## Weighting          : term frequency (tf)
## Sample             :
##        Terms
## Docs    aaa aaaaargh aaahhh aaas aafpgas aahahaha
##   1       0        0      0    0       0        0
##   10621   0        0      0    0       1        0
##   12695   0        0      0    0       0        1
##   12935   0        0      1    0       0        0
##   15255   0        1      0    0       0        0
##   2       0        0      0    0       0        0
##   27024   0        0      0    1       0        0
##   27416   1        0      0    0       0        0
##   29919   1        0      0    0       0        0
##   3       0        0      0    0       0        0
# Preview the bigram matrix restricted to its first six term columns.
inspect(tm_bigram[,1:6])
## <<DocumentTermMatrix (documents: 30000, terms: 6)>>
## Non-/sparse entries: 6/179994
## Sparsity           : 100%
## Maximal term length: 17
## Weighting          : term frequency (tf)
## Sample             :
##        Terms
## Docs    aa christian aa credit aa first aa intersectional aa longterm
##   1                0         0        0                 0           0
##   2                0         0        0                 0           0
##   28076            0         1        0                 0           1
##   28675            0         0        0                 1           0
##   3                0         0        0                 0           0
##   4                0         0        0                 0           0
##   4819             1         0        1                 0           0
##   5                0         0        0                 0           0
##   6                0         0        0                 0           0
##   7425             0         0        0                 0           0
##        Terms
## Docs    aa min
##   1          0
##   2          0
##   28076      0
##   28675      0
##   3          0
##   4          0
##   4819       0
##   5          0
##   6          0
##   7425       1
# Preview the trigram matrix restricted to its first six term columns.
inspect(tm_trigram[,1:6])
## <<DocumentTermMatrix (documents: 30000, terms: 6)>>
## Non-/sparse entries: 6/179994
## Sparsity           : 100%
## Maximal term length: 24
## Weighting          : term frequency (tf)
## Sample             :
##        Terms
## Docs    aa christian fellowship aa credit rating aa first century
##   1                           0                0                0
##   2                           0                0                0
##   28076                       0                1                0
##   28675                       0                0                0
##   3                           0                0                0
##   4                           0                0                0
##   4819                        1                0                1
##   5                           0                0                0
##   6                           0                0                0
##   7425                        0                0                0
##        Terms
## Docs    aa intersectional titles aa longterm rating aa min flowers
##   1                            0                  0              0
##   2                            0                  0              0
##   28076                        0                  1              0
##   28675                        1                  0              0
##   3                            0                  0              0
##   4                            0                  0              0
##   4819                         0                  0              0
##   5                            0                  0              0
##   6                            0                  0              0
##   7425                         0                  0              1

Plotting

Now we can calculate the term frequencies for the 1-, 2- and 3-word n-grams.

# Total frequency of every term: tidy each matrix into a long table, then sum
# the count column within each term.
df_unigram <- tidy(tm_unigram) %>%
    data.frame(stringsAsFactors = FALSE) %>%
    group_by(term) %>%
    summarise(freq = sum(count))

df_bigram <- tidy(tm_bigram) %>%
    data.frame(stringsAsFactors = FALSE) %>%
    group_by(term) %>%
    summarise(freq = sum(count))

df_trigram <- tidy(tm_trigram) %>%
    data.frame(stringsAsFactors = FALSE) %>%
    group_by(term) %>%
    summarise(freq = sum(count))

Then I take the top 40, 30 and 20 terms for the unigrams, bigrams and trigrams, respectively.

# Top 40 unigrams by frequency. head() returns at most the requested number of
# rows, so a short table is not padded with NA rows the way `[1:40, ]` would.
tp_unigram <- head(df_unigram[order(-df_unigram$freq), ], 40)
wordcloud(tp_unigram$term, tp_unigram$freq, colors = brewer.pal(8, "RdPu"))

# Bars are ordered by decreasing frequency instead of alphabetically, so the
# ranking can be read directly off the chart; xlab() keeps the axis title.
ggplot(tp_unigram, aes(x = reorder(term, -freq), y = freq)) +
        geom_bar(stat = "identity", fill = "#fa9fb5") +
        xlab("term") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Top 30 bigrams by frequency.
tp_bigram <- head(df_bigram[order(-df_bigram$freq), ], 30)
wordcloud(tp_bigram$term, tp_bigram$freq, colors = brewer.pal(8, "YlOrRd"))

ggplot(tp_bigram, aes(x = reorder(term, -freq), y = freq)) +
        geom_bar(stat = "identity", fill = "#ffcc99") +
        xlab("term") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Top 20 trigrams by frequency.
tp_trigram <- head(df_trigram[order(-df_trigram$freq), ], 20)
wordcloud(tp_trigram$term, tp_trigram$freq, colors = brewer.pal(8, "PuBuGn"))

ggplot(tp_trigram, aes(x = reorder(term, -freq), y = freq)) +
        geom_bar(stat = "identity", fill = "#1c9099") +
        xlab("term") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))