The corpora are collected from publicly available sources by a web crawler. The crawler checks for language, so as to mainly get texts consisting of the desired language*. Each entry is tagged with its date of publication. Where user comments are included, they are tagged with the date of the main entry. Once the raw corpus has been collected, it is parsed further to remove duplicate entries and to split it into individual lines. Approximately 50% of each entry is then deleted. Since you cannot fully recreate any entries, the entries are anonymised and this is a non-profit venture.
# Raw data file ----
# Local archive name and the URL it is downloaded from.
fl_zipnm <- "Coursera-SwiftKey.zip"
fl_zipurl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# NOTE(review): hard-coded, machine-specific working directory. Kept so the
# relative paths used by the rest of the script keep resolving as before.
setwd("D:/Data Science/10-Project Capstone")
if (!file.exists(fl_zipnm)) {
  # mode = "wb" forces a binary transfer; on Windows the default mode can
  # corrupt a zip archive.
  download.file(fl_zipurl, fl_zipnm, mode = "wb")
  unzip(fl_zipnm)
} else if (!dir.exists("final")) {
  # The archive is already present but its "final" directory (read later by
  # this script) is missing, so extract it instead of failing further down.
  unzip(fl_zipnm)
}
# Read all lines of a text file, marking them as UTF-8.
#
# path: path of the file to read.
# Returns a character vector with one element per line.
#
# The file is opened in binary mode ("rb"), as the original code did
# (presumably so control bytes in the raw text do not end the read early on
# Windows -- TODO confirm). on.exit() closes the connection even when
# readLines() fails, and skipNul = TRUE is applied to every file so embedded
# NUL bytes are dropped consistently instead of only for the twitter file.
fn_read_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blog <- fn_read_lines("final/en_US/en_US.blogs.txt")
twitter <- fn_read_lines("final/en_US/en_US.twitter.txt")
news <- fn_read_lines("final/en_US/en_US.news.txt")
After loading the three files (blog, twitter, news), we can look at some of the data.
# Blog corpus: object summary, then its first two lines.
summary(blog)
## Length Class Mode
## 899288 character character
head(blog, n = 2L)
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan <U+0093>gods<U+0094>."
## [2] "We love you Mr. Brown."
# Twitter corpus: object summary, then its first two lines.
summary(twitter)
## Length Class Mode
## 2360148 character character
head(twitter, n = 2L)
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
# News corpus: object summary, then its first two lines.
summary(news)
## Length Class Mode
## 1010242 character character
head(news, n = 2L)
## [1] "He wasn't home alone, apparently."
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."
We then compute some summary statistics for them.
# General statistics for each corpus; elements 1 and 3 of each result are
# used below as the "Lines" and "Chars" columns.
st_blog <- stri_stats_general(blog)
st_twitter <- stri_stats_general(twitter)
st_news <- stri_stats_general(news)

# Total word count per corpus, in the order blog, twitter, news.
wordcounts <- unlist(lapply(
  list(blog, twitter, news),
  function(txt) sum(stri_count_words(txt))
))

# Length of the longest line per corpus.
maxchar <- unlist(lapply(
  list(blog, twitter, news),
  function(txt) max(nchar(txt))
))

# In-memory size of each corpus object, converted from bytes to megabytes.
size <- unlist(lapply(
  list(blog, twitter, news),
  function(txt) object.size(txt) / 1024 / 1024
))

# One row per corpus, one column per statistic.
dt_stats <- data.frame(
  "Lines" = c(st_blog[[1]], st_twitter[[1]], st_news[[1]]),
  "Chars" = c(st_blog[[3]], st_twitter[[3]], st_news[[3]]),
  "Word Count" = wordcounts,
  "Max Char" = maxchar,
  "Size MB" = size,
  row.names = c("Blog", "Twitter", "News"),
  check.names = FALSE
)
dt_stats
## Lines Chars Word Count Max Char Size MB
## Blog 899288 206824382 37546246 40833 231.3409
## Twitter 2360148 162096241 30093410 140 257.2081
## News 1010242 203223154 34762395 11384 230.3640
Given the amount of data in each file, working with a 40% training sample would require far more resources than my own computer can provide. That is why I chose to draw a random sample of 10,000 lines from each source (blog, twitter, news) and then merge the three samples into a single training data set.
# Fix the RNG seed so the same lines are drawn on every run; without it the
# training sample, and every n-gram table derived from it, changes each time
# the script is executed.
set.seed(1234)
# Draw 10,000 random lines from each source.
tr_blog <- sample(blog, 10000)
tr_twitter <- sample(twitter, 10000)
tr_news <- sample(news, 10000)
# Combine the three samples into one training vector: blog, twitter, news.
dt_trnng <- c(tr_blog, tr_twitter, tr_news)
head(dt_trnng, 2)
# The output below was recorded from an earlier, unseeded run and will not
# match the seeded sample.
## [1] "Chicken- 4 Lbs"
## [2] "If this is really what you want, then it is worth all the work, the writing, the re-writing, the doubts and the fears. if you 'Believe' in your work and you are prepared to settle in for the long haul, then you will go through the eye of the needle and your work will be read."
Remove punctuation, numbers, and other special characters, and then define the term matrix.
# Reuse the cached training sample when it exists (presumably a previously
# saved `dt_trnng` -- verify), so results stay comparable across sessions.
# Otherwise cache the sample currently in memory. The load() used to be
# unconditional and stopped the script whenever the file was missing.
if (file.exists("dt_trnng.RData")) {
  load("dt_trnng.RData")
} else {
  save(dt_trnng, file = "dt_trnng.RData")
}
# Delete every run of characters that is neither alphanumeric nor whitespace,
# then lower-case the text.
# NOTE(review): this also removes apostrophes ("don't" becomes "dont"), so
# stop words written with an apostrophe will likely no longer match -- confirm
# whether that is intended.
dt_trnng <- tolower(gsub("[^[:alnum:][:space:]]+", "", dt_trnng))
# One corpus document per sampled line.
vc_trnng <- VCorpus(VectorSource(dt_trnng))
vc_trnng <- tm_map(vc_trnng, removeNumbers)
vc_trnng <- tm_map(vc_trnng, removeWords, stopwords("english"))
# Collapse whitespace last, so the gaps left behind by the number and
# stop-word removal are cleaned up too.
vc_trnng <- tm_map(vc_trnng, stripWhitespace)
Now I try to build a basic n-gram model to predict the next word based on the previous 1, 2 or 3 words.
# Tokenizers producing n-grams of exactly one size each (min == max).
# x: the document handed over by DocumentTermMatrix().
fn_unigram <- function(x) {
  one_word <- Weka_control(min = 1, max = 1)
  NGramTokenizer(x, one_word)
}
fn_bigram <- function(x) {
  two_words <- Weka_control(min = 2, max = 2)
  NGramTokenizer(x, two_words)
}
fn_trigram <- function(x) {
  three_words <- Weka_control(min = 3, max = 3)
  NGramTokenizer(x, three_words)
}

# One document-term matrix per n-gram size, all built from the same corpus.
tm_unigram <- DocumentTermMatrix(
  vc_trnng,
  control = list(tokenize = fn_unigram)
)
tm_bigram <- DocumentTermMatrix(
  vc_trnng,
  control = list(tokenize = fn_bigram)
)
tm_trigram <- DocumentTermMatrix(
  vc_trnng,
  control = list(tokenize = fn_trigram)
)
# Preview the first six term columns of the unigram matrix.
inspect(tm_unigram[, seq_len(6)])
## <<DocumentTermMatrix (documents: 30000, terms: 6)>>
## Non-/sparse entries: 7/179993
## Sparsity : 100%
## Maximal term length: 8
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs aaa aaaaargh aaahhh aaas aafpgas aahahaha
## 1 0 0 0 0 0 0
## 10621 0 0 0 0 1 0
## 12695 0 0 0 0 0 1
## 12935 0 0 1 0 0 0
## 15255 0 1 0 0 0 0
## 2 0 0 0 0 0 0
## 27024 0 0 0 1 0 0
## 27416 1 0 0 0 0 0
## 29919 1 0 0 0 0 0
## 3 0 0 0 0 0 0
# Preview the first six term columns of the bigram matrix.
inspect(tm_bigram[, seq_len(6)])
## <<DocumentTermMatrix (documents: 30000, terms: 6)>>
## Non-/sparse entries: 6/179994
## Sparsity : 100%
## Maximal term length: 17
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs aa christian aa credit aa first aa intersectional aa longterm
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 28076 0 1 0 0 1
## 28675 0 0 0 1 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 4819 1 0 1 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## 7425 0 0 0 0 0
## Terms
## Docs aa min
## 1 0
## 2 0
## 28076 0
## 28675 0
## 3 0
## 4 0
## 4819 0
## 5 0
## 6 0
## 7425 1
# Preview the first six term columns of the trigram matrix.
inspect(tm_trigram[, seq_len(6)])
## <<DocumentTermMatrix (documents: 30000, terms: 6)>>
## Non-/sparse entries: 6/179994
## Sparsity : 100%
## Maximal term length: 24
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs aa christian fellowship aa credit rating aa first century
## 1 0 0 0
## 2 0 0 0
## 28076 0 1 0
## 28675 0 0 0
## 3 0 0 0
## 4 0 0 0
## 4819 1 0 1
## 5 0 0 0
## 6 0 0 0
## 7425 0 0 0
## Terms
## Docs aa intersectional titles aa longterm rating aa min flowers
## 1 0 0 0
## 2 0 0 0
## 28076 0 1 0
## 28675 1 0 0
## 3 0 0 0
## 4 0 0 0
## 4819 0 0 0
## 5 0 0 0
## 6 0 0 0
## 7425 0 0 1
Now we can calculate the term frequencies for the 1-, 2- and 3-word n-grams.
# Total frequency of every term: tidy each document-term matrix into
# long form, then sum the `count` column within each `term`.
df_unigram <- tidy(tm_unigram) %>%
  data.frame(stringsAsFactors = FALSE) %>%
  group_by(term) %>%
  summarise(freq = sum(count))
df_bigram <- tidy(tm_bigram) %>%
  data.frame(stringsAsFactors = FALSE) %>%
  group_by(term) %>%
  summarise(freq = sum(count))
df_trigram <- tidy(tm_trigram) %>%
  data.frame(stringsAsFactors = FALSE) %>%
  group_by(term) %>%
  summarise(freq = sum(count))
Then I take the top 40, 30 and 20 terms for the unigrams, bigrams and trigrams, respectively.
# For each n-gram size: keep the most frequent terms, draw a word cloud and a
# bar chart of their frequencies.
#
# head() replaces the former `[1:N, ]` subsetting, which asks for rows that do
# not exist when a table has fewer than N terms; head() returns only the rows
# that are there. reorder(term, -freq) orders the bars from most to least
# frequent instead of alphabetically, and labs() keeps the axis title "term".

# Top 40 unigrams.
tp_unigram <- head(df_unigram[order(-df_unigram$freq), ], 40)
wordcloud(tp_unigram$term, tp_unigram$freq, colors = brewer.pal(8, "RdPu"))
ggplot(tp_unigram, aes(x = reorder(term, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "#fa9fb5") +
  labs(x = "term") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Top 30 bigrams.
tp_bigram <- head(df_bigram[order(-df_bigram$freq), ], 30)
wordcloud(tp_bigram$term, tp_bigram$freq, colors = brewer.pal(8, "YlOrRd"))
ggplot(tp_bigram, aes(x = reorder(term, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "#ffcc99") +
  labs(x = "term") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Top 20 trigrams.
tp_trigram <- head(df_trigram[order(-df_trigram$freq), ], 20)
wordcloud(tp_trigram$term, tp_trigram$freq, colors = brewer.pal(8, "PuBuGn"))
ggplot(tp_trigram, aes(x = reorder(term, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "#1c9099") +
  labs(x = "term") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))