According to the course dataset information, the data come from a corpus called HC Corpora (the original site is no longer reachable, but an archived copy can be seen at https://web-beta.archive.org/web/20160930083655/http://www.corpora.heliohost.org/aboutcorpus.html). The corpora were collected from publicly available sources by a web crawler.
The dataset can be downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. Once downloaded and uncompressed, it consists of four folders corresponding to four different languages (German, English, Finnish, and Russian), each containing three files from three different text sources (blogs, news, and Twitter):
setwd("C:\\Users\\joeljgeorge\\Downloads\\final\\en_US")
blogs<-readLines("en_US.blogs.txt",warn=FALSE,encoding="UTF-8")
news<-readLines("en_US.news.txt",warn=FALSE,encoding="UTF-8")
twitter<-readLines("en_US.twitter.txt",warn=FALSE,encoding="UTF-8")
I set the working directory and load the three English files.
For any data analysis experiment, we need to summarize the data. Our analysis starts with a summary table including, for each file in the bundle, its size, number of lines, number of characters, and number of words.
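For completeness, the download and extraction step described above could also be scripted from R. This is only a sketch: the destination file name and the use of the default extraction directory are assumptions, not part of the original analysis.
# download and unzip the capstone dataset (sketch; file name is an arbitrary choice)
zip_url<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(zip_url,destfile="Coursera-SwiftKey.zip",mode="wb")
# extracts the language folders, including the final/en_US folder used above
unzip("Coursera-SwiftKey.zip")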
size_blogs<-file.size(path="C:\\Users\\joeljgeorge\\Downloads\\final\\en_US\\en_US.blogs.txt")/2^20
size_news<-file.size(path="C:\\Users\\joeljgeorge\\Downloads\\final\\en_US\\en_US.news.txt")/2^20
size_twitter<-file.size(path="C:\\Users\\joeljgeorge\\Downloads\\final\\en_US\\en_US.twitter.txt")/2^20
len_blogs<-length(blogs)
len_news<-length(news)
len_twitter<-length(twitter)
nchar_blogs<-sum(nchar(blogs))
nchar_news<-sum(nchar(news))
nchar_twitter<-sum(nchar(twitter))
library(stringi)
nword_blogs<-stri_stats_latex(blogs)[4]
nword_news<-stri_stats_latex(news)[4]
nword_twitter<-stri_stats_latex(twitter)[4]
table<-data.frame("File Name"=c("Blogs","News","Twitter"),
"File Size(MB)"=c(size_blogs,size_news,size_twitter),
"Num of rows"=c(len_blogs,len_news,len_twitter),
"Num of character"=c(nchar_blogs,nchar_news,nchar_twitter),
"Num of words"=c(nword_blogs,nword_news,nword_twitter))
table
## File.Name File.Size.MB. Num.of.rows Num.of.character Num.of.words
## 1 Blogs 200.4242 899288 206824505 37570839
## 2 News 196.2775 77259 15639408 2651432
## 3 Twitter 159.3641 2360148 162096031 30451128
The table summarizes each file's size, number of rows, number of characters, and number of words.
Once the dataset and its details have been introduced, the following step is cleaning. First, the full text of the blogs, news, and Twitter files is loaded, using the flag skipNul = TRUE when reading lines in order to skip embedded nulls, and opening en_US.news.txt in binary mode ('rb') so that the warning "incomplete final line found on …" is avoided, as suggested in the course discussion forums. After loading, a 1% sample of the data is taken.
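The loading chunk at the top of the report calls readLines() directly; a minimal sketch of the variant described here, with skipNul = TRUE and a binary connection for en_US.news.txt, would look like this (same file paths as before):
# read blogs and twitter line by line, skipping embedded nulls
blogs<-readLines("en_US.blogs.txt",encoding="UTF-8",skipNul=TRUE,warn=FALSE)
twitter<-readLines("en_US.twitter.txt",encoding="UTF-8",skipNul=TRUE,warn=FALSE)
# open the news file as a binary connection to avoid the
# "incomplete final line found on ..." warning
con<-file("en_US.news.txt",open="rb")
news<-readLines(con,encoding="UTF-8",skipNul=TRUE,warn=FALSE)
close(con)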
set.seed(12345)
blogs1<-iconv(blogs,"latin1","ASCII",sub="")
news1<-iconv(news,"latin1","ASCII",sub="")
twitter1<-iconv(twitter,"latin1","ASCII",sub="")
rm(blogs)
rm(news)
rm(twitter)
# sample data set only 1% of each file
sample_data<-c(sample(blogs1,length(blogs1)*0.01),
sample(news1,length(news1)*0.01),
sample(twitter1,length(twitter1)*0.01))
rm(blogs1)
rm(news1)
rm(twitter1)
The data sets are very large, so I use the sample() function to draw 1% of each file.
library(tm)
## Loading required package: NLP
library(NLP)
corpus<-VCorpus(VectorSource(sample_data))
corpus1<-tm_map(corpus,removePunctuation)
corpus2<-tm_map(corpus1,stripWhitespace)
# wrap base tolower in content_transformer() so the corpus structure is preserved
corpus3<-tm_map(corpus2,content_transformer(tolower))
corpus4<-tm_map(corpus3,removeNumbers)
corpus5<-tm_map(corpus4,PlainTextDocument)
corpus6<-tm_map(corpus5,removeWords,stopwords("english"))
corpus_result<-data.frame(text=unlist(sapply(corpus6,'[',"content")),stringsAsFactors=FALSE)
head(corpus_result)
## text
## 1 put another way spirit sites mission bollocks
## 2 regrets youth
## 3 tom see
## 4 see fault evolution
## 5 seriously wells youngs bull crap selling uk next year get sorted want drinking christmas
## 6 answer pretty straightforward need muscle biopsy painful muscles yes know something looking forward however place can find virus will happy help find virus causing severe disabling disease mitochondria
rm(corpus)
rm(corpus1)
rm(corpus2)
rm(corpus3)
rm(corpus4)
rm(corpus5)
I build the corpus and check it by converting the result to a data frame.
This step starts with the creation of term-document matrices, which allow one to find the occurrences of words in the corpus, that is, which words/combinations present the highest frequencies. For this, the TermDocumentMatrix() function from the tm package and N-gram tokenizers from the RWeka package are used. Specifically, three matrices are built: for single words (1-grams), 2-grams, and 3-grams. Once the matrices are built, frequencies are calculated and sorted. As a result, plots displaying the 10 most frequent words/combinations are shown.
library(RWeka)
one<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
two<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
thr<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
one_table<-TermDocumentMatrix(corpus6,control=list(tokenize=one))
two_table<-TermDocumentMatrix(corpus6,control=list(tokenize=two))
thr_table<-TermDocumentMatrix(corpus6,control=list(tokenize=thr))
one_corpus<-findFreqTerms(one_table,lowfreq=1000)
two_corpus<-findFreqTerms(two_table,lowfreq=80)
thr_corpus<-findFreqTerms(thr_table,lowfreq=10)
one_corpus_num<-rowSums(as.matrix(one_table[one_corpus,]))
one_corpus_table<-data.frame(Word=names(one_corpus_num),frequency=one_corpus_num)
one_corpus_sort<-one_corpus_table[order(-one_corpus_table$frequency),]
head(one_corpus_sort)
## Word frequency
## just just 2576
## like like 2218
## will will 2211
## one one 2049
## get get 1869
## can can 1866
two_corpus_num<-rowSums(as.matrix(two_table[two_corpus,]))
two_corpus_table<-data.frame(Word=names(two_corpus_num),frequency=two_corpus_num)
two_corpus_sort<-two_corpus_table[order(-two_corpus_table$frequency),]
head(two_corpus_sort)
## Word frequency
## cant wait cant wait 208
## right now right now 206
## dont know dont know 164
## last night last night 148
## im going im going 130
## feel like feel like 125
thr_corpus_num<-rowSums(as.matrix(thr_table[thr_corpus,]))
thr_corpus_table<-data.frame(Word=names(thr_corpus_num),frequency=thr_corpus_num)
thr_corpus_sort<-thr_corpus_table[order(-thr_corpus_table$frequency),]
head(thr_corpus_sort)
## Word frequency
## cant wait see cant wait see 45
## happy mothers day happy mothers day 36
## happy new year happy new year 24
## im pretty sure im pretty sure 18
## italy lakes holidays italy lakes holidays 18
## little italy boston little italy boston 17
The words and their frequencies are extracted for each N-gram.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
one_g<-ggplot(one_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
one_g<-one_g+geom_bar(stat="identity")
one_g<-one_g+labs(title="Unigrams",x="Words",y="Frequency")
one_g<-one_g+theme(axis.text.x=element_text(angle=90))
one_g
two_g<-ggplot(two_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
two_g<-two_g+geom_bar(stat="identity")
two_g<-two_g+labs(title="Bigrams",x="Words",y="Frequency")
two_g<-two_g+theme(axis.text.x=element_text(angle=90))
two_g
thr_g<-ggplot(thr_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
thr_g<-thr_g+geom_bar(stat="identity")
thr_g<-thr_g+labs(title="Trigrams",x="Words",y="Frequency")
thr_g<-thr_g+theme(axis.text.x=element_text(angle=90))
thr_g
The plots show the 10 most frequent terms for each N-gram, so I can see which words and combinations occur most often in the sampled files.
The next step of the project will be to build a predictive algorithm based on N-gram lookups, computing probabilities for the next word given the previous words and backing off to a lower-order model (e.g. from 3-grams to 2-grams, and so forth) when needed. Later, a web app (using Shiny) will be developed that uses this algorithm to suggest the next word to the user.
This document has been generated using R Markdown. Its .Rmd source code can be found at: https://github.com/laplata2003/data-science-capstone-week2-milestone-report.
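As a rough illustration of the intended backoff lookup, a toy predictor could be built directly on the frequency tables above. This is only a sketch of the idea, not the final implementation: the helper name predict_next and its details are assumptions, and a real model would use proper probabilities rather than raw counts.
# sketch: predict the next word from the last one or two words typed,
# backing off from the trigram table to the bigram table, then to unigrams
predict_next<-function(phrase){
  words<-tail(unlist(strsplit(tolower(phrase),"\\s+")),2)
  # try trigram completions first: "w1 w2 ?"
  if(length(words)==2){
    pattern<-paste0("^",paste(words,collapse=" ")," ")
    hits<-thr_corpus_sort[grepl(pattern,thr_corpus_sort$Word),]
    # rows are already sorted by frequency, so the first hit is the best guess
    if(nrow(hits)>0) return(sub(pattern,"",as.character(hits$Word[1])))
  }
  # back off to bigram completions: "w2 ?"
  pattern<-paste0("^",tail(words,1)," ")
  hits<-two_corpus_sort[grepl(pattern,two_corpus_sort$Word),]
  if(nrow(hits)>0) return(sub(pattern,"",as.character(hits$Word[1])))
  # final fallback: the most frequent unigram
  as.character(one_corpus_sort$Word[1])
}
predict_next("cant wait")  # returns "see", given the trigram frequencies shown above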