HC Corpora is a collection of corpora for various languages, freely available to download. The corpora have been collected from numerous web pages, with the aim of obtaining a varied and comprehensive sample of current usage in each language.
This analysis is based on three English-language text files taken from blogs, news, and Twitter. The goal of this project is to create a predictive model that can suggest the next word to users as they type a line of text. The data can be downloaded from here.
# Read the three English source files, skipping embedded NULs
enNews    <- readLines("en_US.news.txt",    skipNul = TRUE, encoding = 'UTF-8')
enTwitter <- readLines("en_US.twitter.txt", skipNul = TRUE, encoding = 'UTF-8')
enBlogs   <- readLines("en_US.blogs.txt",   skipNul = TRUE, encoding = 'UTF-8')
Summary of the three files:

| File              | Number of Lines | Total Word Count (Millions) |
|-------------------|-----------------|-----------------------------|
| en_US.blogs.txt   | 899,288         | 37.19                       |
| en_US.news.txt    | 1,010,242       | 34.39                       |
| en_US.twitter.txt | 2,360,148       | 30.44                       |
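These counts can be reproduced with readLines() and stri_count_words() from the stringi package loaded above. A minimal sketch (the helper name summarizeFile is ours):

library(stringi)

# Summarize one file: line count and total word count (in millions)
summarizeFile <- function(path)
{
  lines <- readLines(path, skipNul = TRUE, encoding = "UTF-8")
  data.frame(File = basename(path),
             Lines = length(lines),
             Words.Millions = round(sum(stri_count_words(lines)) / 1e6, 2))
}

summarizeFile("en_US.blogs.txt")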
Percentage <- 0.1                     # keep roughly 0.1% of the lines of each file
Dir    <- './Data'
OutDir <- './Data/OutData'
dir.create(file.path(OutDir), showWarnings = FALSE)
for (cur in dir(Dir, pattern = "\\.txt$"))
{
  curOutFile <- sub("\\.txt$", "sample.txt", cur)
  # Chunk size: large enough to yield ~10 sampled lines per chunk on average
  lines_to_read <- ceiling(10 / (Percentage / 100))
  fileConn  <- file(file.path(Dir, cur), "rb")
  num_lines <- R.utils::countLines(fileConn)
  close(fileConn)
  # Bound the chunk size between 1% and 10% of the file
  maxLinesToRead <- ceiling(num_lines / 10)
  minLinesToRead <- ceiling(num_lines / 100)
  if (lines_to_read > maxLinesToRead) {
    lines_to_read <- maxLinesToRead
  } else if (lines_to_read < minLinesToRead) {
    lines_to_read <- minLinesToRead
  }
  sample_line_idx <- numeric()
  file_subset     <- character()
  h_conn <- file(file.path(Dir, cur), "r", blocking = FALSE)
  lines_read <- 0
  repeat
  {
    cur_chunk <- readLines(h_conn, lines_to_read, skipNul = TRUE)
    if (length(cur_chunk) == 0) {
      break
    }
    # Keep each line with probability Percentage/100; use the actual chunk
    # length so the final, shorter chunk is sampled correctly
    cur_sample_line_idx <- which(rbinom(length(cur_chunk), 1, Percentage / 100) == 1)
    file_subset     <- append(file_subset, cur_chunk[cur_sample_line_idx])
    sample_line_idx <- append(sample_line_idx, cur_sample_line_idx + lines_read)
    lines_read      <- lines_read + length(cur_chunk)
  }
  close(h_conn)
  # Write the sampled lines out
  h_conn <- file(file.path(OutDir, curOutFile), "w")
  write(file_subset, file = h_conn)
  close(h_conn)
}
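Flipping a biased coin for each line (rbinom) yields an approximately Percentage/100-sized random sample in a single streaming pass, so none of the large source files ever has to be held in memory at once.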
Now, let's load the sampled files into a corpus, clean them, and save the corpus data.
corpusFile <- file.path(OutDir, "Corpus.Rdata")
corpusList <- list()
# NOTE: removeNonASCII, customRemovePunctuation and convertToLowerCase are
# called below but never shown in the report; these are assumed definitions
# built on tm's content_transformer.
removeNonASCII          <- content_transformer(function(x) iconv(x, "UTF-8", "ASCII", sub = ""))
customRemovePunctuation <- content_transformer(function(x) gsub("[[:punct:]]+", " ", x))
convertToLowerCase      <- content_transformer(tolower)
LoadCorpus <- function(Path)
{
  fileConn <- file(Path, "r", blocking = FALSE)
  Chunk    <- readLines(fileConn, skipNul = TRUE)
  close(fileConn)
  corp <- Corpus(VectorSource(Chunk))
  corp <- tm_map(corp, removeNonASCII)
  corp <- tm_map(corp, customRemovePunctuation)
  corp <- tm_map(corp, removeNumbers)
  corp <- tm_map(corp, stripWhitespace)
  corp <- tm_map(corp, convertToLowerCase)
  return(corp)
}
for (file in dir(OutDir, pattern = "\\.txt$"))
{
  corpusList[[file]] <- LoadCorpus(file.path(OutDir, file))
}
save(file = corpusFile, corpusList)
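For example, the first cleaned document of the first sample can be inspected with content(corpusList[[1]][[1]]).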
# Combine all three corpora into one
combCorpus <- do.call(c, unname(corpusList))
Top 20 most frequent unigrams:

Top 20 most frequent bigrams:

Top 20 most frequent trigrams:
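The frequency lists above can be produced with a TermDocumentMatrix over the combined corpus and an RWeka n-gram tokenizer. A minimal sketch (the helper name topNgrams is ours; slam is a dependency of tm):

# Count the most frequent n-grams in the combined corpus
topNgrams <- function(corp, n, top = 20)
{
  tokenizer <- function(x) RWeka::NGramTokenizer(x, RWeka::Weka_control(min = n, max = n))
  tdm   <- TermDocumentMatrix(corp, control = list(tokenize = tokenizer))
  freqs <- sort(slam::row_sums(tdm), decreasing = TRUE)
  head(freqs, top)
}

topNgrams(combCorpus, 1)   # unigrams
topNgrams(combCorpus, 2)   # bigrams
topNgrams(combCorpus, 3)   # trigrams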
In the next report we will build the predictive model, based on Markov chains and transition matrices, using the corpus data sampled from the .txt files as shown in this report.
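As a preview, the markovchain package loaded above can fit a first-order chain directly from a word sequence, estimating P(next word | current word) from the observed word-to-word transitions. A minimal sketch on toy data (the toy sentence is ours):

library(markovchain)

# Fit a first-order chain: states are words, transition probabilities are
# estimated from the observed transitions in the sequence
words <- c("i", "love", "new", "york", "i", "love", "pizza")
fit   <- markovchainFit(data = words)

# The transition matrix gives P(next word | current word)
fit$estimate@transitionMatrix

# Most likely continuations of "love"
conditionalDistribution(fit$estimate, "love")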