The purpose of this document is to present an initial exploration of the data and to outline the plan for the prediction app and its algorithm.
The data consists of files named LOCALE.blogs.txt, LOCALE.news.txt and LOCALE.twitter.txt, where LOCALE is each of the four locales en_US, de_DE, ru_RU and fi_FI. The data comes from a corpus called HC Corpora (https://web-beta.archive.org/web/20161014134025/http://www.corpora.heliohost.org:80/index.html).
See the readme file at https://web-beta.archive.org/web/20160930083655/http://www.corpora.heliohost.org/aboutcorpus.html for details on the corpora available.
DATA_PATH = './'
SK.download <- function(){
  localPath <- paste(DATA_PATH, "data.zip", sep = "/")
  # Download and extract the archive only if it has not been fetched already
  if(!file.exists(localPath)){
    sourceUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
    download.file(sourceUrl, localPath, method = "curl")
    unzip(localPath, exdir = DATA_PATH)
  }
}
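Assuming the helper above has been sourced, a single call fetches and extracts the archive into the working directory (it is skipped if data.zip is already present):
SK.download()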
downloadedFilePath = paste(DATA_PATH, "final", sep = "/")
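As a quick sanity check (a sketch using only base R, assuming the archive has been extracted into the final folder), the locale files and their sizes can be listed:
# List the extracted text files and their sizes in MB
dataFiles <- list.files(downloadedFilePath, recursive = TRUE, full.names = TRUE)
data.frame(file = basename(dataFiles),
           sizeMB = round(file.info(dataFiles)$size / 1024^2, 1))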
Only the en_US locale and the blogs data are used in this document, to keep processing time down.
blogsTxtPath <- paste(downloadedFilePath, "en_US", "en_US.blogs.txt", sep = "/")
blogs <- readLines(blogsTxtPath, 10000)
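A rough summary of the 10,000-line sample (a sketch in base R; word counts are approximated by splitting on whitespace):
# Approximate size of the sample: lines, characters and whitespace-separated words
wordCounts <- sapply(strsplit(blogs, "\\s+"), length)
c(lines = length(blogs),
  chars = sum(nchar(blogs)),
  words = sum(wordCounts))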
The tm package is used to clean the text: strip extra whitespace, remove numbers, remove punctuation, and convert everything to lowercase.
library(tm)
blogs.corpus <- VCorpus(VectorSource(blogs))
blogs.corpus <- tm_map(blogs.corpus, stripWhitespace)
blogs.corpus <- tm_map(blogs.corpus, removeNumbers)
blogs.corpus <- tm_map(blogs.corpus, removePunctuation)
blogs.corpus <- tm_map(blogs.corpus, content_transformer(tolower))
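To verify the cleaning steps, the first document can be compared before and after (a minimal check using the tm accessor for document content):
# Compare a raw line with its cleaned counterpart
blogs[1]
as.character(blogs.corpus[[1]])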
Define three tokenizers: unigrams (single words), bigrams (two-word sequences) and trigrams (three-word sequences).
library(RWeka)
uniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
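A quick check of the tokenizers on a made-up sentence (an illustrative example only):
# The bigram tokenizer returns overlapping two-word terms
biGramTokenizer("this is a short example sentence")
# returns "this is" "is a" "a short" "short example" "example sentence"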
uniWord <- TermDocumentMatrix(blogs.corpus, control = list(tokenize = uniGramTokenizer))
biWords <- TermDocumentMatrix(blogs.corpus, control = list(tokenize = biGramTokenizer))
triWords <- TermDocumentMatrix(blogs.corpus, control = list(tokenize = triGramTokenizer))
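The resulting term-document matrices are large and sparse; their dimensions (terms x documents) can be checked before computing frequencies:
# Number of distinct n-grams and documents in each matrix
dim(uniWord)
dim(biWords)
dim(triWords)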
Calculate the frequency of the 30 most common terms (among terms that appear at least 50 times).
calcFrequency <- function(gramTDM){
  # Keep only terms that occur at least 50 times and sum their counts across documents
  freqTerms <- findFreqTerms(gramTDM, lowfreq = 50)
  freq <- rowSums(as.matrix(gramTDM[freqTerms,]))
  freq <- data.frame(word = names(freq), freq = freq)
  # Return the 30 most frequent terms
  freq[order(-freq$freq), ][1:30, ]
}
unigram <- calcFrequency(uniWord)
barplot(unigram$freq, names.arg=unigram$word)
unigram[1:10, ]
## word freq
## the the 20282
## and and 11912
## that that 5040
## for for 3845
## with with 3188
## you you 3164
## was was 3073
## this this 2845
## have have 2343
## but but 2317
bigram <- calcFrequency(biWords)
barplot(bigram$freq, names.arg=bigram$word)
bigram[1:10, ]
## word freq
## of the of the 2047
## in the in the 1674
## to the to the 982
## on the on the 795
## to be to be 759
## and the and the 659
## for the for the 631
## and i and i 560
## i was i was 541
## it was it was 524
trigram <- calcFrequency(triWords)
barplot(trigram$freq, names.arg=trigram$word)
trigram[1:10, ]
## word freq
## one of the one of the 169
## a lot of a lot of 150
## to be a to be a 78
## as well as as well as 70
## some of the some of the 70
## out of the out of the 67
## the end of the end of 66
## i want to i want to 63
## a couple of a couple of 62
## part of the part of the 62