This milestone report explores the data and lays the groundwork for a prediction algorithm that predicts the next word.
# Directory that holds the downloaded data set.
DATA_PATH <- "./datas"
# Download the SwiftKey archive and extract it into DATA_PATH.
#
# Takes no arguments and is called for its side effects (creates directories,
# downloads a file, extracts it). Does nothing when DATA_PATH/final already
# exists as a directory. Stops with an error if the download reports failure.
#
# The archive is stored under its own file name rather than as
# DATA_PATH/final: the rest of this script reads from DATA_PATH/final as a
# directory, so a regular file saved at that path would block extraction.
SK.download <- function(){
  sourceUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  extractedPath <- paste(DATA_PATH, "final", sep = "/")
  zipPath <- paste(DATA_PATH, basename(sourceUrl), sep = "/")

  # Already extracted: nothing to do.
  if(dir.exists(extractedPath)){
    return(invisible(NULL))
  }

  if(!dir.exists(DATA_PATH)){
    dir.create(DATA_PATH, recursive = TRUE)
  }

  # A regular file named "final" is presumably the archive saved by an earlier
  # run that used that name; move it aside so the directory can be created.
  if(file.exists(extractedPath) && !file.exists(zipPath)){
    file.rename(extractedPath, zipPath)
  }

  if(!file.exists(zipPath)){
    status <- download.file(sourceUrl, zipPath, method="curl")
    if(status != 0){
      # Do not leave a partial archive behind that a later run would try to unzip.
      unlink(zipPath)
      stop("Download failed with status ", status, ": ", sourceUrl, call. = FALSE)
    }
  }

  unzip(zipPath, exdir = DATA_PATH)
}
# Locate the English blogs file inside the extracted data directory and read
# its first 10000 lines.
downloadedFilePath <- file.path(DATA_PATH, "final")
blogsTxtPath <- file.path(downloadedFilePath, "en_US.blogs.txt")
blogs <- readLines(con = blogsTxtPath, n = 10000)
Use the tm package to clean the text: strip extra whitespace, remove numbers and punctuation, and convert everything to lowercase.
# Build a corpus from the blog lines and apply the cleaning transformations
# one after another, in the listed order: strip whitespace, remove numbers,
# remove punctuation, convert to lowercase.
blogs.corpus <- Reduce(
  function(corpus, transformation) tm_map(corpus, transformation),
  list(
    stripWhitespace,
    removeNumbers,
    removePunctuation,
    content_transformer(tolower)
  ),
  VCorpus(VectorSource(blogs))
)
Create three tokenizers: one-word (unigram), two-word (bigram), and three-word (trigram).
# Tokenizers that pass a fixed n-gram size (min == max) to NGramTokenizer.

# One-word tokens.
uniGramTokenizer <- function(x) {
  gramSize <- Weka_control(min = 1, max = 1)
  NGramTokenizer(x, gramSize)
}

# Two-word tokens.
biGramTokenizer <- function(x) {
  gramSize <- Weka_control(min = 2, max = 2)
  NGramTokenizer(x, gramSize)
}

# Three-word tokens.
triGramTokenizer <- function(x) {
  gramSize <- Weka_control(min = 3, max = 3)
  NGramTokenizer(x, gramSize)
}
# One term-document matrix per n-gram size, each built from the cleaned corpus
# with the matching tokenizer.
uniWord <- TermDocumentMatrix(
  x = blogs.corpus,
  control = list(tokenize = uniGramTokenizer)
)
biWords <- TermDocumentMatrix(
  x = blogs.corpus,
  control = list(tokenize = biGramTokenizer)
)
triWords <- TermDocumentMatrix(
  x = blogs.corpus,
  control = list(tokenize = triGramTokenizer)
)
Calculate the 30 most frequent terms (among those that occur at least 50 times).
# Return the most frequent terms of a term-document matrix.
#
# Arguments:
#   gramTDM  term-document matrix (terms in rows, documents in columns).
#   lowfreq  minimum total count a term needs to be considered (default 50).
#   top      maximum number of rows to return (default 30).
#
# Returns a data frame with columns `word` and `freq`, sorted by decreasing
# frequency. It has at most `top` rows; when fewer terms reach `lowfreq` it
# simply has fewer rows (no NA padding), and zero rows when none do.
calcFrequency <- function(gramTDM, lowfreq = 50, top = 30){
  freqTerms <- findFreqTerms(gramTDM, lowfreq = lowfreq)
  if(length(freqTerms) == 0){
    return(data.frame(word = character(0), freq = numeric(0)))
  }
  # Only the rows of the frequent terms are converted to a dense matrix.
  freq <- rowSums(as.matrix(gramTDM[freqTerms,]))
  freq <- data.frame(word=names(freq), freq=freq)
  # head() caps the row count without inventing NA rows the way `[1:top, ]` does.
  head(freq[order(-freq$freq), ], top)
}
# Most frequent single words: bar chart, then the first ten rows.
unigram <- calcFrequency(uniWord)
barplot(height = unigram$freq, names.arg = unigram$word)
unigram[seq_len(10), ]
## word freq
## the the 20363
## and and 11918
## that that 5056
## for for 3851
## you you 3198
## with with 3189
## was was 3074
## this this 2857
## have have 2349
## but but 2327
# Most frequent two-word sequences: bar chart, then the first ten rows.
bigram <- calcFrequency(biWords)
barplot(height = bigram$freq, names.arg = bigram$word)
bigram[seq_len(10), ]
## word freq
## of the of the 2050
## in the in the 1677
## to the to the 983
## on the on the 797
## to be to be 761
## and the and the 659
## for the for the 636
## and i and i 561
## i was i was 543
## it was it was 527
# Most frequent three-word sequences: bar chart, then the first ten rows.
trigram <- calcFrequency(triWords)
barplot(height = trigram$freq, names.arg = trigram$word)
trigram[seq_len(10), ]
## word freq
## one of the one of the 170
## a lot of a lot of 152
## to be a to be a 78
## as well as as well as 70
## some of the some of the 70
## out of the out of the 67
## the end of the end of 66
## i want to i want to 63
## a couple of a couple of 62
## it was a it was a 62