Introduction

The purpose of this document is to present an exploratory analysis of the data and to outline plans for the prediction algorithm and app.

The data files are named LOCALE.blogs.txt, LOCALE.news.txt and LOCALE.twitter.txt, where LOCALE is each of the four locales en_US, de_DE, ru_RU and fi_FI. The data comes from a corpus called HC Corpora (https://web-beta.archive.org/web/20161014134025/http://www.corpora.heliohost.org:80/index.html).

See the readme file at https://web-beta.archive.org/web/20160930083655/http://www.corpora.heliohost.org/aboutcorpus.html for details on the corpora available.

Prepare data

  1. Download data file
DATA_PATH <- "./"
downloadedFilePath <- paste(DATA_PATH, "final", sep = "/")

# Download and unzip the SwiftKey data set only if it has not been extracted yet
SK.download <- function(){
    if(!dir.exists(downloadedFilePath)){
        localPath <- paste(DATA_PATH, "data.zip", sep = "/")
        sourceUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
        download.file(sourceUrl, localPath, method = "curl")
        unzip(localPath, exdir = DATA_PATH)
    }
}
SK.download()
  2. Basic data information (summarised in a short sketch after the loading code below)
  3. Load the data from the txt file
blogsTxtPath <- paste(downloadedFilePath, "en_US", "en_US.blogs.txt", sep = "/")
# Read the first 10,000 lines of the blogs file as a sample
blogs <- readLines(blogsTxtPath, 10000)
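
The basic data information step can be summarised with simple counts over the loaded sample. This is a minimal sketch; the stringi package is an assumption used only for word counting and is not part of the original code.

library(stringi)  # assumed helper for counting words; not in the original code
# Line, word and character counts for the 10,000-line blogs sample
data.frame(
    lines = length(blogs),
    words = sum(stri_count_words(blogs)),
    chars = sum(nchar(blogs))
)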

Cleaning and exploring data

The tm package is used to clean the text: strip extra whitespace, remove numbers, remove punctuation, and convert everything to lowercase.

library(tm)

blogs.corpus <- VCorpus(VectorSource(blogs))
blogs.corpus <- tm_map(blogs.corpus, stripWhitespace)
blogs.corpus <- tm_map(blogs.corpus, removeNumbers)
blogs.corpus <- tm_map(blogs.corpus, removePunctuation)
blogs.corpus <- tm_map(blogs.corpus, content_transformer(tolower))
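
As an optional check that is not part of the original code, the first document of the cleaned corpus can be printed; it should now be lowercase with no numbers or punctuation.

# Inspect the first cleaned document
writeLines(content(blogs.corpus[[1]]))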

Tokenize

Three tokenizers are built: unigram (one word), bigram (two words) and trigram (three words).

library(RWeka)

uniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

uniWord <- TermDocumentMatrix(blogs.corpus, control = list(tokenize = uniGramTokenizer))
biWords <- TermDocumentMatrix(blogs.corpus, control = list(tokenize = biGramTokenizer))
triWords <- TermDocumentMatrix(blogs.corpus, control = list(tokenize = triGramTokenizer))
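
As a quick sanity check, not part of the original analysis, the bigram tokenizer can be applied to a made-up sentence, and the dimensions of the term-document matrices (terms by documents) inspected.

# Illustrative check of the bigram tokenizer on a made-up sentence
biGramTokenizer("this is a sample sentence")
# expected output, roughly: "this is" "is a" "a sample" "sample sentence"

# Size of each term-document matrix: distinct n-grams x documents
dim(uniWord); dim(biWords); dim(triWords)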

Frequency of words

Calculate term frequencies and keep the 30 most frequent terms (only terms appearing at least 50 times are considered).

calcFrequency <- function(gramTDM){
  # Keep only terms that occur at least 50 times, then sum their counts across documents
  freqTerms <- findFreqTerms(gramTDM, lowfreq = 50)
  freq <- rowSums(as.matrix(gramTDM[freqTerms,]))
  freq <- data.frame(word=names(freq), freq=freq)
  # Return the 30 most frequent terms
  freq[order(-freq$freq), ][1:30, ]
}

1-Gram

unigram <- calcFrequency(uniWord)
barplot(unigram$freq, names.arg=unigram$word)

unigram[1:10, ]
##      word  freq
## the   the 20282
## and   and 11912
## that that  5040
## for   for  3845
## with with  3188
## you   you  3164
## was   was  3073
## this this  2845
## have have  2343
## but   but  2317

2-Gram

bigram <- calcFrequency(biWords)
barplot(bigram$freq, names.arg=bigram$word)

bigram[1:10, ]
##            word freq
## of the   of the 2047
## in the   in the 1674
## to the   to the  982
## on the   on the  795
## to be     to be  759
## and the and the  659
## for the for the  631
## and i     and i  560
## i was     i was  541
## it was   it was  524

3-Gram

trigram <- calcFrequency(triWords)
barplot(trigram$freq, names.arg=trigram$word)

trigram[1:10, ]
##                    word freq
## one of the   one of the  169
## a lot of       a lot of  150
## to be a         to be a   78
## as well as   as well as   70
## some of the some of the   70
## out of the   out of the   67
## the end of   the end of   66
## i want to     i want to   63
## a couple of a couple of   62
## part of the part of the   62

Plans for prediction algorithm and app