The goal of this project is to demonstrate that I have become comfortable working with the data and that I am on track to create the prediction algorithm, by presenting an exploratory analysis and outlining the goals for the eventual app and algorithm.
The dataset was downloaded from the link below and unzipped manually:
https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
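For reproducibility, the download could also be scripted. A minimal sketch, assuming the files end up in the `data/` directory used below; the `final/en_US` path inside the archive is an assumption about its layout:

```r
# One-time setup: fetch and unpack the corpus if it is not already present.
zip_url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"
if (!file.exists(zip_file)) download.file(zip_url, zip_file, mode = "wb")
if (!dir.exists("data")) {
  unzip(zip_file)  # assumed to extract to final/<locale>/
  dir.create("data")
  file.copy(list.files("final/en_US", full.names = TRUE), "data")
}
```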
library(knitr)    # kable() for formatted tables
library(stringi)  # fast string statistics
library(NLP)      # required by tm
library(tm)       # text-mining framework: corpus handling and cleaning
library(RWeka)    # NGramTokenizer for building n-grams
workingDir <- getwd()
dataDir <- file.path(workingDir, "data/")
resultsDir <- file.path(workingDir, "results")
dir(path = dataDir)
[1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
# Read the three English corpora; skipNul drops embedded NUL characters.
blogs_lines <- readLines(paste0(dataDir, "en_US.blogs.txt"), encoding = "UTF-8", skipNul = TRUE)
news_lines <- readLines(paste0(dataDir, "en_US.news.txt"), encoding = "UTF-8", skipNul = TRUE)
twitter_lines <- readLines(paste0(dataDir, "en_US.twitter.txt"), encoding = "UTF-8", skipNul = TRUE)
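As a quick sanity check on what was just loaded, the on-disk size of each file can be inspected (an illustrative check, not part of the original workflow):

```r
# On-disk size of each corpus file, in megabytes.
round(file.size(list.files(dataDir, full.names = TRUE)) / 1024^2, 1)
```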
# Words-per-line (WPL) summary for each file.
words <- sapply(list(blogs_lines, news_lines, twitter_lines),
                function(x) summary(stri_count_words(x))[c('Min.', 'Mean', 'Max.')])
rownames(words) <- c('WPL_Min', 'WPL_Mean', 'WPL_Max')

# Combine line, character, and word counts with the WPL summary.
abstract <- data.frame(
  FileName = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
  t(rbind(
    sapply(list(blogs_lines, news_lines, twitter_lines), stri_stats_general)[c('Lines', 'Chars'), ],
    Words = sapply(list(blogs_lines, news_lines, twitter_lines), stri_stats_latex)['Words', ],
    words
  ))
)
print(abstract)
| FileName          |   Lines |     Chars |    Words | WPL_Min | WPL_Mean | WPL_Max |
|-------------------|--------:|----------:|---------:|--------:|---------:|--------:|
| en_US.blogs.txt   |  899288 | 206824382 | 37570839 |       0 | 41.75107 |    6726 |
| en_US.news.txt    | 1010242 | 203223154 | 34494539 |       1 | 34.40997 |    1796 |
| en_US.twitter.txt | 2360148 | 162096241 | 30451170 |       1 | 12.75065 |      47 |
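These counts show why modelling on the full corpora would be costly. As a rough, illustrative check (not in the original workflow), the in-memory footprint can be inspected before choosing a sample size:

```r
# Approximate in-memory footprint of each corpus, in megabytes.
sapply(list(blogs = blogs_lines, news = news_lines, twitter = twitter_lines),
       function(x) format(object.size(x), units = "MB"))
```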
set.seed(12345)
# Sample 1% of the lines from each source (without replacement),
# then collapse each sample into a single string.
s_blogs <- blogs_lines[sample(length(blogs_lines), floor(0.01 * length(blogs_lines)))]
s_blogs <- paste(s_blogs, collapse = " ")
s_news <- news_lines[sample(length(news_lines), floor(0.01 * length(news_lines)))]
s_news <- paste(s_news, collapse = " ")
s_twitter <- twitter_lines[sample(length(twitter_lines), floor(0.01 * length(twitter_lines)))]
s_twitter <- paste(s_twitter, collapse = " ")
s_blogs <- iconv(s_blogs, "UTF-8", "ASCII", sub="")
s_news <- iconv(s_news, "UTF-8", "ASCII", sub="")
s_twitter <- iconv(s_twitter, "UTF-8", "ASCII", sub="")
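To make the effect of this conversion concrete, a toy example (illustrative only):

```r
# iconv() with sub = "" silently drops characters that have no ASCII equivalent.
iconv("café naïve", "UTF-8", "ASCII", sub = "")  # returns "caf nave"
```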
s_data <- c(s_blogs, s_news, s_twitter)
s_corpus <- VCorpus(VectorSource(s_data))
s_corpus <- tm_map(s_corpus, content_transformer(tolower))       # lower-case all text
s_corpus <- tm_map(s_corpus, removeNumbers)                      # drop digits
s_corpus <- tm_map(s_corpus, removePunctuation)                  # drop punctuation
s_corpus <- tm_map(s_corpus, removeWords, stopwords("english"))  # drop English stop words
s_corpus <- tm_map(s_corpus, stripWhitespace)                    # collapse repeated spaces
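A quick peek at the cleaned text helps confirm the transformations behaved as intended (illustrative; the exact output depends on the sample):

```r
# First 200 characters of the cleaned blogs document.
substr(as.character(s_corpus[[1]]), 1, 200)
```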
# RWeka tokenizers producing unigrams, bigrams, and trigrams.
uni_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bi_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tri_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
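For example, the bigram tokenizer splits a string into overlapping two-word terms (illustrative):

```r
bi_tokenizer("thanks for the follow")  # "thanks for" "for the" "the follow"
```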
uni_matrix <- TermDocumentMatrix(s_corpus, control = list(tokenize = uni_tokenizer))
bi_matrix <- TermDocumentMatrix(s_corpus, control = list(tokenize = bi_tokenizer))
tri_matrix <- TermDocumentMatrix(s_corpus, control = list(tokenize = tri_tokenizer))
# For each n-gram size: keep terms seen at least 10 times, then sort by frequency.
uni_corpus <- findFreqTerms(uni_matrix, lowfreq = 10)
uni_corpus_freq <- rowSums(as.matrix(uni_matrix[uni_corpus, ]))
uni_corpus_freq <- sort(uni_corpus_freq, decreasing = TRUE)
bi_corpus <- findFreqTerms(bi_matrix, lowfreq = 10)
bi_corpus_freq <- rowSums(as.matrix(bi_matrix[bi_corpus, ]))
bi_corpus_freq <- sort(bi_corpus_freq, decreasing = TRUE)
tri_corpus <- findFreqTerms(tri_matrix, lowfreq = 10)
tri_corpus_freq <- rowSums(as.matrix(tri_matrix[tri_corpus, ]))
tri_corpus_freq <- sort(tri_corpus_freq, decreasing = TRUE)
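The three blocks above repeat the same pattern; a small helper (hypothetical, not used in the analysis above) would express it once:

```r
# Terms occurring at least `lowfreq` times, with total counts, sorted descending.
freq_from_tdm <- function(tdm, lowfreq = 10) {
  terms <- findFreqTerms(tdm, lowfreq = lowfreq)
  sort(rowSums(as.matrix(tdm[terms, ])), decreasing = TRUE)
}
# e.g. uni_corpus_freq <- freq_from_tdm(uni_matrix)
```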
kable(head(uni_corpus_freq, 10))
| Term | Frequency |
|------|-----------|
| will | 3160 |
| just | 3079 |
| said | 3051 |
| one | 2805 |
| like | 2670 |
| can | 2458 |
| get | 2352 |
| time | 2089 |
| new | 1966 |
| dont | 1867 |
barplot(uni_corpus_freq[1:20], col = "deepskyblue", las = 2, cex.names = 0.6, main = "Top 20 Unigrams")
kable(head(bi_corpus_freq, 10))
| Term | Frequency |
|------|-----------|
| right now | 242 |
| cant wait | 215 |
| dont know | 205 |
| last year | 191 |
| new york | 177 |
| last night | 155 |
| im going | 150 |
| feel like | 145 |
| high school | 143 |
| first time | 125 |
barplot(bi_corpus_freq[1:20], col = "deepskyblue", las = 2, cex.names = 0.6, main = "Top 20 Bigrams")
kable(head(tri_corpus_freq, 10))
| Term | Frequency |
|------|-----------|
| cant wait see | 51 |
| happy mothers day | 33 |
| call call call | 23 |
| happy new year | 21 |
| new york city | 21 |
| let us know | 20 |
| italy lakes holidays | 18 |
| little italy boston | 17 |
| magianos little italy | 17 |
| im pretty sure | 16 |
barplot(tri_corpus_freq[1:20], col = "deepskyblue", las = 2, cex.names = 0.6, main = "Top 20 Trigrams")
It would be useful to run several tests with different sample sizes to find a balance between computational cost and the accuracy of the results. It will also be necessary to plan how to build a Shiny web app that incorporates the prediction algorithm.
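As a rough illustration of where the algorithm is heading, a naive backoff lookup over the frequency tables above might look like the sketch below. It assumes the names of `bi_corpus_freq` and `tri_corpus_freq` are space-separated n-grams (as produced by `NGramTokenizer`); `predict_next` is a hypothetical helper, not the final algorithm.

```r
# Predict the next word from the last one or two words typed, backing off
# from trigrams to bigrams to the single most frequent unigram.
predict_next <- function(phrase) {
  tokens <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 2)
  if (length(tokens) == 2) {
    hits <- tri_corpus_freq[startsWith(names(tri_corpus_freq),
                                       paste(c(tokens, ""), collapse = " "))]
    if (length(hits) > 0) return(sub(".* ", "", names(hits)[1]))
  }
  hits <- bi_corpus_freq[startsWith(names(bi_corpus_freq),
                                    paste0(tail(tokens, 1), " "))]
  if (length(hits) > 0) return(sub(".* ", "", names(hits)[1]))
  names(uni_corpus_freq)[1]  # no match: fall back to the top unigram
}

predict_next("right")  # e.g. "now", given the bigram table above
```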