Set Working Directory, Download the Data, and Unzip
We also unzip the files into a specific directory that we will reference later.
setwd("C:/Users/gamartin/R/Coursera Swiftkey Capstone")
download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
"Coursera-SwiftKey.zip")
unzip("Coursera-SwiftKey.zip", exdir = "final")
Load the Libraries
library(stringi)
library(tm)
## Warning: package 'tm' was built under R version 3.3.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.3.2
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
Understand the Scope
blogs <- readLines("final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("final/en_US/en_US.news.txt", encoding =
## "UTF-8", skipNul = TRUE): incomplete final line found on 'final/en_US/
## en_US.news.txt'
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
blogs_dsize <- file.info("final/en_US/en_US.blogs.txt")$size / 1024 / 1024
news_dsize <- file.info("final/en_US/en_US.news.txt")$size / 1024 / 1024
twitter_dsize <- file.info("final/en_US/en_US.twitter.txt")$size / 1024 / 1024
blogs_msize <- object.size(blogs) / 1024 / 1024
news_msize <- object.size(news) / 1024 / 1024
twitter_msize <- object.size(twitter) / 1024 / 1024
blogs_words <- stri_count_words(blogs)
news_words <- stri_count_words(news)
twitter_words <- stri_count_words(twitter)
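stri_count_words() returns one word count per element, so summing or averaging the resulting vectors gives per-source totals and means. For illustration only:
# Illustration: per-element word counts from stringi.
stri_count_words(c("Hello world", "One two three"))  # returns 2 3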
Data Frame
The data frame shows that we are working with a very large number of lines and words, which could make processing slow.
data.frame(source = c("blogs", "news", "twitter"),
files_MB = c(blogs_dsize, news_dsize, twitter_dsize),
in_memory_MB = c(blogs_msize, news_msize, twitter_msize),
lines = c(length(blogs), length(news), length(twitter)),
words_num = c(sum(blogs_words), sum(news_words), sum(twitter_words)),
mean_words_num = c(mean(blogs_words), mean(news_words), mean(twitter_words)))
## source files_MB in_memory_MB lines words_num mean_words_num
## 1 blogs 200.4242 248.49350 899288 37546246 41.75108
## 2 news 196.2775 19.17972 77259 2674536 34.61779
## 3 twitter 159.3641 301.39694 2360148 30093410 12.75065
Limit Data
We are going to limit the data to 5,000 lines for our exploratory analysis and n-gram work.
data <- c(blogs, news, twitter)
data2 <- head(data, 5000)
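Note that head() keeps the first 5,000 lines of the combined vector, which are all blog lines. A random sample (a hypothetical alternative, not what this report uses) would give a subset that represents all three sources:
# Hypothetical alternative: a reproducible random sample across blogs, news, and Twitter.
set.seed(1234)
data2_sample <- sample(data, 5000)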
Create and Clean the Corpus
From this 5,000-line subset we clean the text and create the corpus. Corpora are collections of documents containing (natural-language) text, and they are the structure the tm package uses for analysis.
vector_doc <- VectorSource(data2)
corpus <- VCorpus(vector_doc)
# Re-encode to UTF-8 (replacing invalid bytes), then strip URLs, Twitter handles,
# extra whitespace, punctuation, numbers, and English stop words.
corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, to = 'UTF-8', sub = 'byte')))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
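To verify that the transformations behaved as intended, it can help to compare a raw line with its cleaned counterpart (an optional check, not part of the original pipeline):
# Optional sanity check: raw text vs. the cleaned document content.
data2[1]
as.character(corpus[[1]])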
Tokenizers
We need sets of words for analysis: single words (unigrams), pairs of adjacent words (bigrams), and triples of adjacent words (trigrams). The n-gram tokenizer first breaks the text into words wherever it encounters a delimiter character, then emits n-grams of the specified length built from those words, as illustrated after the function definitions below.
# Tokenizers: paste each pair / triple of consecutive words into a single space-separated string.
BigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
TrigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
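On a toy document the tokenizers emit space-separated word pairs and triples (illustration only; the toy text is hypothetical):
# Illustration: tokenizer output for a small PlainTextDocument.
toy <- PlainTextDocument("the quick brown fox")
BigramTokenizer(toy)   # expect "the quick" "quick brown" "brown fox"
TrigramTokenizer(toy)  # expect "the quick brown" "quick brown fox"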
# Count Words: build a frequency table (term + count) from a term-document matrix.
freq_df <- function(tdm){
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  freq_df <- data.frame(word = names(freq), freq = freq)
  return(freq_df)
}
memory.size(max = TRUE)
## [1] 819.44
memory.limit(size=36000)
## [1] 36000
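(Note: memory.size() and memory.limit() are Windows-only functions; on other platforms they have no effect, and in recent versions of R they have been retired.)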
# Build term-document matrices for 1-, 2-, and 3-grams, dropping extremely sparse terms.
unigram <- removeSparseTerms(TermDocumentMatrix(corpus), 0.9999)
unigram_freq <- freq_df(unigram)
bigram <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer)), 0.9999)
bigram_freq <- freq_df(bigram)
trigram <- removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = TrigramTokenizer)), 0.9999)
trigram_freq <- freq_df(trigram)
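A quick look at the head of each frequency table (actual terms will vary with the sampled lines) shows the structure that the plotting step expects:
# Peek at the most frequent terms in each table; exact values depend on the data subset.
head(unigram_freq, 3)
head(bigram_freq, 3)
head(trigram_freq, 3)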
Plotting Results
# Bar chart of the 25 most frequent terms or phrases in a frequency table.
freq_plot <- function(data, title) {
ggplot(data[1:25,], aes(reorder(word, -freq), freq)) +
labs(x = "Words/Phrases", y = "Frequency") +
ggtitle(title) +
theme(axis.text.x = element_text(angle = 90, size = 12, hjust = 1)) +
geom_bar(stat = "identity")
}
freq_plot(unigram_freq, "Top-25 Unigrams")

freq_plot(bigram_freq, "Top-25 Bigrams")

freq_plot(trigram_freq, "Top-25 Trigrams")
