First, load the necessary packages, and then download the data set.
library(stringi)
library(tm)
library(RWeka)
library(ggplot2)
fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
fileZIP <- "Coursera-SwiftKey.zip"
if (!file.exists(fileZIP)) {
    download.file(fileUrl, destfile = fileZIP)
    unzip(fileZIP)
}
blogs <- readLines("final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
Let’s make a simple summary of our data set.
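The exact summary code is not shown here; a minimal sketch using stringi (the data frame name summary_df and the use of stri_count_words and nchar are assumptions, not the original code) produces counts in this shape:
# sketch: word, line and character counts for each of the three files
files <- list(twitter = twitter, blogs = blogs, news = news)
summary_df <- data.frame(
    File.Name        = names(files),
    Word.counts      = sapply(files, function(x) sum(stri_count_words(x))),
    Line.counts      = sapply(files, function(x) length(x)),
    Num.of.character = sapply(files, function(x) sum(nchar(x))),
    row.names        = NULL
)
summary_df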
##   File.Name Word.counts Line.counts Num.of.character
## 1   twitter    37570839     2360148        206824505
## 2     blogs     2651432      899288         15639408
## 3      news    30451170       77259        162096241
A clean data set is needed for further analysis, so we pre-process the data.
# remove non-ASCII characters, as they cause problems in later steps
blogs <- iconv(blogs, "latin1", "ASCII", sub = "")
news <- iconv(news, "latin1", "ASCII", sub = "")
twitter <- iconv(twitter, "latin1", "ASCII", sub = "")
# sample 1% of each source; set a seed so the sample is reproducible
set.seed(1234)   # arbitrary seed
sample_data <- c(sample(twitter, round(length(twitter) * 0.01)),
                 sample(blogs, round(length(blogs) * 0.01)),
                 sample(news, round(length(news) * 0.01)))
# create corpus and clean the data
corpus <- VCorpus(VectorSource(sample_data))
# helper: replace anything matching a pattern with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")  # remove URLs
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")                      # remove Twitter handles
corpus <- tm_map(corpus, content_transformer(tolower))             # convert to lower case
corpus <- tm_map(corpus, removeWords, stopwords("en"))             # remove English stop words
corpus <- tm_map(corpus, removePunctuation)                        # remove punctuation
corpus <- tm_map(corpus, removeNumbers)                            # remove numbers
corpus <- tm_map(corpus, stripWhitespace)                          # collapse repeated whitespace
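As a quick sanity check (not part of the original report), the first cleaned document can be printed to confirm that the transformations behaved as expected:
# inspect one cleaned document; output will vary with the random sample
writeLines(as.character(corpus[[1]]))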
To find the most frequently used words and phrases, we apply n-gram tokenization, building unigrams, bigrams and trigrams.
# RWeka tokenizers for unigrams, bigrams and trigrams
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram  <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# build term-document matrices for each n-gram size
unigram_tab <- TermDocumentMatrix(corpus, control = list(tokenize = unigram))
bigram_tab  <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
trigram_tab <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
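Note: on some platforms the Java-based RWeka tokenizers are reported to interact badly with tm's parallel processing, leaving the term-document matrices empty. If that happens, a commonly suggested workaround is to force single-core processing and rebuild the matrices:
# workaround (only needed if the matrices above come back empty):
# disable parallel processing so the RWeka tokenizers run on a single core
options(mc.cores = 1)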
At this stage, we prepare the frequency tables for display and plot the most common n-grams in the data set.
# keep unigrams that occur at least 1000 times, then sort by frequency
unigram_corpus <- findFreqTerms(unigram_tab, lowfreq = 1000)
unigram_corpusnum <- rowSums(as.matrix(unigram_tab[unigram_corpus, ]))
unigram_corpustab <- data.frame(Word = names(unigram_corpusnum), frequency = unigram_corpusnum)
unigram_corpussort <- unigram_corpustab[order(-unigram_corpustab$frequency), ]
ggplot(unigram_corpussort, aes(x = reorder(Word, frequency), y = frequency)) +
    geom_bar(stat = "identity") +
    labs(title = "Unigrams", x = "Word", y = "Frequency") +
    coord_flip()
# keep bigrams that occur at least 80 times, then sort by frequency
bigram_corpus <- findFreqTerms(bigram_tab, lowfreq = 80)
bigram_corpusnum <- rowSums(as.matrix(bigram_tab[bigram_corpus, ]))
bigram_corpustab <- data.frame(Word = names(bigram_corpusnum), frequency = bigram_corpusnum)
bigram_corpussort <- bigram_corpustab[order(-bigram_corpustab$frequency), ]
ggplot(bigram_corpussort, aes(x = reorder(Word, frequency), y = frequency)) +
    geom_bar(stat = "identity") +
    labs(title = "Bigrams", x = "Bigram", y = "Frequency") +
    coord_flip()
# keep trigrams that occur at least 10 times, then sort by frequency
trigram_corpus <- findFreqTerms(trigram_tab, lowfreq = 10)
trigram_corpusnum <- rowSums(as.matrix(trigram_tab[trigram_corpus, ]))
trigram_corpustab <- data.frame(Word = names(trigram_corpusnum), frequency = trigram_corpusnum)
trigram_corpussort <- trigram_corpustab[order(-trigram_corpustab$frequency), ]
ggplot(trigram_corpussort, aes(x = reorder(Word, frequency), y = frequency)) +
    geom_bar(stat = "identity") +
    labs(title = "Trigrams", x = "Trigram", y = "Frequency") +
    coord_flip()