Read the data from the files. The news file is read through a binary connection to work around an incomplete read: otherwise only 77,259 of its 1,010,242 lines are returned.
# Required packages
library(stringi)   # stri_count_words()
library(tm)        # VCorpus, tm_map, TermDocumentMatrix, findFreqTerms
library(ggplot2)   # frequency plots
blogs <- readLines("Coursera-SwiftKey/final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
conNews <- file("Coursera-SwiftKey/final/en_US/en_US.news.txt", open="rb")
news <- readLines(conNews, encoding = "UTF-8", skipNul = TRUE)
close(conNews)
twitter <- readLines("Coursera-SwiftKey/final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
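For reference (a quick check, not part of the original code), the full line count can be confirmed once the news file has been read through the binary connection:
# The binary connection returns every line of en_US.news.txt; a plain
# text-mode readLines() call stops early at roughly 77,259 lines.
length(news)  # 1010242, matching the summary table below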
Next we compute descriptive statistics for the three data sources to get a sense of what each dataset contains: the number of lines (nlines), the number of characters (ncharacter), the number of words (nwords), and the minimum, mean and maximum number of words per line (wpl_min, wpl_mean, wpl_max).
DescriptiveAnalysis <- function(data){
  # Perform descriptive analysis of the data
  nlines <- length(data)
  ncharacter <- sum(nchar(data))
  # Count the number of words on each line
  wpl <- stri_count_words(data)
  # Total, minimum, mean and maximum words per line
  nwords <- sum(wpl)
  wpl_min <- min(wpl)
  wpl_mean <- mean(wpl)
  wpl_max <- max(wpl)
  c(nlines, nwords, ncharacter, wpl_min, wpl_mean, wpl_max)
}
x1 <- c('blogs', DescriptiveAnalysis(blogs))
x2 <- c('news', DescriptiveAnalysis(news))
x3 <- c('twitter', DescriptiveAnalysis(twitter))
DescriptiveAnalysisData <- data.frame(matrix(ncol = 7, nrow = 0))
DescriptiveAnalysisData <- rbind(DescriptiveAnalysisData, x1, x2, x3)
colnames(DescriptiveAnalysisData) <- c('Datafile', 'Lines', 'Words', 'Characters', 'wpl_min', 'wpl_mean', 'wpl_max')
DescriptiveAnalysisData
## Datafile Lines Words Characters wpl_min wpl_mean wpl_max
## 1 blogs 899288 37546246 206824505 0 41.7510808550765 6726
## 2 news 1010242 34762395 203223159 1 34.4099681066517 1796
## 3 twitter 2360148 30093410 162096241 1 12.7506452985152 47
Note that the Twitter dataset contains the most lines but has the lowest mean and maximum number of words per line, due to the character limit on tweets. The blog dataset, in contrast, has both the highest mean and the highest maximum number of words per line.
Next we clean the datasets by converting them to ASCII, dropping any non-English (non-ASCII) characters.
blogs <- iconv(blogs, from="latin1", to="ASCII", sub="")
news <- iconv(news, from="latin1", to="ASCII", sub="")
twitter <- iconv(twitter, from="latin1", to="ASCII", sub="")
Next we draw a \(2\%\) sample from each of the original datasets and combine the samples into a single dataset.
data <- c(sample(blogs, length(blogs)*0.02), sample(news, length(news)*0.02), sample(twitter, length(twitter)*0.02))
We use the \(tm\) package to create the corpus and clean it by stripping extra whitespace, converting all text to lower case, and removing English stop words, punctuation and numbers.
corpus <- VCorpus(VectorSource(data))
# Strip multiple whitespaces
corpus <- tm_map(corpus, stripWhitespace)
# Convert all text to lower case
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove stop words from dataset
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
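As a quick sanity check (not part of the original analysis), a few cleaned documents can be inspected to verify the transformations:
# Show the text of the first cleaned document; content() is the standard tm/NLP accessor
content(corpus[[1]])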
UnigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)
BigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
TrigramTokenizer <- function(x)
  unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
TDMUni <- TermDocumentMatrix(corpus, control=list(tokenize=UnigramTokenizer))
TDMBi <- TermDocumentMatrix(corpus, control=list(tokenize=BigramTokenizer))
TDMTri <- TermDocumentMatrix(corpus, control=list(tokenize=TrigramTokenizer))
freqUni <- findFreqTerms(TDMUni, lowfreq = 200)
freqBi <- findFreqTerms(TDMBi, lowfreq = 25)
freqTri <- findFreqTerms(TDMTri, lowfreq = 5)
FreqCorpusUni <- rowSums(as.matrix(TDMUni[freqUni,]))
FreqCorpusBi <- rowSums(as.matrix(TDMBi[freqBi,]))
FreqCorpusTri <- rowSums(as.matrix(TDMTri[freqTri,]))
FreqCorpusUni <- data.frame(word=names(FreqCorpusUni), frequency=FreqCorpusUni)
FreqCorpusBi <- data.frame(word=names(FreqCorpusBi), frequency=FreqCorpusBi)
FreqCorpusTri <- data.frame(word=names(FreqCorpusTri), frequency=FreqCorpusTri)
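Before plotting, the frequency tables can be inspected directly, for example (an illustrative check, not from the report):
# Ten most frequent unigrams in the sampled corpus
head(FreqCorpusUni[order(-FreqCorpusUni$frequency), ], 10)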
plotNGrams <- function(data, title, num) {
  df <- data[order(-data$frequency),][1:num,]
  ggplot(df) +
    geom_bar(aes(x = reorder(word, -frequency), y = frequency),
             stat = "identity", fill = "red", colour = "black", width = 0.80) +
    coord_cartesian(xlim = c(0, num)) +
    labs(title = title) + xlab("Words") + ylab("Frequency") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
}
plotNGrams(FreqCorpusUni, "Most frequent Unigrams", 20)
plotNGrams(FreqCorpusBi, "Most frequent Bigrams", 20)
plotNGrams(FreqCorpusTri, "Most frequent Trigrams", 20)
The next step is to construct a prediction algorithm based on n-gram frequency lookup: given the preceding one or two words, the most frequent matching n-gram from the tables above will be used to predict the next word.
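To make this concrete, the sketch below (an illustration only, not the final implementation) uses the bigram and trigram frequency tables built above; the helper name predictNextWord and the simple trigram-to-bigram backoff are assumptions.
# Sketch of a frequency-lookup predictor (assumed helper, not from the report).
# It looks for the most frequent trigram beginning with the last two words of
# the input and falls back to bigrams beginning with the last word.
predictNextWord <- function(phrase, triFreq = FreqCorpusTri, biFreq = FreqCorpusBi) {
  tokens <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(tokens)
  if (n >= 2) {
    prefix <- paste(tokens[n - 1], tokens[n])
    hits <- triFreq[grepl(paste0("^", prefix, " "), triFreq$word), ]
    if (nrow(hits) > 0) {
      best <- as.character(hits$word[which.max(hits$frequency)])
      return(tail(strsplit(best, " ")[[1]], 1))
    }
  }
  prefix <- tokens[n]
  hits <- biFreq[grepl(paste0("^", prefix, " "), biFreq$word), ]
  if (nrow(hits) > 0) {
    best <- as.character(hits$word[which.max(hits$frequency)])
    return(tail(strsplit(best, " ")[[1]], 1))
  }
  NA_character_  # no matching n-gram in the sampled tables
}
predictNextWord("new york")  # example call; the result depends on the sampled corpus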