Below is a basic exploratory analysis of the training data set that will be used to develop a text prediction algorithm similar to the one used by SwiftKey. Also included are observations from exploring the data and goals for the eventual prediction algorithm and Shiny app.
Dataset https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
setwd("C:/Users/Alex/Desktop/en_US") #store files in working directory
library(tm)
library(stringi)
library(ggplot2)
con <- file("en_US.blogs.txt", "r")
blog <- readLines(con, skipNul = TRUE)
close(con)
con <- file("en_US.news.txt", "r")
news <- readLines(con, skipNul = TRUE)
close(con)
con <- file("en_US.twitter.txt", "r")
twit <- readLines(con, skipNul = TRUE)
close(con)
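One thing worth checking: readLines() on a text-mode connection can stop early if a file contains an embedded control character, which would undercount the news file. If that appears to be the case, one possible workaround (not applied above) is to open the connection in binary mode, along these lines:
con <- file("en_US.news.txt", "rb")   # binary mode avoids stopping at an embedded control character
news <- readLines(con, skipNul = TRUE)
close(con)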
The number of words in each of the files:
sum(stri_count_words(blog))
## [1] 38154238
sum(stri_count_words(news))
## [1] 2693898
sum(stri_count_words(twit))
## [1] 30218166
The number of lines in each of the files:
length(blog)
## [1] 899288
length(news)
## [1] 77259
length(twit)
## [1] 2360148
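For reference, these counts can be pulled together with the on-disk file sizes into one summary table. This is just a convenience sketch; it assumes the three files are still in the working directory set above:
summary_df <- data.frame(
    file    = c("blogs", "news", "twitter"),
    size_MB = round(file.size(c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")) / 1024^2, 1),
    lines   = c(length(blog), length(news), length(twit)),
    words   = c(sum(stri_count_words(blog)), sum(stri_count_words(news)), sum(stri_count_words(twit)))
)
summary_df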
Here is the process for finding the most frequent groups of 2 and 3 consecutive words (bigrams and trigrams) across all three files combined. The charts below show the top 25 most frequent bigrams and trigrams in the sampled dataset.
The first step is to take a sample of each of the blog, news and twitter text files and then combine them into one corpus.
setwd("C:/Users/Alex/Desktop/en_US")
dir.create("./Sample")
con <- file("en_US.blogs.txt", "r")
blog <- readLines(con, n = round(length(blog) * 0.01))   # keep the first 1% of lines
write(blog, file = "./Sample/blog.txt")
close(con)
con <- file("en_US.news.txt", "r")
news <- readLines(con, n = round(length(news) * 0.01))
write(news, file = "./Sample/news.txt")
close(con)
con <- file("en_US.twitter.txt", "r")
twit <- readLines(con, n = round(length(twit) * 0.01))
write(twit, file = "./Sample/twit.txt")
close(con)
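Note that readLines(con, n) as used above keeps the first 1% of lines of each file rather than a random 1%. For a more representative sample, one alternative (a sketch of the idea, not what was run here) is to draw a random subset of the full line vectors read in the first section, before they are overwritten by the readLines calls above, for example:
set.seed(1234)                                            # reproducible sample
blog_sample <- sample(blog, round(length(blog) * 0.01))   # random 1% of blog lines
write(blog_sample, file = "./Sample/blog.txt")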
docs <- VCorpus(DirSource("./Sample"))
The next step is to clean the data so that the word groupings are as accurate as possible. This means converting all text to lower case and removing special symbols, punctuation, numbers and extra whitespace. The code below also removes English stop words such as in, an and the; the purpose at this stage is simply to explore the more distinctive word groupings.
docs <- tm_map(docs, content_transformer(function(x) iconv(enc2utf8(x), sub = "bytes")))  # re-encode to UTF-8, substituting any bytes that cannot be converted
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
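To check that the transformations behaved as expected, it can help to glance at a few lines of one of the cleaned documents, for example:
writeLines(head(as.character(docs[[1]]), 3))   # first few cleaned lines of the first document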
# Tokenizer that splits each document into overlapping two-word sequences
# (ngrams() and words() come from the NLP package, loaded with tm)
BigramTokenizer <- function(x) {
    unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
}
dtm <- DocumentTermMatrix(docs, control=list(tokenize = BigramTokenizer))
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
df <- data.frame(word=names(freq), freq=freq)
ggplot(head(df, 25), aes(reorder(word, -freq), freq)) +
    geom_col(fill = "steelblue4") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(x = "Bigrams", y = "Frequency")
# Same tokenizer pattern for three-word sequences
TrigramTokenizer <- function(x) {
    unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
}
dtm <- DocumentTermMatrix(docs, control=list(tokenize = TrigramTokenizer))
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
df <- data.frame(word=names(freq), freq=freq)
ggplot(head(df, 25), aes(reorder(word, -freq), freq)) +
    geom_col(fill = "firebrick4") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(x = "Trigrams", y = "Frequency")