Load SwiftKey Project Data

The three text documents are read in and converted to Latin 1 text. This removes a large number of incompatible text characters, and reduces the complexity of the Document Matrix. The three documents are then imported into a virtual corpus container. Some metadata labels are created to describe the documents.

library(tm)

# Read in one blogs text file directly
blogsTxt <- readLines("../final/en_US/en_US.blogs.txt")
blogsTxt <- iconv(blogsTxt, to = "latin1")
blogsTxt <- (blogsTxt[!is.na(blogsTxt)])
blogsTxt <-  paste(blogsTxt, collapse = "\n")

# Read in one news text file directly
newsTxt <- readLines("../final/en_US/en_US.news.txt")
newsTxt <- iconv(newsTxt, to = "latin1")
newsTxt <- (newsTxt[!is.na(newsTxt)])
newsTxt <- paste(newsTxt, collapse = "\n")

# Read in one twitter text file directly
tweetsTxt <- readLines("../final/en_US/en_US.twitter.txt")
tweetsTxt <- iconv(tweetsTxt, to = "latin1")
tweetsTxt <- (tweetsTxt[!is.na(tweetsTxt)])
tweetsTxt <- paste(tweetsTxt, collapse = "\n")

# Add text documents to corpus
docs <- VCorpus(VectorSource(c(blogsTxt, newsTxt, tweetsTxt)))

# Add metadata labels to the documents
meta(docs[[1]])$author <- "SwiftKey HC Corpora"
meta(docs[[2]])$author <- "SwiftKey HC Corpora"
meta(docs[[3]])$author <- "SwiftKey HC Corpora"

meta(docs[[1]])$description <- "Blog text"
meta(docs[[2]])$description <- "News text"
meta(docs[[3]])$description <- "Twitter text"

meta(docs[[1]])$language <- "English Latin 1"
meta(docs[[2]])$language <- "English Latin 1"
meta(docs[[3]])$language <- "English Latin 1"

# Summarize corpus of text documents
summary(docs) 
##   Length Class             Mode
## 1 2      PlainTextDocument list
## 2 2      PlainTextDocument list
## 3 2      PlainTextDocument list
# List # of characters per document
inspect(docs[1:3])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 119990037
## 
## [[2]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 13590504
## 
## [[3]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 160272706
# View metadata of document #1
meta(docs[[1]])
##   author       : SwiftKey HC Corpora
##   datetimestamp: 2017-08-02 02:41:37
##   description  : Blog text
##   heading      : character(0)
##   id           : 1
##   language     : English Latin 1
##   origin       : character(0)

Compare Corpus Document to Text

A comparison is made between the Blogs document contained in the corpus with the same text document read directly from the text file. The only difference is that the “tm” package adds row names to each character vector.

library(tm)

# Read in one text file directly
blogsTxt <- readLines("../final/en_US/en_US.blogs.txt")

# Extract character vector of same file
# from the corpus
blogs <- lapply(docs[1],as.character)
blogs <- unlist(blogs)

# Compare the character vectors; they  
# only differ by rownames
identical(blogsTxt, blogs)
## [1] FALSE
all.equal(blogsTxt, blogs)
## [1] "names for current but not for target"
# The tm package assigns row names
# to the character vectors 
blogsTxt[2]
## [1] "We love you Mr. Brown."
blogs[2]
##         en_US.blogs.txt2 
## "We love you Mr. Brown."
# Ok, comparison done.  We'll work with
# the documents pulled from the corpus
rm(blogsTxt)

Compile Stat Table

A table of document statistics is prepared to assess the volume of information contained in the three documents.

# Load libraries

library(tm)
library(knitr)
library(stringr)

# Get the other two documents:
## The unedited news 
news <- lapply(docs[2],as.character)
news <- unlist(news)

## The unedited tweets
tweets <- lapply(docs[3],as.character)
tweets <- unlist(tweets)

# Create lists of stats:  "Words", 
# "AlphaChars", "AllChars", "Lines"
BlogStat <- list()
NewsStat <- list()
TweetStat <- list()

# Count words in blogs, news, and tweets 
# blogs
cnt <- lapply(blogs, function(x) str_count(x,"\\W+"))
BlogStat$Words <- do.call(sum, cnt)
# news
cnt <- lapply(news, function(x) str_count(x,"\\W+"))
NewsStat$Words <- do.call(sum, cnt)
# tweets
cnt <- lapply(tweets, function(x) str_count(x,"\\W+"))
TweetStat$Words <- do.call(sum, cnt)

# Count alphabetic characters in blogs, news, and tweets 
## blogs
cnt <- lapply(blogs, function(x) str_count(x,"[a-zA-Z]"))
BlogStat$AlphaChars <-  do.call(sum, cnt)
## news
cnt <- lapply(news, function(x) str_count(x,"[a-zA-Z]"))
NewsStat$AlphaChars <-  do.call(sum, cnt)
## tweets
cnt <- lapply(tweets, function(x) str_count(x,"[a-zA-Z]"))
TweetStat$AlphaChars <-  do.call(sum, cnt)

# Count total characters in blogs, news, and tweets 
## blogs
cnt <- lapply(blogs,nchar)
BlogStat$AllChars <- do.call(sum, cnt)
## news
cnt <- lapply(news,nchar)
NewsStat$AllChars <- do.call(sum, cnt)
## tweets
cnt <- lapply(tweets,nchar)
TweetStat$AllChars <- do.call(sum, cnt)

# Count number of lines in each document
BlogStat$Lines <- length(blogs)
NewsStat$Lines <- length(news)
TweetStat$Lines <- length(tweets)

# Combine stat vectors
stat <- rbind(BlogStat, NewsStat, TweetStat)

# Produce table of document statistics (unprocessed)
kable(stat, caption="Unprocessed Document Statistic")
Unprocessed Document Statistic
Words AlphaChars AllChars Lines
BlogStat 38481492 162800902 208361438 899288
NewsStat 2760117 12349506 15683765 77259
TweetStat 30534962 124977127 162384825 2360148

Process documents

Punctuation, white space, numbers, and stop words are removed from the documents. All the document words are converted to lowercase and stemmed in order to normalize the terms.

library(tm)

# Remove punctuation
docs <- tm_map(docs,removePunctuation)

# Remove numbers
docs <- tm_map(docs, removeNumbers)  

# Convert characters to lower case
docs <- tm_map(docs, content_transformer(tolower))

# Remove stop words
docs <- tm_map(docs, removeWords, stopwords("english"))

# Strip white space 
docs <- tm_map(docs, stripWhitespace)

# Stem the document
docs <- tm_map(docs, stemDocument)

# Treat as plain text document
docs <- tm_map(docs, PlainTextDocument)

Create and Examine the Term-Document Matrix

In order to limit the number of frequently occurring terms, the sparse Document Text Matrix was limited to words occurring more than 75,000 times. The most frequently occurring term, “just,” appeared in the corpus over 250,000 times. A histogram of word frequencies was prepared on the 47 most frequently used terms. The correlation limits were set very high to limit closely associated terms to fewer than 50 words.

library(tm)
library(ggplot2)
library(SnowballC)

# Create document term matrix
dtm <- DocumentTermMatrix(docs)

# Remove sparse terms
dtms <- removeSparseTerms(dtm, 0.4)

# Find some frequently occurring terms
findFreqTerms(dtms, 75000)
##  [1] "also"   "back"   "can"    "come"   "day"    "dont"   "even"  
##  [8] "feel"   "first"  "follow" "get"    "good"   "got"    "great" 
## [15] "just"   "know"   "last"   "like"   "look"   "love"   "make"  
## [22] "much"   "need"   "new"    "now"    "one"    "peopl"  "realli"
## [29] "right"  "say"    "see"    "start"  "still"  "take"   "thank" 
## [36] "thing"  "think"  "time"   "today"  "use"    "want"   "way"   
## [43] "week"   "well"   "will"   "work"   "year"
# Sort matrix in descending order 
wordFreq <- sort(colSums(as.matrix(dtms)), decreasing=TRUE)   
head(wordFreq, 10) 
##   just    get   like    one   will   time    can   love    day   make 
## 253300 244829 243628 224651 219982 195027 192664 188296 181732 157591
# Create dataframe for ggplot
wordFreqDF <- data.frame(Word=names(wordFreq), Freq=wordFreq)   
head(wordFreqDF)
##      Word   Freq
## just just 253300
## get   get 244829
## like like 243628
## one   one 224651
## will will 219982
## time time 195027
# Subset to the most frequent words 
wordFreqDF75k <- subset(wordFreqDF, Freq>75000)

# Create word plot
wordPlot <- ggplot(wordFreqDF75k, aes(x = reorder(Word, -Freq), y = Freq)) +
  geom_bar(stat = "identity") + 
  labs(x = "Word", y = "Frequency") +
  theme(axis.text.x=element_text(angle=45, hjust=1))

# Display plot
wordPlot  

# Create an even sparser matrix
dtmss <- removeSparseTerms(dtm, 0.01)

# Find close associates to the 3 most common terms
findAssocs(dtmss, "just", corlimit = 0.999999)
## $just
## bulldog   burpe    cain    cobb    code     dds   donna  impost    moos 
##       1       1       1       1       1       1       1       1       1 
##    toot    yoyo 
##       1       1
findAssocs(dtmss, "get", corlimit = 0.999999)
## $get
##       anatomi        bethel         biter           cds        cicero 
##             1             1             1             1             1 
##       firefli    girlfriend          gump          join           los 
##             1             1             1             1             1 
##      mccarthi         monet participatori         pilat         sinus 
##             1             1             1             1             1 
##       smoothi   thunderbolt    wanderlust          zeta 
##             1             1             1             1
findAssocs(dtmss, "like", corlimit = 0.999999)
## $like
##    alba     boy  brandi charliz hampton     leo plagiar  pollen   senor 
##       1       1       1       1       1       1       1       1       1 
##    wink 
##       1

Goal and Plan of Project

The goal of this project is to create a prediction algorithm and R Shiny application that can predict the words a user will type, and suggest possible word selections using n-grams and close word associations. The size of the corpus may have to be limited even further to facilitate the clustering of words and near term associates.

Although the News document used in the corpus is the smallest text document, it contains less informal language and fewer incorrectly spelled words. This type of source document may be preferrable to the less formal language used in the Blog and Twitter posts.