The three text documents are read in and converted to Latin 1 text. This removes a large number of incompatible text characters, and reduces the complexity of the Document Matrix. The three documents are then imported into a virtual corpus container. Some metadata labels are created to describe the documents.
library(tm)
# Read in one blogs text file directly
blogsTxt <- readLines("../final/en_US/en_US.blogs.txt")
blogsTxt <- iconv(blogsTxt, to = "latin1")
blogsTxt <- (blogsTxt[!is.na(blogsTxt)])
blogsTxt <- paste(blogsTxt, collapse = "\n")
# Read in one news text file directly
newsTxt <- readLines("../final/en_US/en_US.news.txt")
newsTxt <- iconv(newsTxt, to = "latin1")
newsTxt <- (newsTxt[!is.na(newsTxt)])
newsTxt <- paste(newsTxt, collapse = "\n")
# Read in one twitter text file directly
tweetsTxt <- readLines("../final/en_US/en_US.twitter.txt")
tweetsTxt <- iconv(tweetsTxt, to = "latin1")
tweetsTxt <- (tweetsTxt[!is.na(tweetsTxt)])
tweetsTxt <- paste(tweetsTxt, collapse = "\n")
# Add text documents to corpus
docs <- VCorpus(VectorSource(c(blogsTxt, newsTxt, tweetsTxt)))
# Add metadata labels to the documents
meta(docs[[1]])$author <- "SwiftKey HC Corpora"
meta(docs[[2]])$author <- "SwiftKey HC Corpora"
meta(docs[[3]])$author <- "SwiftKey HC Corpora"
meta(docs[[1]])$description <- "Blog text"
meta(docs[[2]])$description <- "News text"
meta(docs[[3]])$description <- "Twitter text"
meta(docs[[1]])$language <- "English Latin 1"
meta(docs[[2]])$language <- "English Latin 1"
meta(docs[[3]])$language <- "English Latin 1"
# Summarize corpus of text documents
summary(docs)
## Length Class Mode
## 1 2 PlainTextDocument list
## 2 2 PlainTextDocument list
## 3 2 PlainTextDocument list
# List # of characters per document
inspect(docs[1:3])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 119990037
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 13590504
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 160272706
# View metadata of document #1
meta(docs[[1]])
## author : SwiftKey HC Corpora
## datetimestamp: 2017-08-02 02:41:37
## description : Blog text
## heading : character(0)
## id : 1
## language : English Latin 1
## origin : character(0)
A comparison is made between the Blogs document contained in the corpus with the same text document read directly from the text file. The only difference is that the “tm” package adds row names to each character vector.
library(tm)
# Read in one text file directly
blogsTxt <- readLines("../final/en_US/en_US.blogs.txt")
# Extract character vector of same file
# from the corpus
blogs <- lapply(docs[1],as.character)
blogs <- unlist(blogs)
# Compare the character vectors; they
# only differ by rownames
identical(blogsTxt, blogs)
## [1] FALSE
all.equal(blogsTxt, blogs)
## [1] "names for current but not for target"
# The tm package assigns row names
# to the character vectors
blogsTxt[2]
## [1] "We love you Mr. Brown."
blogs[2]
## en_US.blogs.txt2
## "We love you Mr. Brown."
# Ok, comparison done. We'll work with
# the documents pulled from the corpus
rm(blogsTxt)
A table of document statistics is prepared to assess the volume of information contained in the three documents.
# Load libraries
library(tm)
library(knitr)
library(stringr)
# Get the other two documents:
## The unedited news
news <- lapply(docs[2],as.character)
news <- unlist(news)
## The unedited tweets
tweets <- lapply(docs[3],as.character)
tweets <- unlist(tweets)
# Create lists of stats: "Words",
# "AlphaChars", "AllChars", "Lines"
BlogStat <- list()
NewsStat <- list()
TweetStat <- list()
# Count words in blogs, news, and tweets
# blogs
cnt <- lapply(blogs, function(x) str_count(x,"\\W+"))
BlogStat$Words <- do.call(sum, cnt)
# news
cnt <- lapply(news, function(x) str_count(x,"\\W+"))
NewsStat$Words <- do.call(sum, cnt)
# tweets
cnt <- lapply(tweets, function(x) str_count(x,"\\W+"))
TweetStat$Words <- do.call(sum, cnt)
# Count alphabetic characters in blogs, news, and tweets
## blogs
cnt <- lapply(blogs, function(x) str_count(x,"[a-zA-Z]"))
BlogStat$AlphaChars <- do.call(sum, cnt)
## news
cnt <- lapply(news, function(x) str_count(x,"[a-zA-Z]"))
NewsStat$AlphaChars <- do.call(sum, cnt)
## tweets
cnt <- lapply(tweets, function(x) str_count(x,"[a-zA-Z]"))
TweetStat$AlphaChars <- do.call(sum, cnt)
# Count total characters in blogs, news, and tweets
## blogs
cnt <- lapply(blogs,nchar)
BlogStat$AllChars <- do.call(sum, cnt)
## news
cnt <- lapply(news,nchar)
NewsStat$AllChars <- do.call(sum, cnt)
## tweets
cnt <- lapply(tweets,nchar)
TweetStat$AllChars <- do.call(sum, cnt)
# Count number of lines in each document
BlogStat$Lines <- length(blogs)
NewsStat$Lines <- length(news)
TweetStat$Lines <- length(tweets)
# Combine stat vectors
stat <- rbind(BlogStat, NewsStat, TweetStat)
# Produce table of document statistics (unprocessed)
kable(stat, caption="Unprocessed Document Statistic")
| Words | AlphaChars | AllChars | Lines | |
|---|---|---|---|---|
| BlogStat | 38481492 | 162800902 | 208361438 | 899288 |
| NewsStat | 2760117 | 12349506 | 15683765 | 77259 |
| TweetStat | 30534962 | 124977127 | 162384825 | 2360148 |
Punctuation, white space, numbers, and stop words are removed from the documents. All the document words are converted to lowercase and stemmed in order to normalize the terms.
library(tm)
# Remove punctuation
docs <- tm_map(docs,removePunctuation)
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Convert characters to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove stop words
docs <- tm_map(docs, removeWords, stopwords("english"))
# Strip white space
docs <- tm_map(docs, stripWhitespace)
# Stem the document
docs <- tm_map(docs, stemDocument)
# Treat as plain text document
docs <- tm_map(docs, PlainTextDocument)
In order to limit the number of frequently occurring terms, the sparse Document Text Matrix was limited to words occurring more than 75,000 times. The most frequently occurring term, “just,” appeared in the corpus over 250,000 times. A histogram of word frequencies was prepared on the 47 most frequently used terms. The correlation limits were set very high to limit closely associated terms to fewer than 50 words.
library(tm)
library(ggplot2)
library(SnowballC)
# Create document term matrix
dtm <- DocumentTermMatrix(docs)
# Remove sparse terms
dtms <- removeSparseTerms(dtm, 0.4)
# Find some frequently occurring terms
findFreqTerms(dtms, 75000)
## [1] "also" "back" "can" "come" "day" "dont" "even"
## [8] "feel" "first" "follow" "get" "good" "got" "great"
## [15] "just" "know" "last" "like" "look" "love" "make"
## [22] "much" "need" "new" "now" "one" "peopl" "realli"
## [29] "right" "say" "see" "start" "still" "take" "thank"
## [36] "thing" "think" "time" "today" "use" "want" "way"
## [43] "week" "well" "will" "work" "year"
# Sort matrix in descending order
wordFreq <- sort(colSums(as.matrix(dtms)), decreasing=TRUE)
head(wordFreq, 10)
## just get like one will time can love day make
## 253300 244829 243628 224651 219982 195027 192664 188296 181732 157591
# Create dataframe for ggplot
wordFreqDF <- data.frame(Word=names(wordFreq), Freq=wordFreq)
head(wordFreqDF)
## Word Freq
## just just 253300
## get get 244829
## like like 243628
## one one 224651
## will will 219982
## time time 195027
# Subset to the most frequent words
wordFreqDF75k <- subset(wordFreqDF, Freq>75000)
# Create word plot
wordPlot <- ggplot(wordFreqDF75k, aes(x = reorder(Word, -Freq), y = Freq)) +
geom_bar(stat = "identity") +
labs(x = "Word", y = "Frequency") +
theme(axis.text.x=element_text(angle=45, hjust=1))
# Display plot
wordPlot
# Create an even sparser matrix
dtmss <- removeSparseTerms(dtm, 0.01)
# Find close associates to the 3 most common terms
findAssocs(dtmss, "just", corlimit = 0.999999)
## $just
## bulldog burpe cain cobb code dds donna impost moos
## 1 1 1 1 1 1 1 1 1
## toot yoyo
## 1 1
findAssocs(dtmss, "get", corlimit = 0.999999)
## $get
## anatomi bethel biter cds cicero
## 1 1 1 1 1
## firefli girlfriend gump join los
## 1 1 1 1 1
## mccarthi monet participatori pilat sinus
## 1 1 1 1 1
## smoothi thunderbolt wanderlust zeta
## 1 1 1 1
findAssocs(dtmss, "like", corlimit = 0.999999)
## $like
## alba boy brandi charliz hampton leo plagiar pollen senor
## 1 1 1 1 1 1 1 1 1
## wink
## 1
The goal of this project is to create a prediction algorithm and R Shiny application that can predict the words a user will type, and suggest possible word selections using n-grams and close word associations. The size of the corpus may have to be limited even further to facilitate the clustering of words and near term associates.
Although the News document used in the corpus is the smallest text document, it contains less informal language and fewer incorrectly spelled words. This type of source document may be preferrable to the less formal language used in the Blog and Twitter posts.