The goal is to build a simple model of the relationship between words as a first step towards a predictive text mining application. A basic n-gram model will be used to predict the next word based on the previous one, two, or three words, backing off to shorter histories whenever a particular n-gram has not been observed in the training data.
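As a rough illustration of the intended back-off behaviour (not part of the code in this report), a lookup could fall through from trigram to bigram to unigram tables; the table and function names below are hypothetical.
# illustrative sketch only: the n-gram tables are assumed to be named lists keyed by the
# preceding words, each element holding a named vector of next-word counts
predict_next <- function(history, trigram_counts, bigram_counts, unigram_counts) {
  words <- tail(strsplit(tolower(history), "\\s+")[[1]], 2)
  if (length(words) == 2) {
    key <- paste(words, collapse = " ")
    if (!is.null(trigram_counts[[key]])) return(names(which.max(trigram_counts[[key]])))
  }
  if (length(words) >= 1) {
    key <- tail(words, 1)
    if (!is.null(bigram_counts[[key]])) return(names(which.max(bigram_counts[[key]])))
  }
  names(which.max(unigram_counts))  # last resort: the most frequent word overall
}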
The data consists of three text files:
- a collection of tweets
- a collection of blog entries
- a collection of news items
Download text files from: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
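One way to fetch and unpack the data directly from R (run once; the archive expands into the final/ directory containing the en_US files used below):
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")
}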
Load the text mining, string processing and plotting packages.
library(knitr)
library(tm)
library(stringi)
library(quanteda)
library(ggplot2)
library(RWeka)
List the files, check their sizes (in megabytes) and read them in.
setwd("C:\\Users\\Ciru\\Documents\\Coursera\\Data Science Specialization\\Capstone Project\\Coursera-SwiftKey\\final\\en_US")
list.files(pattern = "^en_US.*txt$")
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
size <- round(file.info(c("en_US.blogs.txt",
"en_US.news.txt",
"en_US.twitter.txt"))$size/1024/1024, 2)
size
## [1] 200.42 196.28 159.36
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# profanity word list, to be used in the data cleaning step below
profanity <- read.table("profanelist.txt", stringsAsFactors = FALSE)
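If readLines() raises warnings on the news file (e.g. an incomplete final line), reading it through a binary connection is a common workaround:
con <- file("en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)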
The internal structure of the files:
#str(blogs)
#str(twitter)
#str(news)
summary(blogs)
## Length Class Mode
## 899288 character character
summary(twitter)
## Length Class Mode
## 2360148 character character
summary(news)
## Length Class Mode
## 77259 character character
Determine the number of lines, characters and words in each file, store the counts in a data frame, and plot them.
## Number of lines in each file
line_count <- c(length(blogs),
length(news),
length(twitter))
## Number of characters in each file
char_count <- c(sum(nchar(blogs)),
sum(nchar(news)),
sum(nchar(twitter)))
## Number of words
word_count <- c(sum(stri_count_words(blogs)),
sum(stri_count_words(news)),
sum(stri_count_words(twitter)))
filesstats <- cbind(line_count, char_count, word_count)
colnames(filesstats) <- c("Lines", "Characters", "Words")
rownames(filesstats) <- c("Blogs", "News", "Twitter")
filesstats
## Lines Characters Words
## Blogs 899288 206824505 37546246
## News 77259 15639408 2674536
## Twitter 2360148 162096241 30093410
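Since stringi is already loaded, its summary helper gives a quick cross-check of the line and character counts above:
stri_stats_general(blogs)   # Lines, LinesNEmpty, Chars, CharsNWhite
stri_stats_general(news)
stri_stats_general(twitter)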
# create a data frame to store counts
df <- data.frame(text.source = c("blogs", "news", "twitter"), char.count = NA, line.count = NA, word.count = NA)
df$char.count <- char_count
df$line.count <- line_count
df$word.count <- word_count
# plot the counts
g_char <- ggplot(df, aes(x = factor(text.source), y = char.count/1e+06)) +
geom_bar(stat = "identity") +
labs(y = "No. of characters (millions)", x = "Text File", title = "Number of characters per text file")
g_char
g_lines <- ggplot(df, aes(x = factor(text.source), y = line.count/1e+06)) +
geom_bar(stat = "identity") +
labs(y = "No. of lines (millions)", x = "Text File", title = "Number of lines per text file")
g_lines
g_words <- ggplot(df, aes(x = factor(text.source), y = word.count/1e+06)) +
geom_bar(stat = "identity") +
labs(y = "No. of words (millions)", x = "Text File", title = "Number of words per text file")
g_words
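As an alternative presentation, the three charts could be collapsed into one faceted plot; this sketch assumes the reshape2 package (not loaded above) to convert the counts to long format:
library(reshape2)
df.long <- melt(df, id.vars = "text.source", variable.name = "measure", value.name = "count")
ggplot(df.long, aes(x = text.source, y = count/1e+06)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ measure, scales = "free_y") +
  labs(x = "Text File", y = "Count (millions)", title = "File statistics by source")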
Take a random sample of 10,000 lines from each file.
# put files into a list
my.list <- list(blogs = blogs, news = news, twitter = twitter)
# create a new, empty list to store random selections
sample.list <- list(blog = NA, news = NA, twitter = NA)
# create a data frame for samples
sample.df <- data.frame(text.source = c("blog", "news", "twitter"),
line.count = NA, word.count = NA)
# sample 10,000 lines from each source
sampleblogs <- sample(blogs, 10000)
samplenews <- sample(news, 10000)
sampletwitter <- sample(twitter, 10000)
sample.list <- list(blogs = sampleblogs, news = samplenews, twitter = sampletwitter)
# get counts of sample.list
samplewordcount <- c(sum(stri_count_words(sampleblogs)),
sum(stri_count_words(samplenews)),
sum(stri_count_words(sampletwitter)))
sample.df$line.count <- sapply(sample.list, length)
sample.df$word.count <- samplewordcount
sample.df
## text.source line.count word.count
## 1 blog 10000 414927
## 2 news 10000 344727
## 3 twitter 10000 127214
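Note that sample() is random, so the word counts above will vary between runs; setting a seed before the sampling step makes the report reproducible, for example:
set.seed(1234)   # any fixed value; chosen arbitrarily here
sampleblogs <- sample(blogs, 10000)
samplenews <- sample(news, 10000)
sampletwitter <- sample(twitter, 10000)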
Build a corpus from the sampled text and clean it: remove hashtags (#), Twitter handles (@), URLs, numbers, punctuation, common English stop words and extra white space, and convert all text to lower case. Profanity can be filtered in the same way (see the commented-out step below).
# replace strings that match the patterns below with blanks
removeURL <- function(x) gsub("http[s]?://\\S+", "", x)
removeHashTags <- function(x) gsub("#\\S+", "", x)
removeTwitterHandles <- function(x) gsub("@\\S+", "", x)
# create a corpus object from the sampled text
text.corpus <- tm::Corpus(VectorSource(sample.list))
# Transformations
text.corpus <- tm::tm_map(text.corpus, content_transformer(removeHashTags))
text.corpus <- tm::tm_map(text.corpus, content_transformer(removeTwitterHandles))
text.corpus <- tm::tm_map(text.corpus, content_transformer(removeURL))
text.corpus <- tm::tm_map(text.corpus, content_transformer(tolower))
text.corpus <- tm::tm_map(text.corpus, removeNumbers)
text.corpus <- tm::tm_map(text.corpus, stripWhitespace)
text.corpus <- tm::tm_map(text.corpus, removePunctuation)
text.corpus <- tm::tm_map(text.corpus, removeWords, stopwords("english"))
# optional profanity filter; removeWords() expects a character vector, hence profanity[, 1]
#text.corpus <- tm::tm_map(text.corpus, removeWords, profanity[, 1])
Create a document-term matrix for each text source.
blog_dtm <- DocumentTermMatrix(text.corpus[1])
news_dtm <- DocumentTermMatrix(text.corpus[2])
twitter_dtm <- DocumentTermMatrix(text.corpus[3])
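A quick way to inspect a document-term matrix before building the frequency tables is tm's findFreqTerms(); the threshold of 100 here is arbitrary:
findFreqTerms(blog_dtm, lowfreq = 100)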
Determine the frequency of unique words
# Extract word frequencies from each DTM; pairing each count with its term via the
# column index ($j) keeps words and counts aligned regardless of entry order
freq.blog_df <- data.frame(word = blog_dtm$dimnames$Terms[blog_dtm$j], frequency = blog_dtm$v)
freq.news_df <- data.frame(word = news_dtm$dimnames$Terms[news_dtm$j], frequency = news_dtm$v)
freq.twitter_df <- data.frame(word = twitter_dtm$dimnames$Terms[twitter_dtm$j], frequency = twitter_dtm$v)
# Order by descending frequency
freq.blog_df <- plyr::arrange(freq.blog_df, -frequency)
freq.news_df <- plyr::arrange(freq.news_df, -frequency)
freq.twitter_df <- plyr::arrange(freq.twitter_df, -frequency)
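An equivalent route to the same frequency tables is to densify the DTM and take column sums, which is manageable at this sample size:
blog_freq <- sort(colSums(as.matrix(blog_dtm)), decreasing = TRUE)
head(blog_freq, 10)   # ten most frequent terms in the blog sample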
Plot the top 20 most frequent terms in each corpus
n <- 20L # variable to set top n words
# isolate top n words by decreasing frequency
blog.top <- freq.blog_df[1:n, ]
news.top <- freq.news_df[1:n, ]
twitter.top <- freq.twitter_df[1:n, ]
# reorder levels so charts plot in order of frequency
blog.top$word <- reorder(blog.top$word, blog.top$frequency)
news.top$word <- reorder(news.top$word, news.top$frequency)
twitter.top$word <- reorder(twitter.top$word, twitter.top$frequency)
# plots
g.blog.top <- ggplot(blog.top, aes(x = word, y = frequency)) +
geom_bar(stat = "identity", colour="white", fill="deepskyblue4") + coord_flip() +
labs(title = "Top 20 Most Frequent Terms: Blog Corpus")
g.blog.top
g.news.top <- ggplot(news.top, aes(x = word, y = frequency)) +
geom_bar(stat = "identity", colour="white", fill="firebrick2") + coord_flip() +
labs(title = "Top 20 Most Frequent Terms: News Corpus")
g.news.top
g.twitter.top <- ggplot(twitter.top, aes(x = word, y = frequency)) +
geom_bar(stat = "identity", colour="white", fill="darkolivegreen2") + coord_flip() +
labs(title = "Top 20 Most Frequent Terms: Twitter Corpus")
g.twitter.top
The next steps are to build n-gram frequency tables from the cleaned samples, develop the back-off word prediction algorithm described above, and present the results via a Shiny app.
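A sketch of that next step, building bigram and trigram term matrices with RWeka (loaded above but not yet used); these frequency tables would feed the back-off model:
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# note: recent tm versions apply custom tokenizers only to a VCorpus, so the cleaned
# text may first need to be rebuilt as VCorpus(VectorSource(...))
bigram_dtm <- DocumentTermMatrix(text.corpus, control = list(tokenize = BigramTokenizer))
trigram_dtm <- DocumentTermMatrix(text.corpus, control = list(tokenize = TrigramTokenizer))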