Through exploratory analysis of samples of blog posts, news stories, and Twitter tweets, we aim eventually to build a Natural Language Processing application that predicts a user's next word based on the word-choice patterns present in the unstructured data.
The first step in developing the application is to source and load the data, then clean it in R and tokenize it into n-grams as the basis for the predictive model.
The data were downloaded and unzipped from: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
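If the archive is not already present locally, it can also be downloaded and extracted directly from R. The sketch below is one way to do this; the destination file name is an assumption, and the extracted English files are assumed to be copied into ./Data afterwards to match the paths used below.
# Download and unzip the corpus archive if it is not already present
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
}
unzip("Coursera-SwiftKey.zip", exdir = "./Data")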
Load the required R libraries and set up the working environment.
library(knitr); library(dplyr); library(doParallel); library(tm)
library(SnowballC); library(stringi)
library(wordcloud); library(RWeka); library(ggplot2)
setwd("C:/Users/bensmith/Desktop/DataScience/10 Capstone")
path1 <- "./Data/en_US.blogs.txt"
path2 <- "./Data/en_US.news.txt"
path3 <- "./Data/en_US.twitter.txt"
# Read blogs data in binary mode
conn <- file(path1, open="rb")
blogs <- readLines(conn, encoding="UTF-8", skipNul=TRUE); close(conn)
# Read news data in binary mode
conn <- file(path2, open="rb")
news <- readLines(conn, encoding="UTF-8", skipNul=TRUE); close(conn)
# Read twitter data in binary mode
conn <- file(path3, open="rb")
twitter <- readLines(conn, encoding="UTF-8", skipNul=TRUE); close(conn)
# Remove temporary variable
rm(conn)
# Compute statistics and summary info for each data type
WPL <- sapply(list(blogs,news,twitter),function(x) summary(stri_count_words(x))[c('Min.','Mean','Max.')])
rownames(WPL) <- c('WPL_Min','WPL_Mean','WPL_Max')
stats <- data.frame(
FileName=c("en_US.blogs","en_US.news","en_US.twitter"),
t(rbind(
sapply(list(blogs,news,twitter),stri_stats_general)[c('Lines','Chars'),],
Words=sapply(list(blogs,news,twitter),stri_stats_latex)['Words',],
WPL)
))
head(stats)
## FileName Lines Chars Words WPL_Min WPL_Mean WPL_Max
## 1 en_US.blogs 899288 206824382 37570839 0 41.75108 6726
## 2 en_US.news 1010242 203223154 34494539 1 34.40997 1796
## 3 en_US.twitter 2360148 162096241 30451170 1 12.75065 47
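Since knitr is already loaded, the same summary could optionally be rendered as a formatted table:
# Optional: render the summary statistics as a formatted table
kable(stats, caption = "Summary statistics for the three English data files")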
Blogs have the most words per line. Tweets are subject to a system-imposed character limit, so it is no surprise that Twitter entries have the fewest words per line. Blogs are also not edited for space or content the way news stories are, so these results are intuitive.
Due to the size of the data, a sample of the data will be needed to continue.
The data will be cleansed and converted to corpora of text.
Cleansing consists of tidying and wrangling the data to remove punctuation, numbers, URLs, double spaces, repeated letters, stopwords, and profanity from the sample.
set.seed(5618)      # set seed for reproducibility
sample_frac <- 0.02 # sample size: 2% of the data
# Create subsets of the data
b_index <- sample(seq_len(length(blogs)), length(blogs) * sample_frac)
n_index <- sample(seq_len(length(news)), length(news) * sample_frac)
t_index <- sample(seq_len(length(twitter)), length(twitter) * sample_frac)
b_sub <- blogs[b_index]
n_sub <- news[n_index]
t_sub <- twitter[t_index]
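Because only the 2% samples are needed from this point on, the full vectors can be released to keep memory use down (a small optional housekeeping step):
# Free the full datasets once the samples have been drawn
rm(blogs, news, twitter, b_index, n_index, t_index)
gc()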
#Incorporate samples into single text corpus.
#Tidy data
corpus <- VCorpus(VectorSource(c(b_sub, n_sub, t_sub)),
readerControl=list(reader=readPlain,language="en")) # Make corpus
#Load profanity filter content
profanity<-readLines("profanity.csv")
# Create functions to transform the data
removeURL <- function(x) gsub("http\\S*", "", x)                      # strip full URLs, not just the "http" prefix
removeSign <- function(x) gsub("[[:punct:]]", "", x)                  # strip punctuation
removeNum <- function(x) gsub("[[:digit:]]", "", x)                   # strip digits
removeapo <- function(x) gsub("'", "", x)                             # strip apostrophes
removeNonASCII <- function(x) iconv(x, "latin1", "ASCII", sub = "")   # drop non-ASCII characters
removerepeat <- function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x) # collapse letters repeated 3+ times to two
toLowerCase <- function(x) tolower(x)                                 # convert to lowercase
removeSpace <- function(x) gsub("\\s+", " ", x)                       # collapse repeated whitespace
removeTh <- function(x) gsub(" th", "", x)                            # defined but not applied below
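A quick sanity check on a made-up string illustrates what the helpers are expected to do (the test string and the order of calls here are purely illustrative):
# Sanity check of the cleaning helpers on an invented example
test <- "Sooo excited!!! Visit http://example.com at 10pm"
removeSpace(removerepeat(removeSign(removeNum(removeURL(test)))))
# expected result, roughly: "Soo excited Visit at pm"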
# Wrangle data: remove punctuation, numbers, URLs, repeated letters, stopwords,
# profanity, and extra whitespace from the sample.
corpus <- tm_map(corpus, content_transformer(removeNonASCII)) # remove non-ASCII characters
corpus <- tm_map(corpus, content_transformer(removeapo))      # remove apostrophes
corpus <- tm_map(corpus, content_transformer(removeNum))      # remove numbers
corpus <- tm_map(corpus, content_transformer(removeURL))      # remove web URLs
corpus <- tm_map(corpus, content_transformer(removeSign))     # remove remaining punctuation
corpus <- tm_map(corpus, content_transformer(toLowerCase))    # convert to lowercase
corpus <- tm_map(corpus, content_transformer(removerepeat))   # collapse repeated letters within words
corpus <- tm_map(corpus, removeWords, stopwords("english"))   # remove ultra-common English words
corpus <- tm_map(corpus, removeWords, profanity)              # remove profanity
corpus <- tm_map(corpus, content_transformer(removeSpace))    # collapse double spaces left by the removals
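To confirm the transformations behaved as intended, a few cleaned documents can be inspected directly (a quick optional check):
# Inspect the first few cleaned entries
for (i in 1:3) writeLines(as.character(corpus[[i]]))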
Define functions to tokenize the sample data and construct matrices for 1-grams,
2-grams, and 3-grams.
#Tokenizing functions
uni_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bi_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tri_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
#Create Matrices
uni_matrix <- tm::TermDocumentMatrix(corpus, control = list(tokenize = uni_tokenizer))
bi_matrix <- tm::TermDocumentMatrix(corpus, control = list(tokenize = bi_tokenizer))
tri_matrix <- tm::TermDocumentMatrix(corpus, control = list(tokenize = tri_tokenizer))
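Before computing frequencies, it can be worth confirming the size (terms x documents) of each term-document matrix as an optional check:
# Dimensions of each term-document matrix: rows = terms, columns = documents
sapply(list(unigram = uni_matrix, bigram = bi_matrix, trigram = tri_matrix), dim)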
Calculate the frequency of terms in each matrix and construct a corresponding data frame. The low-frequency threshold is set at 50 for 1-grams and 2-grams, and lowered to 20 for 3-grams so that the small (2%) sample still yields at least 20 trigrams to report.
#Limit matrices to a low frequency threshold
uni_corpus <- findFreqTerms(uni_matrix,lowfreq = 50)
bi_corpus <- findFreqTerms(bi_matrix,lowfreq = 50)
tri_corpus <- findFreqTerms(tri_matrix,lowfreq = 20)
#Create data frames of N-Gram Frequencies
uni_corpus_freq <- rowSums(as.matrix(uni_matrix[uni_corpus,]))
uni_corpus_freq <- data.frame(word=names(uni_corpus_freq), frequency=uni_corpus_freq)
uni_corpus_freq <- uni_corpus_freq[order(-uni_corpus_freq$frequency),]
bi_corpus_freq <- rowSums(as.matrix(bi_matrix[bi_corpus,]))
bi_corpus_freq <- data.frame(word=names(bi_corpus_freq), frequency=bi_corpus_freq)
bi_corpus_freq <- bi_corpus_freq[order(-bi_corpus_freq$frequency),]
tri_corpus_freq <- rowSums(as.matrix(tri_matrix[tri_corpus,]))
tri_corpus_freq <- data.frame(word=names(tri_corpus_freq), frequency=tri_corpus_freq)
tri_corpus_freq <- tri_corpus_freq[order(-tri_corpus_freq$frequency),]
#display top 20 trigrams
head(tri_corpus_freq, n=20)
## word frequency
## happy mothers day happy mothers day 81
## cant wait see cant wait see 64
## new york city new york city 63
## let us know let us know 51
## two years ago two years ago 38
## happy new year happy new year 37
## im pretty sure im pretty sure 37
## dont even know dont even know 35
## president barack obama president barack obama 35
## new york times new york times 31
## cinco de mayo cinco de mayo 26
## looking forward seeing looking forward seeing 24
## new years eve new years eve 24
## cant wait get cant wait get 23
## im looking forward im looking forward 23
## ive ever seen ive ever seen 23
## st louis county st louis county 22
## world war ii world war ii 22
## two weeks ago two weeks ago 20
## will take place will take place 20
plot_n_grams <- function(data, title, num) {
  # Take the num most frequent n-grams and plot them as an ordered bar chart
  df2 <- data[order(-data$frequency), ][1:num, ]
  ggplot(df2, aes(x = reorder(word, -frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = "royalblue1", colour = "slategray4", width = 0.80) +
    labs(title = title, x = "n-Grams", y = "Frequency") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
plot_n_grams(uni_corpus_freq,"Top 1-grams",20)
plot_n_grams(bi_corpus_freq,"Top 2-grams",20)
plot_n_grams(tri_corpus_freq,"Top 3-grams",20)
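The wordcloud package was loaded earlier but not yet used; as an additional view of the 1-gram frequencies, a word cloud can be drawn (a minimal optional sketch):
# Word cloud of the most frequent unigrams
wordcloud(words = uni_corpus_freq$word, freq = uni_corpus_freq$frequency,
          max.words = 100, random.order = FALSE)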
Exploring the data and constructing n-grams from a sample set supports the plan to build a predictive model of next-word selection, to be incorporated later into a Shiny web application that makes predictions from text entered by the user in a form.
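As a concrete first step toward that model, the n-gram frequency tables built above can be persisted for reuse by the prediction algorithm and the Shiny app (the file names below are placeholders):
# Save the n-gram frequency tables for later use in the prediction model
saveRDS(uni_corpus_freq, "unigram_freq.rds")
saveRDS(bi_corpus_freq, "bigram_freq.rds")
saveRDS(tri_corpus_freq, "trigram_freq.rds")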
Any feedback on this plan is greatly appreciated.