This is the Week 2 milestone report for the Data Science Specialization Capstone Project. It is the first step toward the final goal: building a prediction algorithm and a Shiny app that implements a predictive text model. The model will be trained on a collection of English text compiled from 3 sources - news, blogs, and tweets.
The main parts of this report are loading and cleaning the data and applying NLP (natural language processing) techniques in R, as groundwork for building the model.
The data can be downloaded from:
https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
I have chosen to use only the en_US dataset for this project.
Twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
New <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE )
Blog <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE )
I calculated summary statistics for the data: the file size, the number of lines, and the number of characters for each of the 3 datasets (Twitter, news, and blogs).
Files <- c("Twitter", "New", "Blog")
FileInfoTwitter <- file.info("en_US.twitter.txt")
sizeTwitter <- FileInfoTwitter$size
sizeTwitterMB <- round(sizeTwitter/1024^2, 4)
FileInfoNew <- file.info("en_US.news.txt")
sizeNew <- FileInfoNew$size
sizeNewMB <- round(sizeNew/1024^2, 4)
FileInfoBlog <- file.info("en_US.blogs.txt")
sizeBlog <- FileInfoBlog$size
sizeBlogMB <- round(sizeBlog/1024^2, 4)
SizeMB <- c(sizeTwitterMB, sizeNewMB, sizeBlogMB)
Lines <- c(length(Twitter), length(New), length(Blog))
Chars <- c(sum(nchar(Twitter)), sum(nchar(New)), sum(nchar(Blog)))
DataSummary <- cbind.data.frame(Files, SizeMB, Lines, Chars)
DataSummary
## Files SizeMB Lines Chars
## 1 Twitter 159.3641 2360148 162096241
## 2 New 196.2775 77259 15639408
## 3 Blog 200.4242 899288 206824505
Because the datasets are very large, we will take a sample from each file to build the model. The sampled data will then be cleaned.
set.seed(2009)
sTwitter <- sample(Twitter, length(Twitter)*.01)
sNew <- sample(New, length(New)*.01)
sBlog <- sample(Blog, length(Blog)*.01)
sampleData <- c(sTwitter, sNew, sBlog)
sampleData <- iconv(sampleData, "UTF-8", "ASCII", sub="")
Using the tm package, I will clean the data by converting all characters to lowercase, stripping extra white space, removing numbers, removing punctuation, removing profanity and English stopwords, and converting the text to plain text documents.
library(tm)
library(NLP)
corpus <- VCorpus(VectorSource(sampleData))
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
#Read in Profanity List
profanity <- readLines('badWords.txt')
corpus <- tm_map(corpus, removeWords, profanity)
corpus <- tm_map(corpus, removeWords, stopwords('english') )
corpus <- tm_map(corpus, PlainTextDocument)
I use the RWeka package to construct tokenizer functions and build term-document matrices of unigrams, bigrams, and trigrams from the sample.
library(RWeka)
unigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
unigrams <- TermDocumentMatrix(corpus, control = list(tokenize = unigramTokenizer))
bigrams <- TermDocumentMatrix(corpus, control = list(tokenize = bigramTokenizer))
trigrams <- TermDocumentMatrix(corpus, control = list(tokenize = trigramTokenizer))
unigrams
## <<TermDocumentMatrix (terms: 44095, documents: 33365)>>
## Non-/sparse entries: 341663/1470888012
## Sparsity : 100%
## Maximal term length: 119
## Weighting : term frequency (tf)
bigrams
## <<TermDocumentMatrix (terms: 289055, documents: 33365)>>
## Non-/sparse entries: 342134/9643977941
## Sparsity : 100%
## Maximal term length: 125
## Weighting : term frequency (tf)
trigrams
## <<TermDocumentMatrix (terms: 309238, documents: 33365)>>
## Non-/sparse entries: 312001/10317413869
## Sparsity : 100%
## Maximal term length: 131
## Weighting : term frequency (tf)
We can now find the most frequent terms of each n-gram type and plot their frequencies.
unigrams_freqTerm <- findFreqTerms(unigrams,lowfreq = 50)
bigrams_freqTerm <- findFreqTerms(bigrams,lowfreq=50)
trigrams_freqTerm <- findFreqTerms(trigrams,lowfreq=8)
## Unigram frequency data frame
unigrams_freq <- rowSums(as.matrix(unigrams[unigrams_freqTerm,]))
unigrams_freq <- data.frame(word=names(unigrams_freq), frequency=unigrams_freq)
head(unigrams_freq)
## word frequency
## able able 186
## absolutely absolutely 69
## accept accept 52
## access access 62
## according according 86
## account account 80
## Bigram frequency data frame
bigrams_freq <- rowSums(as.matrix(bigrams[bigrams_freqTerm,]))
bigrams_freq <- data.frame(word=names(bigrams_freq), frequency=bigrams_freq)
head(bigrams_freq)
## word frequency
## can get can get 101
## can see can see 62
## cant wait cant wait 172
## come back come back 50
## dont get dont get 59
## dont know dont know 154
## Trigram frequency data frame
trigrams_freq <- rowSums(as.matrix(trigrams[trigrams_freqTerm,]))
trigrams_freq <- data.frame(word=names(trigrams_freq), frequency=trigrams_freq)
head(trigrams_freq)
## word frequency
## cake cake cake cake cake cake 8
## cant wait get cant wait get 8
## cant wait see cant wait see 39
## cinco de mayo cinco de mayo 9
## come see us come see us 11
## dont even know dont even know 15
library(ggplot2)
plot_n_grams <- function(df_gram, title, num, barC) {
df_sort <- df_gram[order(-df_gram$frequency),][1:num,]
ggplot(data = df_sort, aes(x = reorder(word, -frequency), y = frequency)) +
geom_bar(stat = "identity", fill = barC, colour = "black") +
coord_cartesian(xlim = c(0, num+1)) +
labs(title = title) +
xlab("Words") +
ylab("Count") +
theme(axis.text.x=element_text(angle=90))
}
plot_n_grams(unigrams_freq,"Top 40 Unigrams",40,"blue")
plot_n_grams(bigrams_freq,"Top 40 Bigrams",40,"orange")
plot_n_grams(trigrams_freq,"Top 40 Bigrams",40,"green")
With the exploratory analysis complete, we can move forward with building the predictive model using the n-gram frequency data.
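As a rough sketch of the intended approach (not the final implementation), the frequency data frames built above can drive a simple backoff lookup: match the last two words of the input against the trigram table, fall back to the bigram table on the last word, and finally fall back to the most frequent unigrams. The helper name predictNextWord and the prefix matching via grepl are placeholders of my own; the sketch assumes the unigrams_freq, bigrams_freq, and trigrams_freq data frames from the previous step are still in the session.
# Illustrative next-word sketch (placeholder, not the final model)
predictNextWord <- function(phrase, n = 3) {
    # Clean the input roughly the same way the corpus was cleaned
    phrase <- tolower(gsub("[[:punct:]]", "", phrase))
    words <- unlist(strsplit(phrase, "\\s+"))
    words <- words[words != ""]
    # Try trigrams first: match the last two words of the input
    if (length(words) >= 2) {
        prefix <- paste(tail(words, 2), collapse = " ")
        hits <- trigrams_freq[grepl(paste0("^", prefix, " "), trigrams_freq$word), ]
        if (nrow(hits) > 0) {
            hits <- hits[order(-hits$frequency), ][seq_len(min(n, nrow(hits))), ]
            return(sapply(strsplit(as.character(hits$word), " "), `[`, 3))
        }
    }
    # Back off to bigrams: match the last word only
    if (length(words) >= 1) {
        prefix <- tail(words, 1)
        hits <- bigrams_freq[grepl(paste0("^", prefix, " "), bigrams_freq$word), ]
        if (nrow(hits) > 0) {
            hits <- hits[order(-hits$frequency), ][seq_len(min(n, nrow(hits))), ]
            return(sapply(strsplit(as.character(hits$word), " "), `[`, 2))
        }
    }
    # Final fallback: the most frequent unigrams overall
    as.character(unigrams_freq[order(-unigrams_freq$frequency), ]$word[seq_len(n)])
}
predictNextWord("cant wait")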
The app will be built using the Shiny package and should allow the user to enter text, after which the model suggests the next word(s).
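A bare-bones Shiny sketch of that interaction might look like the following; it assumes the illustrative predictNextWord() helper sketched above and is only a starting point, not the finished app.
library(shiny)
# Minimal sketch of the planned app: the user types a phrase and the
# suggested next words are printed below the input.
ui <- fluidPage(
    titlePanel("Next Word Prediction (sketch)"),
    textInput("phrase", "Enter a phrase:"),
    verbatimTextOutput("suggestions")
)
server <- function(input, output) {
    output$suggestions <- renderPrint({
        req(input$phrase)
        predictNextWord(input$phrase)
    })
}
shinyApp(ui = ui, server = server)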