The objective of this report is to demonstrate competency in working with the project’s training data and to explain, through exploratory analysis, the major features identified.
We will create a predictive text model using a large text corpus of documents as training data.
#Dir
setwd("D:/coursera/capstone/milestone"); list.files()
## [1] "data" "Milestone.html" "milestone.R" "Milestone.Rmd"
## [5] "sample"
#Load Libraries
library(stringi)
library(tm); library(NLP) # Loading required package: NLP
## Warning: package 'tm' was built under R version 3.2.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.3
library(RWeka) # for the n-gram models.
## Warning: package 'RWeka' was built under R version 3.2.4
library(ggplot2)
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.2.4
## Loading required package: RColorBrewer
# Download data from Coursera site & Unzip it
# Do this step only if it hasn't been run already.
if (!file.exists("data/Coursera-SwiftKey.zip")) {
download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
destfile = "data/Coursera-SwiftKey.zip")
unzip("data/Coursera-SwiftKey.zip", exdir = "data")
}
# Read Data files. "US" training data is used for this project.
blogs<-readLines("data/final/en_US/en_US.blogs.txt", encoding="UTF-8")
twitter<- readLines("data/final/en_US/en_US.twitter.txt", encoding="UTF-8", skipNul = TRUE)
# The news file is read through a binary connection so that all characters are preserved (reading it in text mode can stop early on this file).
conn <- file("data/final/en_US/en_US.news.txt", open = "rb")
news <- readLines(conn, encoding="UTF-8")
close(conn)
rm(conn)
The archive contains files in four languages, but only the English (en_US) files are used for this project.
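The contents of the extracted archive can be checked directly; a quick sketch (the folder names in the comment are what the standard Coursera-SwiftKey archive is expected to contain):
# List the language folders in the extracted data set;
# expected: "de_DE" "en_US" "fi_FI" "ru_RU"
list.files("data/final")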
rawWPL<-lapply(list(blogs,news,twitter),function(x) stri_count_words(x))
#File Stats
fileStats<-data.frame(
File=c("Blogs","News","Twitter"),
t(rbind(sapply(list(blogs,news,twitter),stri_stats_general),
TotalWords=sapply(list(blogs,news,twitter),stri_stats_latex)[4,]))
)
# Words per line summary
WPLStats<-data.frame(
File=c("Blogs","News","Twitter"),
WPL=rbind(summary(rawWPL[[1]]),summary(rawWPL[[2]]),summary(rawWPL[[3]]))
)
print(fileStats)
## File Lines LinesNEmpty Chars CharsNWhite TotalWords
## 1 Blogs 899288 899288 206824382 170389539 37570839
## 2 News 1010242 1010242 203223154 169860866 34494539
## 3 Twitter 2360148 2360148 162096241 134082806 30451170
print(WPLStats)
## File WPL.Min. WPL.1st.Qu. WPL.Median WPL.Mean WPL.3rd.Qu. WPL.Max.
## 1 Blogs 0 9 28 41.75 60 6726
## 2 News 1 19 32 34.41 46 1796
## 3 Twitter 1 7 12 12.75 18 47
All three files (Blogs, News and Twitter) contain roughly 30 to 38 million words each. The blogs file averages about 42 words per line, while its median is 28 words per line, which indicates that a small number of lines are very long.
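To confirm this long tail, the upper quantiles of the raw words-per-line counts can be inspected; a minimal sketch using the rawWPL list computed above (no exact values are claimed here):
# Upper quantiles of words per line for the blogs file; a 99th percentile far
# above the median confirms that a small number of lines are very long.
quantile(rawWPL[[1]], probs = c(0.50, 0.90, 0.99, 1.00))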
# Set sample size
set.seed(999)
percentage <- 0.2
# Binomial sampling
samplingFunc <- function(data, percent)
{ return(data[as.logical(rbinom(length(data),1,percent))]) }
blogsSmpl <- samplingFunc(blogs, percentage)
newsSmpl <- samplingFunc(news, percentage)
twittSmpl <- samplingFunc(twitter, percentage)
# Remove pieces that contain non-ASCII characters:
# split each sample on ", ", flag pieces that iconv() cannot convert to ASCII,
# drop the flagged pieces, and re-join the remainder into a single string.
removeNonASCII <- function(textVec) {
pieces <- unlist(strsplit(textVec, split = ", "))
badIdx <- grep("NONASCII", iconv(pieces, "latin1", "ASCII", sub = "NONASCII"))
if (length(badIdx) > 0) pieces <- pieces[-badIdx]
paste(pieces, collapse = ", ")
}
blogsSmpl2 <- removeNonASCII(blogsSmpl)
newsSmpl2 <- removeNonASCII(newsSmpl)
twittSmpl2 <- removeNonASCII(twittSmpl)
sampleAll <- c(blogsSmpl2, newsSmpl2, twittSmpl2)
dir.create("sample", showWarnings = FALSE)
write(blogsSmpl, "sample/blogsSmpl.txt")
write(newsSmpl, "sample/newsSmpl.txt")
write(twittSmpl, "sample/twittSmpl.txt")
remove(blogs); remove(news); remove(twitter)
remove(blogsSmpl2); remove(newsSmpl2); remove(twittSmpl2)
# Create corpus and clean the data
corpus <- VCorpus(VectorSource(sampleAll))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x)) # transformer function: replace a pattern with a space
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+") # Remove URLs
corpus <- tm_map(corpus, toSpace, "RT |via ") # Remove RTs and vias
corpus <- tm_map(corpus, toSpace, "@[^\\s]+") # Replace twitter accounts (@XXXXXX) by space
# content_transformer() keeps the documents as PlainTextDocument objects,
# so no re-wrapping with PlainTextDocument() is needed afterwards.
corpus <- tm_map(corpus, content_transformer(tolower)) # Convert all to lower case
corpus <- tm_map(corpus, removeWords, stopwords("en")) # Remove English stop words
corpus <- tm_map(corpus, removePunctuation) # Remove punctuation
corpus <- tm_map(corpus, removeNumbers) # Remove numbers
corpus <- tm_map(corpus, stripWhitespace) # Collapse repeated whitespace
profanity <- readLines("http://www.cs.cmu.edu/~biglou/resources/bad-words.txt")
corpus <- tm_map(corpus, removeWords, profanity)
save(corpus, file = "sample/corpus.RData")
# N-Gram Models.
# Create 1,2 and 3-gram functions for later use in the TermDocumentMatrix
uniTok <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biTok <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triTok <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
load("sample/corpus.RData")
#Create 1,2 or 3-gram TermDocumentMatrix files
getFreq <- function(tdm) {
freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
return(data.frame(word = names(freq), freq = freq))
}
FreqPlot <- function(data, label) {
ggplot(data[1:20,], aes(reorder(word, freq), freq)) +
labs(x = label, y = "Frequency") +
theme(axis.text.x = element_text(angle = 90, size = 10, hjust = 1)) +
geom_bar(stat = "identity", fill = I("green"), colour="darkgreen") +
coord_flip()
}
# Get frequencies of most common n-grams in data sample
freqUni <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = uniTok)), 0.999))
freqBi <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = biTok)), 0.999))
freqTri <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = triTok)), 0.9999))
FreqPlot(freqUni, "20 Most Frequent Unigrams")
FreqPlot(freqBi, "20 Most Frequent Bigrams")
FreqPlot(freqTri, "20 Most Frequent Trigrams")
The word ‘just’ is the most common single word, ‘right now’ is the most common two-word phrase, and ‘happy mothers day’ is the most common three-word phrase.
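These observations can be double-checked against the frequency tables themselves; a quick look at the top rows of each table:
# The tables returned by getFreq() are sorted by decreasing frequency,
# so the first rows are the most common n-grams.
head(freqUni, 3); head(freqBi, 3); head(freqTri, 3)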
# Word Cloud.
WCPlot <- function(data, label) {
wordcloud(words = data$word, freq = data$freq,
scale = c(3, 1), max.words = 50, random.order = FALSE, rot.per = 0.35,
use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
title(label)
}
par(mfrow = c(1, 3)) # Plotting Panel
WCPlot(freqUni, "50 Most Frequent Unigrams")
WCPlot(freqBi, "50 Most Frequent Bigrams")
WCPlot(freqTri, "50 Most Frequent Trigrams")
The word clouds above show the most frequent unigrams, bigrams and trigrams at a glance.
The data sets have been obtained and cleaned, and some exploratory analysis has been performed. I am confident that I can now start building the predictive models and, eventually, the data product. The high-level plan is as follows:
1. Develop the prediction algorithm using n-grams, possibly with tokens of one to four words (a rough sketch follows below).
2. Find more ways to clean up the data.
3. Obtain the required insights and find associations between tokens.
4. Finally, develop a Shiny app (or similar) to make word recommendations.
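As a rough illustration of step 1, the sketch below shows how the trigram and bigram tables built above could drive a naive next-word lookup: match the last two words typed against the start of the trigrams and return the third word of the best match, backing off to the bigrams when nothing matches. The function name predictNextWord and the plain string matching are assumptions for illustration only, not the final algorithm.
# Naive next-word prediction from the n-gram frequency tables (sketch only).
# freqTri and freqBi are the data frames created above, with columns
# 'word' (the n-gram as one string) and 'freq', sorted by decreasing frequency.
predictNextWord <- function(lastTwoWords, freqTri, freqBi) {
lastTwoWords <- tolower(trimws(lastTwoWords))
triWords <- as.character(freqTri$word)
# Trigrams whose first two words match the input; since the table is sorted
# by decreasing frequency, the first hit is the most frequent one.
hit <- triWords[grepl(paste0("^", lastTwoWords, " "), triWords)][1]
if (!is.na(hit)) return(tail(strsplit(hit, " ")[[1]], 1))
# Back off to bigrams keyed on the last word only.
lastWord <- tail(strsplit(lastTwoWords, " ")[[1]], 1)
biWords <- as.character(freqBi$word)
hit <- biWords[grepl(paste0("^", lastWord, " "), biWords)][1]
if (!is.na(hit)) return(tail(strsplit(hit, " ")[[1]], 1))
NA_character_
}
# Hypothetical usage: predictNextWord("happy mothers", freqTri, freqBi)
# might return "day" if "happy mothers day" is the top matching trigram.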