This report presents an exploratory analysis of the dataset and illustrates how to apply data science in the area of natural language processing. The training data come from public websites such as newspapers, personal blogs, and Twitter, collected into a corpus called HC Corpora.
The data are downloaded from the Capstone Dataset.
options(warn=-1)
## -- Load the stringi library for text manipulation --
library(stringi)
## -- Set Env --
set.seed(14432)
setwd("~/coursera/work/capstone")
## -- Declare Variables --
sourceUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
sourceFile <- "swiftkey.zip"
destDir <- "data"
## -- Check and Create dir --
if (!file.exists(destDir)) {
dir.create(destDir)
}
## -- Check if file downloaded, if not download from Web --
if (!file.exists(sourceFile)) {
download.file(sourceUrl, sourceFile, method="curl")
unzip(sourceFile, exdir=destDir)
}
The training data come in four languages: English (US), German (Germany), Finnish (Finland), and Russian (Russia).
However, we only study the English (US) data here, since we do not understand the other languages.
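The locale sub-directories can be listed directly from R to confirm this (a minimal sketch; it assumes the archive was extracted into data/final as in the download step above):
## -- (Sketch) list the locale sub-directories of the extracted archive --
list.dirs("data/final", full.names=FALSE, recursive=FALSE)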
## -- List files under directory data/final/en_US/ --
# ls -alh data/final/en_US/
total 557M
drwx------+ 1 I062308 ???????? 0 2015-03-21 17:53 .
drwx------+ 1 I062308 ???????? 0 2015-03-21 17:53 ..
-rwx------+ 1 I062308 ???????? 201M 2015-03-21 17:53 en_US.blogs.txt
-rwx------+ 1 I062308 ???????? 197M 2015-03-21 17:53 en_US.news.txt
-rwx------+ 1 I062308 ???????? 160M 2015-03-21 17:53 en_US.twitter.txt
## -- Line Count --
# wc -l data/final/en_US/en_US.blogs.txt
899288 data/final/en_US/en_US.blogs.txt
# wc -l data/final/en_US/en_US.news.txt
1010242 data/final/en_US/en_US.news.txt
# wc -l data/final/en_US/en_US.twitter.txt
2360148 data/final/en_US/en_US.twitter.txt
From an initial study of the data, the file sizes of the Blogs, News, and Twitter files are 201M, 197M, and 160M respectively, and they contain 899,288, 1,010,242, and 2,360,148 lines.
Interestingly, the Blogs file is the largest in size yet contains the fewest lines.
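The same figures can be reproduced from within R rather than the shell (a minimal sketch; paths as above, and reading each file fully with readLines is slow but simple):
## -- (Sketch) reproduce file sizes (MB) and line counts from within R --
enFiles <- file.path("data/final/en_US",
                     c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"))
round(file.info(enFiles)$size / 1024^2, 1)
sapply(enFiles, function(f) length(readLines(f, skipNul=TRUE)))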
There was some non-English or non-UTF-8 text in the dataset, so we specify the encoding type as UTF-8 when reading the files
and remove or normalize the remaining non-UTF-8 characters with iconv and regular expressions.
## -- Encoding Type UTF-8 --
encodingType <- "UTF-8"
## -- Import Blogs data and save it to file --
if (!dir.exists("data/raw")) dir.create("data/raw", recursive=TRUE)
blogsRaw <- readLines("data/final/en_US/en_US.blogs.txt", encoding=encodingType)
save(blogsRaw, file="data/raw/blogs.Raw")
## -- Import News data via a binary connection and save it to file --
newsConn <- file("data/final/en_US/en_US.news.txt", open="rb")
newsRaw <- readLines(newsConn, encoding=encodingType)
close(newsConn)
rm(newsConn)
save(newsRaw, file="data/raw/news.Raw")
## -- Import Twitter data, clean it, and save it to file --
tweetRaw <- readLines("data/final/en_US/en_US.twitter.txt", encoding=encodingType)
## -- Drop non UTF-8 characters --
tweetRaw <- iconv(tweetRaw, from="latin1", to=encodingType, sub="")
## -- Normalize curly quotes and backticks to plain ASCII quotes --
tweetRaw <- stri_replace_all_regex(tweetRaw, "\u2019|`", "'")
tweetRaw <- stri_replace_all_regex(tweetRaw, "\u201c|\u201d|\u201f|``", '"')
save(tweetRaw, file="data/raw/tweet.Raw")
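As a quick sanity check on the cleanup, the share of tweets that are valid UTF-8 (or pure ASCII) can be inspected with stringi (a minimal sketch using the tweetRaw object above):
## -- (Sketch) share of tweets that are valid UTF-8 / pure ASCII after cleanup --
mean(stri_enc_isutf8(tweetRaw))
mean(stri_enc_isascii(tweetRaw))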
Since the corpus is quite large, we use a random sample of 20 percent
of the lines in each dataset to reduce computation time.
## -- Sample data (20%) --
sampleBlogs <- sample(blogsRaw, round(0.2*length(blogsRaw)))
sampleNews <- sample(newsRaw, round(0.2*length(newsRaw)))
sampleTweet <- sample(tweetRaw, round(0.2*length(tweetRaw)))
## -- Save samples --
if (!dir.exists("data/sample")) dir.create("data/sample", recursive=TRUE)
save(sampleBlogs, file="data/sample/sampleBlogs.RData")
save(sampleNews, file="data/sample/sampleNews.RData")
save(sampleTweet, file="data/sample/sampleTweet.RData")
Before the exploratory analysis, we apply several cleaning steps: stripping extra whitespace, converting all characters to lower case, removing numbers and punctuation, and removing stop words and profanity words (the profanity list comes from Google’s “what do you love” project).
options(warn=-1)
## -- Load Text Mining Package --
library(tm)
## -- Small function to clean corpus --
cleanData <- function(corpusData, profanity){
## -- Strip whitespace --
corpusData <- tm_map(corpusData, stripWhitespace)
## -- To lower case --
corpusData <- tm_map(corpusData, content_transformer(tolower))
## -- Remove stop words --
corpusData <- tm_map(corpusData, removeWords, stopwords("english"))
## -- Remove punctuation --
corpusData <- tm_map(corpusData, removePunctuation)
## -- Remove numbers --
corpusData <- tm_map(corpusData, removeNumbers)
## -- Remove profanity words --
corpusData <- tm_map(corpusData, removeWords, profanity)
return(corpusData)
}
## -- Load profanity filter --
profanity <- read.csv("data/ProfanityWords.csv", header=FALSE, stringsAsFactors=FALSE)
profanity <- profanity$V1
## -- Build the cleaned corpora only if they have not been saved yet --
if (!file.exists("data/sample/corpusSampleBlogs.RData")) {
## -- Language as "en_US" --
en <- "en_US"
## -- Corpus Sampled Blogs Data --
corpusSampleBlogs <- Corpus(VectorSource(sampleBlogs), readerControl = list(language=en))
rm(sampleBlogs)
corpusSampleBlogs <- cleanData(corpusSampleBlogs, profanity)
save(corpusSampleBlogs, file= "data/sample/corpusSampleBlogs.RData")
## -- Corpus Sampled News Data --
corpusSampleNews <- Corpus(VectorSource(sampleNews), readerControl = list(language=en))
rm(sampleNews)
corpusSampleNews <- cleanData(corpusSampleNews, profanity)
save(corpusSampleNews, file= "data/sample/corpusSampleNews.RData")
## -- Corpus Sampled Tweet Data --
corpusSampleTweet <- Corpus(VectorSource(sampleTweet), readerControl = list(language=en))
rm(sampleTweet)
corpusSampleTweet <- cleanData(corpusSampleTweet, profanity)
save(corpusSampleTweet, file= "data/sample/corpusSampleTweet.RData")
} else {
setwd("~/coursera/work/capstone")
## -- Load Sample data --
load("data/sample/corpusSampleBlogs.RData")
load("data/sample/corpusSampleNews.RData")
load("data/sample/corpusSampleTweet.RData")
}
length(corpusSampleBlogs$content)
## [1] 179858
length(corpusSampleNews$content)
## [1] 202048
length(corpusSampleTweet$content)
## [1] 472030
## -- Convert to document-term matrices --
dtmBlogs <- DocumentTermMatrix(corpusSampleBlogs)
dtmNews <- DocumentTermMatrix(corpusSampleNews)
dtmTweet <- DocumentTermMatrix(corpusSampleTweet)
## -- Drop sparse terms (keep only terms appearing in at least 0.5% of documents) --
dtmBlogs <- removeSparseTerms(dtmBlogs,0.995)
dtmNews <- removeSparseTerms(dtmNews,0.995)
dtmTweet <- removeSparseTerms(dtmTweet,0.995)
## -- Term frequency --
dtmBlogsFreq <- sort(colSums(as.matrix(dtmBlogs)),decreasing = TRUE)
dtmNewsFreq <- sort(colSums(as.matrix(dtmNews)),decreasing = TRUE)
dtmTweetFreq <- sort(colSums(as.matrix(dtmTweet)),decreasing = TRUE)
## -- Show Term frequency structure --
str(dtmBlogsFreq)
## Named num [1:596] 24781 22541 19878 19460 19304 ...
## - attr(*, "names")= chr [1:596] "one" "will" "just" "like" ...
str(dtmNewsFreq)
## Named num [1:571] 50028 21810 16498 14047 11838 ...
## - attr(*, "names")= chr [1:571] "said" "will" "one" "new" ...
str(dtmTweetFreq)
## Named num [1:180] 30119 24360 22323 21054 20208 ...
## - attr(*, "names")= chr [1:180] "just" "like" "get" "love" ...
The term frequency distributions of the three text sources are shown in the histograms below. Since the Tweets document-term matrix retains only 180 terms after sparse-term removal, its vocabulary is much smaller than those of the blogs and news samples.
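For a quick numeric summary to accompany the histograms, the three frequency vectors can be summarized side by side (a minimal sketch using the objects defined above):
## -- (Sketch) numeric summary of the term-frequency vectors --
sapply(list(Blogs = dtmBlogsFreq, News = dtmNewsFreq, Tweets = dtmTweetFreq), summary)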
dtmBlogsFreq[1:10]
## one will just like can time get know people now
## 24781 22541 19878 19460 19304 17756 14167 11875 11867 11804
dtmNewsFreq[1:10]
## said will one new also two year can first just
## 50028 21810 16498 14047 11838 11578 11538 11522 10565 10556
dtmTweetFreq[1:10]
## just like get love good will day can thanks now
## 30119 24360 22323 21054 20208 18804 18194 17747 17746 16450
Blogs, news and tweets share the characteristic that they all use certain words intensively, such as will, one, just, like and can. Another observation is that the most frequent words in blogs and news are mostly nouns and adjectives, while tweets contain more verbs and web phrases such as lol.
## -- Histograms of word frequency --
options(warn=-1)
par(mfrow=c(1,3))
hist(dtmBlogsFreq, breaks=100, main="Histogram of Blogs term-freq")
abline(v=mean(dtmBlogsFreq),col=2)
hist(dtmNewsFreq, breaks=100, main="Histogram of News term-freq")
abline(v=mean(dtmNewsFreq),col=2)
hist(dtmTweetFreq, breaks=100, main="Histogram of Tweets term-freq")
abline(v=mean(dtmTweetFreq),col=2)
## -- Word cloud --
library(wordcloud)
library(RColorBrewer)
options(warn=-1)
palPaired <- brewer.pal(8,"Paired")
palDark2 <- brewer.pal(8,"Dark2")
par(mfrow=c(1,3))
wordcloud(names(dtmBlogsFreq), dtmBlogsFreq, min.freq=1000,
max.words=Inf, random.order=FALSE, rot.per=.15, colors=palDark2)
wordcloud(names(dtmNewsFreq), dtmNewsFreq, min.freq=2000,
max.words=Inf, random.order=FALSE, rot.per=.15, colors=palPaired)
wordcloud(names(dtmTweetFreq), dtmTweetFreq, min.freq=1000,
max.words=Inf, random.order=FALSE, rot.per=.15, colors=palDark2)
At this stage, we are exploring and familiarizing ourselves with the data and the text mining infrastructure in R.
The next step would be developing a text prediction algorithm.
1. Start from a basic n-gram model built on the exploratory analysis performed here, and explore more sophisticated algorithms if possible (see the sketch after this list).
2. Propose metrics for model efficiency and accuracy, evaluated with cross-validation.
3. Lastly, build a data product as a Shiny app that accepts an n-gram and predicts the next word.
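As a starting point for item 1, a naive bigram model can be sketched with base R and stringi alone. This is only a minimal sketch, not the final implementation; the tokenization is deliberately simple, and it assumes the sampleTweet object from the sampling step above is available.
## -- (Sketch) naive bigram counts as a starting point for an n-gram model --
library(stringi)
bigramCounts <- function(lines) {
  ## lower-case and keep only letters, apostrophes and spaces (naive tokenization)
  txt <- stri_trans_tolower(lines)
  txt <- stri_replace_all_regex(txt, "[^a-z' ]", " ")
  tokens <- stri_split_regex(txt, "\\s+", omit_empty=TRUE)
  ## build "word1 word2" pairs within each line and tabulate them
  pairs <- unlist(lapply(tokens, function(w) {
    if (length(w) < 2) return(character(0))
    paste(w[-length(w)], w[-1])
  }))
  sort(table(pairs), decreasing=TRUE)
}
## -- Return candidate next words after a given word, ranked by bigram count --
predictNext <- function(counts, word, n=3) {
  hits <- counts[startsWith(names(counts), paste0(word, " "))]
  head(sub("^\\S+ ", "", names(hits)), n)
}
## -- Example usage (commented out; assumes sampleTweet is loaded) --
# bigrams <- bigramCounts(sampleTweet)
# predictNext(bigrams, "happy")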