knitr::opts_chunk$set(echo = TRUE, warning = FALSE, results = "show", message = FALSE)
time <- format(Sys.time(), "%a %b %d %X %Y")
require('quanteda')
## Loading required package: quanteda
## quanteda version 0.9.6.9
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:base':
##
## sample
require('ggplot2')
## Loading required package: ggplot2
This milestone report is the first assignment of the capstone course in the Johns Hopkins Data Science specialization. In it we explore a corpus of documents sourced from blogs, news articles, and Twitter feeds. The goal is straightforward: get familiar with the data, summarize its basic features, and lay the groundwork for the word-prediction application required later in the course.
The data were obtained from the Coursera course materials and consist of three English-language text files, each representing a distinct style of written communication: blog posts (en_US.blogs.txt), news articles (en_US.news.txt), and Twitter messages (en_US.twitter.txt).
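Before reading everything in, it is worth a quick check of how large the raw files are on disk. The snippet below is a small illustrative addition (it was not part of the original analysis; the paths match those used in the sampling code that follows):
#quick check of the raw file sizes in megabytes (illustrative addition, not run as part of this report)
rawFiles <- c('corpus/en_US.blogs.txt', 'corpus/en_US.news.txt', 'corpus/en_US.twitter.txt')
round(file.size(rawFiles) / 1024^2, 1)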
Because the text files are quite large and my system is a few years old, I decided to read each file into a character vector and sample from it, keeping the samples for the analysis that follows. The sample is equivalent to 1% of the lines in each file.
#read in the files of interest
#sample from the raw text prior to creating a corpus to reduce processing times
set.seed(1234) #set a pseudorandom seed for reproducibility
#read in the blogs and sample @ 1%
blog <- file('corpus/en_US.blogs.txt', open='rb')
blog.dat <- readLines(blog, encoding='UTF-8')
blog.sample <- sample(blog.dat, NROW(blog.dat)*0.01)
writeLines(blog.sample, 'corpus/sample/blogSample.txt')
close(blog)
#read in the news and sample @ 1%
news <- file('corpus/en_US.news.txt', open='rb')
news.dat <- readLines(news, encoding='UTF-8')
news.sample <- sample(news.dat, NROW(news.dat) * 0.01)
writeLines(news.sample, 'corpus/sample/newsSample.txt')
close(news)
#read in the tweets and sample @ 1%
tweets <- file('corpus/en_US.twitter.txt', open='rb')
tweets.dat <- readLines(tweets, encoding='UTF-8')
tweets.sample <- sample(tweets.dat, NROW(tweets.dat) * 0.01)
writeLines(tweets.sample, 'corpus/sample/tweetsSample.txt')
close(tweets)
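The three read/sample/write blocks above differ only in their file paths, so a small helper could remove the repetition. The sketch below is one possible refactoring; the sampleFile() name and the pct argument are illustrative and not part of the original script:
#possible refactoring of the repeated read/sample/write steps (illustrative sketch)
sampleFile <- function(inPath, outPath, pct = 0.01) {
  con <- file(inPath, open = 'rb')
  dat <- readLines(con, encoding = 'UTF-8')
  close(con)
  smp <- sample(dat, floor(length(dat) * pct))
  writeLines(smp, outPath)
  smp
}
#e.g. blog.sample <- sampleFile('corpus/en_US.blogs.txt', 'corpus/sample/blogSample.txt')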
#combine the three samples into a single character vector for tokenization
combinedSample <- c(blog.sample, news.sample, tweets.sample)
#create a data.frame summarizing the number of lines and characters in each raw file
numLines <- c(length(blog.dat), length(news.dat), length(tweets.dat))
numChars <- c(sum(nchar(blog.dat)), sum(nchar(news.dat)), sum(nchar(tweets.dat)))
datCounts <- data.frame(numLines, numChars)
datCounts$names <- c('Blogs', 'News', 'Tweets')
rm(numLines, numChars)
datCounts
##   numLines  numChars  names
## 1   899288 206824505  Blogs
## 2  1010242 203223159   News
## 3  2360148 162096031 Tweets
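Note that nchar() counts characters rather than words, which is why the column above is labelled numChars. If rough word counts are wanted, splitting each line on whitespace gives an estimate; the snippet below is an illustrative addition and was not run for this report:
#approximate word counts by splitting each line on whitespace (illustrative addition)
numApproxWords <- c(sum(lengths(strsplit(blog.dat, "\\s+"))),
                    sum(lengths(strsplit(news.dat, "\\s+"))),
                    sum(lengths(strsplit(tweets.dat, "\\s+"))))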
I tried various methods for sampling and preparing document-feature matrices (quanteda [1]) and document-term / term-document matrices (the CRAN tm package). In the end the lift was much easier with quanteda, which is built on data.table, a package with the distinction of being wicked fast at processing large quantities of data in R. Even with that speed-up, I stuck with the 1% sample to keep processing and test-cycle times short. After preparing the corpus, a number of interim exploratory steps were adapted and borrowed from various resources on the web, including StackOverflow and similar Natural Language Processing references [2, 3, 4, 5, 6, 7]. I tokenized the text and created unigrams, bigrams, and trigrams to show the distribution of individual words, word pairs, and word triples.
corpus <- corpus(textfile(file = 'corpus/sample/*'))
summary(corpus)
## Corpus consisting of 3 documents.
##
## Text Types Tokens Sentences
## blogSample.txt 34555 437975 21023
## newsSample.txt 35357 413153 19042
## tweetsSample.txt 33644 375301 25818
##
## Source: C:/Users/everhz05/Documents/datascience/capstone/* on x86-64 by everhz05
## Created: Sun Jun 12 21:31:21 2016
## Notes:
dfm <- dfm(corpus, stem = TRUE, ignoredFeatures = stopwords("english"))
## Creating a dfm from a corpus ...
## ... lowercasing
## ... tokenizing
## ... indexing documents: 3 documents
## ... indexing features: 58,262 feature types
## ... removed 173 features, from 174 supplied (glob) feature types
## ... stemming features (English), trimmed 16039 feature variants
## ... created a 3 x 42051 sparse dfm
## ... complete.
## Elapsed time: 4.15 seconds.
dfm
## Document-feature matrix of: 3 documents, 42,051 features.
topfeatures(dfm, 50)
## one will get said just like go time can u
## 3218 3201 3114 3042 3027 2952 2688 2572 2510 2325
## day year make love new now know good work think
## 2265 2127 1994 1985 1934 1849 1835 1802 1741 1648
## peopl thank want say see look back come need also
## 1635 1603 1594 1579 1569 1546 1518 1465 1409 1371
## first way thing last take use great even us much
## 1363 1360 1318 1313 1291 1289 1263 1257 1234 1226
## two today got well right start follow week realli still
## 1168 1164 1152 1140 1125 1115 1094 1078 1050 1019
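For comparison, the tm route mentioned earlier would look roughly like the sketch below (illustrative only; stemming in tm requires the SnowballC package, and the rest of this report sticks with the quanteda dfm):
#rough tm equivalent of the dfm step above (illustrative sketch, not used in the rest of the report)
library(tm)
tmCorpus <- VCorpus(VectorSource(combinedSample))
dtm <- DocumentTermMatrix(tmCorpus,
                          control = list(removePunctuation = TRUE,
                                         stopwords = TRUE,
                                         stemming = TRUE))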
#create unigrams
dfm_unigram <- dfm(combinedSample, ngrams = 1,
                   removeTwitter = TRUE,
                   stem = TRUE,
                   ignoredFeatures = stopwords("english"))
##
## ... lowercasing
## ... tokenizing
## ... indexing documents: 42,695 documents
## ... indexing features: 57,839 feature types
## ... removed 172 features, from 174 supplied (glob) feature types
## ... stemming features (English), trimmed 15679 feature variants
## ... created a 42695 x 41989 sparse dfm
## ... complete.
## Elapsed time: 2.01 seconds.
unigram_freq <- sort(colSums(dfm_unigram), decreasing = T)
topfeatures(dfm_unigram)
## will one get said just like go time can day
## 3201 3200 3114 3042 3027 2952 2688 2572 2510 2263
options(width = 200)
plot(dfm_unigram, max.words = 100)
unigram_df <- data.frame(Unigrams = names(unigram_freq), Frequency = unigram_freq)
plot1 <- ggplot(within(unigram_df[1:25, ], Unigrams <- factor(Unigrams, levels = Unigrams)),
                aes(x = reorder(Unigrams, Frequency), y = Frequency))
plot1 <- plot1 + geom_bar(stat = "identity", fill = "grey") + ggtitle("Top 25 Unigrams plot") +
  xlab("Unigrams") + ylab("Frequency")
plot1 <- plot1 + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip()
plot1
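Beyond the top-feature list, it is worth asking how concentrated the vocabulary is. The short calculation below is an illustrative addition (the 50% and 90% thresholds are arbitrary); because unigram_freq is sorted in decreasing order, it returns the number of stemmed word types needed to cover that share of all word instances in the sample:
#cumulative coverage of the sorted unigram frequencies (illustrative addition)
coverage <- cumsum(unigram_freq) / sum(unigram_freq)
min(which(coverage >= 0.5)) #types needed to cover half of all word instances
min(which(coverage >= 0.9)) #types needed to cover 90%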
#create bigrams (two word combinations)
dfm_bigrams <- dfm(combinedSample, ngrams = 2,
                   removeTwitter = TRUE,
                   stem = TRUE,
                   ignoredFeatures = stopwords("english")) #bigram terms are joined with the default '_' concatenator
##
## ... lowercasing
## ... tokenizing
## ... indexing documents: 42,695 documents
## ... indexing features: 446,683 feature types
## ... removed 239,251 features, from 174 supplied (glob) feature types
## ... stemming features (English), trimmed 4945 feature variants
## ... created a 42695 x 202488 sparse dfm
## ... complete.
## Elapsed time: 18.55 seconds.
bigram_freq <- sort(colSums(dfm_bigrams), decreasing = T)
topfeatures(dfm_bigrams)
## right_now last_year new_york last_night high_school years_ago feel_lik last_week looking_forward looks_lik
## 259 225 206 159 154 140 138 131 108 105
options(width = 200)
plot(dfm_bigrams, max.words = 25) #plot the top 25 bigrams in a wordcloud
bigram_df <- data.frame(Bigrams = names(bigram_freq), Frequency = bigram_freq)
plot2 <- ggplot(within(bigram_df[1:25, ], Bigrams <- factor(Bigrams)),
                aes(x = reorder(Bigrams, Frequency), y = Frequency))
plot2 <- plot2 + geom_bar(stat = "identity", fill = "grey") + ggtitle("Top 25 Bigrams plot") +
  xlab("Bigrams") + ylab("Frequency")
plot2 <- plot2 + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip()
plot2
#create trigrams (three word combinations)
dfm_trigrams <- dfm(combinedSample, ngrams = 3,
                    removeTwitter = TRUE,
                    #stem = TRUE,
                    ignoredFeatures = stopwords("english"))
##
## ... lowercasing
## ... tokenizing
## ... indexing documents: 42,695 documents
## ... indexing features: 775,548 feature types
## ... removed 662,616 features, from 174 supplied (glob) feature types
## ... created a 42695 x 112933 sparse dfm
## ... complete.
## Elapsed time: 25.99 seconds.
trigram_freq <- sort(colSums(dfm_trigrams), decreasing = T)
topfeatures(dfm_trigrams)
## new_york_city cinco_de_mayo let_us_know happy_new_year two_years_ago happy_mother's_day happy_mothers_day st_louis_county new_york_times world_war_ii
## 27 23 23 19 18 18 17 16 15 14
options(width = 200)
plot(dfm_trigrams, max.words = 10) #plot the top 10 trigrams in a wordcloud
trigram_df <- data.frame(Trigrams = names(trigram_freq),
                         Frequency = trigram_freq)
plot3 <- ggplot(within(trigram_df[1:25, ], Trigrams <- factor(Trigrams, levels = Trigrams)),
                aes(x = reorder(Trigrams, Frequency), y = Frequency))
plot3 <- plot3 + geom_bar(stat = "identity", fill = "grey") + ggtitle("Top 25 Trigrams plot") +
  xlab("Trigrams") + ylab("Frequency")
plot3 <- plot3 + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip()
plot3
I found this phase of the course difficult; it required a lot of research, testing, back-testing, trial and error, and borrowing bits and pieces from all over the web to cobble this together. That said, since this was my first experience with NLP and many of these techniques, I found the work invaluable and feel I learned a fair bit about what was once, to me, a completely unknown domain.
In terms of next steps, I will build an app that takes some text as input and predicts the next word, in line with the course requirement. A significant amount of additional research and testing still lies ahead.
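As a first sketch of how those n-gram tables could drive a prediction, the function below does a naive lookup: given the last two words typed, it finds the most frequent trigram that starts with them and falls back to the bigrams when nothing matches. It is purely illustrative; the predictNext() name, the reliance on the default '_' separator seen in the topfeatures() output, and the simple back-off rule are all assumptions rather than code from this report:
#naive next-word lookup over the trigram/bigram frequency vectors (illustrative sketch)
predictNext <- function(w1, w2, tri = trigram_freq, bi = bigram_freq) {
  hit <- grep(paste0('^', w1, '_', w2, '_'), names(tri), value = TRUE)
  if (length(hit) > 0) return(sub('.*_', '', hit[1])) #vectors are sorted, so the first match is the most frequent
  hit <- grep(paste0('^', w2, '_'), names(bi), value = TRUE)
  if (length(hit) > 0) return(sub('.*_', '', hit[1]))
  NA_character_
}
#e.g. predictNext('new', 'york') should return 'city' given the trigram counts shown above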
This HTML report was generated: Sun Jun 12 9:30:28 PM 2016.