This is the Milestone Report for the Coursera Data Science Capstone (SwiftKey application). We created a corpus from text data downloaded from the internet and performed the following exploratory analysis:

- Downloaded the text files (Twitter, Blogs and News) from the web.
- Obtained a small random sample (about 31,000 lines) combining the three sources.
- Created a comparison table showing the file size, the number of lines and the number of words in each source, including the sample file.
- Created a corpus and normalized the text: converted to lower case and removed punctuation, extra whitespace, numbers, URLs and stopwords.
- Extracted unigrams, bigrams, trigrams and four-grams.
- Plotted the data to identify features and patterns.
# Loading libraries
library(tm)
## Warning: package 'tm' was built under R version 3.1.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.1.3
library(RWeka)
library(SnowballC)
library(googleVis)
##
## Welcome to googleVis version 0.5.8
##
## Please read the Google API Terms of Use
## before you start using the package:
## https://developers.google.com/terms/
##
## Note, the plot method of googleVis will by default use
## the standard browser to display its output.
##
## See the googleVis package vignettes for more details,
## or visit http://github.com/mages/googleVis.
##
## To suppress this message use:
## suppressPackageStartupMessages(library(googleVis))
library(RColorBrewer) # Generate palette of colours for plots.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:NLP':
##
## annotate
library(magrittr)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.1.3
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.1.3
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(stringi) # For character string analysis.
library(wordcloud)
setwd("~/Documents/COURSERA_CAPSTONE/final/en_US")
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul=TRUE)
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul=TRUE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul=TRUE)
sTwitter <- twitter[sample(1:length(twitter),20000)]
sNews <- news[sample(1:length(news), 1000)]
sBlogs <- blogs[sample(1:length(blogs), 10000)]
textSample <- c(sTwitter,sNews,sBlogs)
writeLines(textSample, "textSample.txt")
con <- file("textSample.txt", "r")
txtSample <- readLines(con)
close(con)
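Note that the sampling above is not seeded, so the exact sample changes from run to run. A minimal sketch of how the draw could be made reproducible (the seed value is arbitrary and not part of the original analysis):
set.seed(1234) # Arbitrary seed, purely illustrative: makes the random sample repeatable
sTwitter <- twitter[sample(seq_along(twitter), 20000)] # Same sampling as above, now reproducible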
# Determining number of lines for each file
twitterLength <- length(twitter) # 2360148
blogsLength <- length(blogs) # 899288
newsLength <- length(news) # 1010242
sampleLength <- length(txtSample) # 31000
# Determining Word counts for each file
twitterWords <- sum(sapply(gregexpr("\\S+", twitter), length))
blogsWords <- sum(sapply(gregexpr("\\S+", blogs), length))
newsWords <- sum(sapply(gregexpr("\\S+", news), length))
sampleWords <- sum(sapply(gregexpr("\\S+", txtSample), length))
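Since stringi is already loaded, the same counts could also be obtained with stri_count_words(), which is usually faster on large character vectors. This is an alternative sketch, not the method used for the table below, and its word-boundary rules may give slightly different totals than the \\S+ regex:
# Alternative word counts with stringi (word-boundary based, totals may differ slightly)
twitterWords2 <- sum(stri_count_words(twitter))
blogsWords2 <- sum(stri_count_words(blogs))
newsWords2 <- sum(stri_count_words(news))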
# Determining File size
twitterSize <- file.info("en_US.twitter.txt")$size / 1024.0 / 1024.0
blogsSize <- file.info("en_US.blogs.txt")$size / 1024.0 / 1024.0
newsSize <- file.info("en_US.news.txt")$size / 1024.0 / 1024.0
sampleSize <- file.info("textSample.txt")$size / 1024.0 / 1024.0
# Creating a table
FileNames <- c("Twitter","Blogs","News", "Sample")
FileSizes <- c(round(twitterSize, digits = 2), round(blogsSize,digits = 2),
round(newsSize, digits = 2), round(sampleSize, digits = 2))
fileTable <- data.frame(
    fileName = FileNames,
    fileSize = FileSizes,
    lineCount = c(twitterLength, blogsLength, newsLength, sampleLength),
    wordCount = c(twitterWords, blogsWords, newsWords, sampleWords)
)
colnames(fileTable) <- c("File Name", "File Size (MB)", "Line Count", "Word Count")
saveRDS(fileTable, file = "fileTable.Rda")
fileTableDF <- readRDS("fileTable.Rda")
fileTable
## File Name File Size (MB) Line Count Word Count
## 1 Twitter 159.36 2360148 30373603
## 2 Blogs 200.42 899288 37334147
## 3 News 196.28 1010242 34372530
## 4 Sample 3.73 31000 705079
corpus <- Corpus(VectorSource(txtSample))
corpus <- tm_map(corpus, content_transformer(tolower), lazy = TRUE)
corpus <- tm_map(corpus, content_transformer(removePunctuation))
corpus <- tm_map(corpus, content_transformer(removeNumbers))
rmURL <- function(x) gsub("http[[:alnum:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(rmURL))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, PlainTextDocument)
corpusDF <- data.frame(text=unlist(sapply(corpus,"[", "content")), stringsAsFactors = FALSE)
meta(corpus)
## data frame with 0 columns and 31000 rows
dtm <- DocumentTermMatrix(corpus)
In the plots below we can observe the most frequent terms for each n-gram size, as well as the terms shared across the different n-gram sizes and samples.
dtmCloud <- as.matrix(dtm) # Convert corpus to matrix
frequency <- colSums(dtmCloud) # Create a vector with term frequencies
frequency <- sort(frequency, decreasing=TRUE)
words <- names(frequency)
wordcloud(words[1:100], frequency[1:100], colors=brewer.pal(8, "Dark2"))
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
wdf <- data.frame(word=names(freq), freq=freq)
subset(wdf, freq>1000) %>%
ggplot(aes(word, freq)) +
geom_bar(stat="identity", fill="slateblue", colour="navyblue") +
theme(axis.text.x=element_text(angle=45, hjust=1))
unigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max =1))
options(mc.cores=1)
dtm1 <- DocumentTermMatrix(corpus, control = list(tokenize = unigramTokenizer))
freq1 <- sort(colSums(as.matrix(dtm1)), decreasing=TRUE)
wcount1 <- data.frame(word=names(freq1), freq=freq1)
p1 <- ggplot(subset(wcount1, freq1 > 1000) ,aes(word, freq))
p1 <- p1 + geom_bar(stat="identity", fill="lightblue3", colour="navyblue")
p1 + theme(axis.text.x=element_text(angle=45, hjust=1)) + ggtitle("Unigram Frequency")
mTwitter <- twitter[sample(1:length(twitter),1000)]
mNews <- news[sample(1:length(news), 1000)]
mBlogs <- blogs[sample(1:length(blogs), 1000)]
miniSample <- c(mTwitter,mNews,mBlogs)
writeLines(miniSample, "miniSample.txt")
conMini <- file("miniSample.txt", "r")
miniSample <- readLines(conMini)
close(conMini)
mcorpus <- Corpus(VectorSource(miniSample))
mcorpus <- tm_map(mcorpus, content_transformer(tolower), lazy = TRUE)
mcorpus <- tm_map(mcorpus, content_transformer(removePunctuation))
mcorpus <- tm_map(mcorpus, content_transformer(removeNumbers))
rmURL <- function(x) gsub("http[[:alnum:]]*", "", x)
mcorpus <- tm_map(mcorpus, content_transformer(rmURL))
mcorpus <- tm_map(mcorpus, stripWhitespace)
mcorpus <- tm_map(mcorpus, removeWords, stopwords("english"))
mcorpus <- tm_map(mcorpus, PlainTextDocument)
mdtm <- DocumentTermMatrix(mcorpus)
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
options(mc.cores=1)
dtm2 <- DocumentTermMatrix(mcorpus, control = list(tokenize = bigramTokenizer))
freq2 <- sort(colSums(as.matrix(dtm2)), decreasing=TRUE)
wcount2 <- data.frame(word=names(freq2), freq=freq2)
p2 <- ggplot(subset(wcount2, freq2 > 10) ,aes(word, freq))
p2 <- p2 + geom_bar(stat="identity", fill="lightblue3", colour="navyblue")
p2 + theme(axis.text.x=element_text(angle=45, hjust=1)) + ggtitle("Bigram Frequency")
trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max =3))
options(mc.cores=1)
dtm3 <- DocumentTermMatrix(mcorpus, control = list(tokenize = trigramTokenizer))
freq3 <- sort(colSums(as.matrix(dtm3)), decreasing=TRUE)
wcount3 <- data.frame(word=names(freq3), freq=freq3)
p3 <- ggplot(subset(wcount3, freq3 > 2) ,aes(word, freq))
p3 <- p3 + geom_bar(stat="identity", fill="lightblue3", colour="navyblue")
p3 + theme(axis.text.x=element_text(angle=45, hjust=1)) + ggtitle("Trigram Frequency")
fourgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max =4))
options(mc.cores=1)
dtm4 <- DocumentTermMatrix(mcorpus, control = list(tokenize = fourgramTokenizer))
freq4 <- sort(colSums(as.matrix(dtm4)), decreasing=TRUE)
wcount4 <- data.frame(word=names(freq4), freq=freq4)
p4 <- ggplot(subset(wcount4, freq4 > 1) ,aes(word, freq))
p4 <- p4 + geom_bar(stat="identity", fill="lightblue3", colour="navyblue")
p4 + theme(axis.text.x=element_text(angle=45, hjust=1)) + ggtitle("Fourgram Frequency")
dtm2Cloud <- as.matrix(dtm2) # Convert bigram DTM to matrix
frequency2 <- colSums(dtm2Cloud) # Bigram frequencies
frequency2 <- sort(frequency2, decreasing=TRUE)
words2 <- names(frequency2)
wordcloud(words2[1:50], frequency2[1:50]) # Top 50 bigrams
dtm3Cloud <- as.matrix(dtm3) # Convert trigram DTM to matrix
frequency3 <- colSums(dtm3Cloud) # Trigram frequencies
frequency3 <- sort(frequency3, decreasing=TRUE)
words3 <- names(frequency3)
wordcloud(words3[1:50], frequency3[1:50]) # Top 50 trigrams
To avoid further difficulties in handling this large amount of data, I plan to use Python NLTK to download, clean and consolidate all the files, and then use R to create the Shiny app. I also built a basic algorithm that searches the unigram, bigram, trigram and four-gram tables to predict the next term (Quiz 2). Initially it reached about 60% accuracy, which I attributed to the small sample size; as I increased the sample size the accuracy improved, and I expect a significant further improvement once all the available text is ingested into the corpus.
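For illustration only, here is a minimal sketch of that kind of back-off lookup. The helper name predictNext is hypothetical, it reuses the wcount2, wcount3 and wcount4 frequency tables built above rather than the exact algorithm used for the quiz, and it assumes the input phrase has already been cleaned the same way as the corpus (lower case, no punctuation or stopwords).
# Hypothetical back-off predictor: try the longest matching context first (three words,
# matched against the four-gram table), then back off to shorter contexts.
predictNext <- function(phrase, w2 = wcount2, w3 = wcount3, w4 = wcount4) {
    tokens <- tolower(unlist(strsplit(phrase, "\\s+")))
    tables <- list(w2, w3, w4) # tables[[n]] holds (n+1)-grams: an n-word context plus one word
    for (n in 3:1) {
        if (length(tokens) < n) next
        context <- paste(tail(tokens, n), collapse = " ")
        tbl <- tables[[n]]
        hits <- tbl[grepl(paste0("^", context, " "), as.character(tbl$word)), ]
        if (nrow(hits) > 0) {
            best <- as.character(hits$word[which.max(hits$freq)])
            return(tail(unlist(strsplit(best, " ")), 1)) # last word of the most frequent n-gram
        }
    }
    NA_character_ # no match at any order
}
# Hypothetical usage: predictNext("one of the")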