This is the Milestone Report for the Coursera Data Science Capstone project. The goal of the capstone project is to create a predictive text model using a large text corpus of documents as training data. Natural language processing techniques will be used to perform the analysis and build the predictive model.
This milestone report describes the major features of the training data with our exploratory data analysis and summarizes our plans for creating the predictive model.
Location: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
The data sets consist of text from 3 different sources:
1) News,
2) Blogs and
3) Twitter feeds.
The text data are provided in 4 different languages (German, English, Finnish and Russian). This report uses only the English (en_US) files.
library(tm)
## Loading required package: NLP
library(stringi)
library(RWeka)
library(wordcloud)
## Loading required package: RColorBrewer
library(stringi)
# NOTE(review): hard-coded, machine-specific working directory. The relative
# file names passed to readLines() and file.info() below resolve against it,
# so the script only runs as-is on a machine that has this exact path.
setwd("C:\\Users\\mahajvi1\\Desktop\\Coursera_capstone\\final\\en_US\\")
We first examine the data sets and summarize our findings (file sizes, line counts, word counts, and mean words per line) below.
# Read every line of a text file as UTF-8.
#
# The file is opened in binary mode ("rb") on purpose: when readLines() is
# given a file name it opens a text-mode connection, and on Windows a
# text-mode read stops at an embedded 0x1A (Ctrl-Z) byte, silently truncating
# the result. readLines() accepts LF, CRLF or CR line endings in any mode, so
# binary mode loses nothing. The connection is closed on exit.
#
# path: path of the file to read.
# Returns a character vector with one element per line.
read_text_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  # "UTF-8" is one of the encodings readLines() can mark strings with;
  # embedded NULs are skipped and the incomplete-final-line warning is muted.
  readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
}

blogs <- read_text_lines("en_US.blogs.txt")
news <- read_text_lines("en_US.news.txt")
twitter <- read_text_lines("en_US.twitter.txt")

# Get file sizes in MB (bytes / 1024^2)
blogs.size <- file.info("en_US.blogs.txt")$size / 1024^2
news.size <- file.info("en_US.news.txt")$size / 1024^2
twitter.size <- file.info("en_US.twitter.txt")$size / 1024^2

# Get the number of words on each line of each file
blogs.words <- stri_count_words(blogs)
news.words <- stri_count_words(news)
twitter.words <- stri_count_words(twitter)

# Summary of the data sets (left as a top-level expression so it is printed)
data.frame(
  source = c("blogs", "news", "twitter"),
  file.size.MB = c(blogs.size, news.size, twitter.size),
  num.lines = c(length(blogs), length(news), length(twitter)),
  num.words = c(sum(blogs.words), sum(news.words), sum(twitter.words)),
  mean.num.words = c(mean(blogs.words), mean(news.words), mean(twitter.words))
)
## source file.size.MB num.lines num.words mean.num.words
## 1 blogs 200.4242 899288 38154238 42.42716
## 2 news 196.2775 77259 2693898 34.86840
## 3 twitter 159.3641 2360148 30218166 12.80350
# Load the first 5000 lines from every set into the corpus.
# The three samples are concatenated with c() so that each source line is its
# own document. paste() would glue the i-th news, blog and tweet line into one
# mixed document and create n-grams that span unrelated texts.
# head() returns fewer elements (rather than NA padding) for a short source.
sample.size <- 5000
merged <- c(head(news, sample.size),
            head(blogs, sample.size),
            head(twitter, sample.size))
corpus <- VCorpus(VectorSource(merged))

# Remove large objects to clean up memory
rm(blogs.words, news.words, twitter.words, blogs, news, twitter)
Before performing exploratory analysis, we must clean the data first.
This involves removing
(1) URLs,
(2) special characters,
(3) punctuation,
(4) numbers,
(5) excess whitespace,
(6) stopwords, and
(7) changing the text to lower case.
# Clean the corpus. The order of the steps matters:
#   1. URLs are removed while their punctuation is still intact.
#   2. Non-ASCII bytes (curly quotes, emoji, ...) are dropped.
#   3. Text is lower-cased so that the lower-case stopword list also matches
#      capitalised words such as "The".
#   4. Stopwords are removed before punctuation, so contractions such as
#      "don't" still match their stopword entry.
#   5. Whitespace is collapsed last, after every step that leaves gaps.
replacePattern <- content_transformer(
  function(x, pattern, replacement) gsub(pattern, replacement, x, perl = TRUE)
)
dropNonAscii <- content_transformer(
  # Treating the input as latin1 makes every byte convertible, so any byte
  # outside ASCII is simply replaced by `sub`, whatever the real encoding is.
  function(x) iconv(x, from = "latin1", to = "ASCII", sub = "")
)

corpus <- tm_map(corpus, replacePattern, "(f|ht)tps?://\\S+|www\\.\\S+", " ")
corpus <- tm_map(corpus, dropNonAscii)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords())
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)

# One row per document, with the cleaned text in column `text`.
corpusDf <- data.frame(
  text = unlist(lapply(corpus, `[[`, "content")),
  stringsAsFactors = FALSE
)
# Count the most frequent n-grams in a set of documents.
#
# corp:  a data frame whose columns hold document text (e.g. `corpusDf`), or a
#        character vector with one document per element.
# grams: n-gram size, used as both `min` and `max` for the tokenizer.
# top:   maximum number of rows to return (default 100, the original size).
#
# Returns a data frame with columns `String` (the n-gram) and `Count`, sorted
# by decreasing count. It has at most `top` rows and never contains NA
# padding when fewer n-grams exist.
findNGrams <- function(corp, grams, top = 100) {
  # Hand the tokenizer a plain character vector, one element per document.
  # A data frame passed directly would be coerced with as.character(), which
  # collapses each column into a single deparsed 'c("...", "...")' string, so
  # n-grams would run across document boundaries.
  if (is.data.frame(corp)) {
    docs <- unlist(lapply(corp, as.character), use.names = FALSE)
  } else {
    docs <- as.character(corp)
  }
  docs <- docs[!is.na(docs)]

  ngram <- NGramTokenizer(docs, Weka_control(min = grams, max = grams,
                                             delimiters = " \\r\\n\\t.,;:\"()?!"))
  if (length(ngram) == 0L) {
    # Nothing to count: return an empty table with the usual column names.
    return(data.frame(String = factor(character()), Count = integer()))
  }

  counts <- data.frame(table(ngram))
  counts <- counts[order(counts$Freq, decreasing = TRUE), ]
  # Keep only the `top` most frequent n-grams, without indexing past the end.
  counts <- counts[seq_len(min(top, nrow(counts))), ]
  colnames(counts) <- c("String", "Count")
  counts
}
# Build the 2-, 3- and 4-gram frequency tables from the cleaned text.
ngram.tables <- lapply(2:4, function(size) findNGrams(corpusDf, size))
TwoGrams <- ngram.tables[[1]]
ThreeGrams <- ngram.tables[[2]]
FourGrams <- ngram.tables[[3]]
rm(ngram.tables)
# Draw the 2-, 3- and 4-gram word clouds side by side.
# library() is used instead of require() so that a missing package stops the
# script here, rather than returning FALSE and failing later in brewer.pal().
library(RColorBrewer)
par(mfrow = c(1, 3))
palette <- brewer.pal(8, "Dark2")

clouds <- list("2-gram cloud" = TwoGrams,
               "3-gram cloud" = ThreeGrams,
               "4-gram cloud" = FourGrams)
for (label in names(clouds)) {
  grams <- clouds[[label]]
  # Column 1 holds the n-gram strings, column 2 their counts.
  wordcloud(grams[, 1], grams[, 2], min.freq = 1,
            random.order = FALSE, ordered.colors = FALSE, colors = palette)
  # Caption placed at the bottom of the current panel.
  text(x = 0.5, y = 0, label)
}
rm(clouds, grams, label)

# Back to a single-panel layout for the plots that follow.
par(mfrow = c(1, 1))
# Bar chart of the most frequent n-grams in one frequency table.
#
# ngrams: data frame with the n-gram strings in column 1 and counts in column 2.
# title:  plot title.
# fill:   bar colour.
# top:    maximum number of bars (default 20).
#
# Only rows that exist are plotted, so a table with fewer than `top` rows does
# not produce NA bars or labels. Nothing is drawn for an empty table.
plotTopNGrams <- function(ngrams, title, fill, top = 20) {
  rows <- seq_len(min(top, nrow(ngrams)))
  if (length(rows) == 0L) {
    return(invisible(NULL))
  }
  barplot(ngrams[rows, 2],
          cex.names = 0.5,
          names.arg = ngrams[rows, 1],
          col = fill,
          main = title,
          las = 2)  # las = 2 draws the labels perpendicular to the axis
}

plotTopNGrams(TwoGrams, "2-Grams", "red")
plotTopNGrams(ThreeGrams, "3-Grams", "green")
plotTopNGrams(FourGrams, "4-Grams", "blue")