The purpose of this Milestone Report is to explore the data for the capstone project of the Coursera Data Science Specialization. The goal of the capstone project is to apply the knowledge acquired throughout the nine courses: based on a large corpus of text documents, we will predict the next word given the preceding input. This report dives into the data and cleans it:
if (!require(tidyverse)) install.packages("tidyverse"); library(tidyverse)
if (!require(downloader)) install.packages("downloader"); library(downloader)
if (!require(tm)) install.packages("tm"); library(tm)
if (!require(stringi)) install.packages("stringi"); library(stringi)
if (!require(NLP)) install.packages("NLP"); library(NLP)
if (!require(openNLP)) install.packages("openNLP"); library(openNLP)
if (!require(RWeka)) install.packages("RWeka"); library(RWeka)
if (!require(RWekajars)) install.packages("RWekajars"); library(RWekajars)
if (!require(SnowballC)) install.packages("SnowballC"); library(SnowballC)
if (!require(qdap)) install.packages("qdap"); library(qdap)
if (!require(qdapDictionaries)) install.packages("qdapDictionaries"); library(qdapDictionaries)
if (!require(wordcloud)) install.packages("wordcloud"); library(wordcloud)
if (!require(RColorBrewer)) install.packages("RColorBrewer"); library(RColorBrewer)
if (!require(quanteda)) install.packages("quanteda"); library(quanteda)
## Downloading the data and unzipping the folder
## Check whether the data directory already exists
if (!file.exists("./projectData")) {
  dir.create("./projectData")
}
Url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
## Check whether the zip has already been downloaded into the projectData directory
if (!file.exists("./projectData/Coursera-SwiftKey.zip")) {
  download.file(Url, destfile = "./projectData/Coursera-SwiftKey.zip", mode = "wb")
}
## Check whether the zip has already been unzipped
if (!file.exists("./projectData/final")) {
  unzip(zipfile = "./projectData/Coursera-SwiftKey.zip", exdir = "./projectData")
}
#We'll focus on the English-language files
path_english <- file.path("./projectData/final", "en_US")
list.files(path_english)
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
#There are three files (blogs, news and twitter).
#Now that the data is downloaded, let's read it in with readLines() (line by line),
#setting up the file connections and opening them in binary mode
blogsLocation <- file(paste0(path_english, "/en_US.blogs.txt"), open = "rb")
#reading the data
blogs <- readLines(blogsLocation, encoding = "UTF-8", skipNul = TRUE)
close(blogsLocation)
#Same steps for the news and twitter data
newsLocation <- file(paste0(path_english, "/en_US.news.txt"), open = "rb")
news <- readLines(newsLocation, encoding = "UTF-8", skipNul = TRUE)
close(newsLocation)
twitterLocation <- file(paste0(path_english, "/en_US.twitter.txt"), open = "rb") # open for reading in binary mode
twitter <- readLines(twitterLocation, encoding = "UTF-8", skipNul = TRUE)
close(twitterLocation)
There are three files:

* en_US.blogs.txt
* en_US.news.txt
* en_US.twitter.txt

Let’s see what they look like.
#Before sampling, let's take a first look at these 3 files.
## Number of lines
lB<- length(blogs)
lN <- length(news)
lT <- length(twitter)
## Counting the Words
sB<- sum(stri_count_words(blogs))
sN<- sum(stri_count_words(news))
sT <- sum(stri_count_words(twitter))
# mean number of words per line
mB<- mean(stri_count_words(blogs))
mN<- mean(stri_count_words(news))
mT <- mean(stri_count_words(twitter))
## Size of the files in memory (MB)
sizeBlogs <- object.size(blogs) / 1024^2
sizeNews <- object.size(news) / 1024^2
sizeTwitter <- object.size(twitter) / 1024^2
summaryDF <- data.frame(dataSource = c("blogs", "news", "twitter"),
                        Lines = c(lB, lN, lT),
                        NrWords = c(sB, sN, sT),
                        MeanNrWords = c(mB, mN, mT),
                        Size = c(sizeBlogs, sizeNews, sizeTwitter))
summaryDF
## dataSource Lines NrWords MeanNrWords Size
## 1 blogs 899288 37546246 41.75108 255.3545
## 2 news 1010242 34762395 34.40997 257.3404
## 3 twitter 2360148 30093410 12.75065 318.9897
As shown above, the files are quite large. NLP processing is resource-intensive, and with the limited memory of a single computer most of these steps would not run on files this big. We will therefore sample the data to keep the processing manageable.
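As a side note, instead of drawing a fixed number of lines per source (as done below), one could sample a fixed proportion of each file. Here is a minimal sketch of that alternative; the 1% rate and the object names are purely illustrative, and this approach is not used in the rest of the report:
#Illustrative alternative only: keep roughly 1% of each source instead of a fixed line count
sampleRate <- 0.01
set.seed(1604)
blogsSubset <- blogs[as.logical(rbinom(length(blogs), 1, sampleRate))]
newsSubset <- news[as.logical(rbinom(length(news), 1, sampleRate))]
twitterSubset <- twitter[as.logical(rbinom(length(twitter), 1, sampleRate))]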
## Sampling the data and creating a unified file. To reduce the size, and because NLP algorithms are compute-intensive and our memory is limited, let's sample the three sources
#Since we are sampling (and there is some randomness in sampling) we set a seed to enable reproducibility
set.seed(1604)
sTwitter <- sample(twitter, size = 5000, replace = TRUE)
sBlogs <- sample(blogs, size = 5000,replace = TRUE)
sNews <- sample(news, size = 5000, replace = TRUE)
sampleTotal <- c(sTwitter, sBlogs, sNews)
length(sampleTotal)
## [1] 15000
#Let's write it into a dedicated folder so that we can easily reuse it later without having to re-run the whole process
writeLines(sampleTotal, "./projectData/dataFinalSample.txt")
Before tokenizing or analysing the corpus in depth, we need to clean it: remove URLs, e-mail addresses, numbers, punctuation, and stopwords such as "the". Once the corpus is cleaned, we transform it into a term-document matrix to enable further analysis and visualisations such as a word cloud.
#Let's clean the data. To do it we're going to use the tm package, which
#has several built-in functions that make it easy to clean text
fileCon <- file("./projectData/dataFinalSample.txt")
text <- readLines(fileCon)
close(fileCon)
corpus <- VCorpus(VectorSource(text))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# Removing URLs and e-mail/@ handles
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
# Converting to Lower
corpus <- tm_map(corpus, content_transformer(tolower))
# Removing English stopwords (the, a, and, ...)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Removing numbers
corpus <- tm_map(corpus, content_transformer(removeNumbers))
# WhiteSpace
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, content_transformer(removePunctuation), preserve_intra_word_dashes=TRUE)
# Twitter tags
tt <- function(x) gsub("\\bRT\\b|\\bvia\\b", "", x)
corpus<- tm_map(corpus, content_transformer(tt))
# Twitter Usernames
tun <- function(x) gsub("[@][a-zA-Z0-9_]{1,15}", "", x)
corpus<- tm_map(corpus, content_transformer(tun))
# Text stemming
corpus <- tm_map(corpus, stemDocument)
# Transforming into a term-document matrix to run a first basic analysis
tdm <- TermDocumentMatrix(corpus)
#Let's see the 5 most frequent words
v <- sort(rowSums(as.matrix(tdm)),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
top5 <- head(d, 5)
g1 <- ggplot(data = top5, aes(x = word, y = freq)) +
  geom_bar(stat = "identity") + coord_flip() + ggtitle("Top 5 words")
g1
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
Let’s save the cleaned corpus.
#Let's convert the corpus to plain text documents and save it for later use
corpus <- tm_map(corpus, PlainTextDocument)
#Let's save it
saveRDS(corpus, file = "./projectData/cleanedCorpus.RData")
The next step consists of reading the text, breaking it into words and sentences, and then generating the n-grams.
What are tokenization and n-grams? Tokenization is the process by which the text is broken into units of meaning, the tokens. In the fields of computational linguistics and probability, an n-gram is a contiguous sequence of n items from a given sample of text or speech (source: Wikipedia). Hence bi-grams are combinations of 2 words, tri-grams combinations of 3 words, and so on.
To perform the tokenization and generate the n-grams we will use the **quanteda** package, relying on its built-in functions to generate the bi-grams, tri-grams and quad-grams.
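To make the idea concrete, here is a minimal toy sketch (the sentence and object name are illustrative only) using quanteda's tokens() and tokens_ngrams():
#Toy illustration: bi-grams of a single short sentence
toyTokens <- quanteda::tokens("thanks for the follow")
quanteda::tokens_ngrams(toyTokens, n = 2, concatenator = " ")
#-> yields the bi-grams "thanks for", "for the", "the follow"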
cleanCorpus<-readRDS("./projectData/cleanedCorpus.RData")
#Let's extract the text content and convert it to a quanteda corpus to enable further processing
fcorpus <- data.frame(text = unlist(sapply(cleanCorpus, `[`, "content")), stringsAsFactors = FALSE)
corpusq<-quanteda::corpus(fcorpus)
#Let's generate the bi-grams, re-applying the removal of some terms to ensure that our corpus is clean
bigram<-quanteda::tokens(x=corpusq, what =c("word"), remove_numbers = TRUE,
remove_punct = TRUE, remove_symbols = TRUE, remove_separators = TRUE,
remove_twitter = TRUE, remove_hyphens = TRUE, remove_url = TRUE,
ngrams = 2, concatenator = " ")
#Generating the sparse document feature matrix from the tokens
bigFreq<-dfm(bigram)
#From the dfm, create a word-frequency data frame and sort it
bigFreq <- data.frame(words = featnames(bigFreq), freq = colSums(bigFreq),
                      row.names = NULL, stringsAsFactors = FALSE)
bigFreq<- bigFreq[order(bigFreq$freq,decreasing = TRUE),]
#Plot the top 20 most frequent bigrams
plotBig <- ggplot(bigFreq[1:20,], aes(reorder(words, -freq), freq)) +
labs(x = "bi-gram", y = "Frequency") +
theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
geom_bar(stat = "identity")
#Let's generate the tri-grams, re-applying the removal of some terms to ensure that our corpus is clean
trigram<-quanteda::tokens(x=corpusq, what =c("word"), remove_numbers = TRUE,
remove_punct = TRUE, remove_symbols = TRUE, remove_separators = TRUE,
remove_twitter = TRUE, remove_hyphens = TRUE, remove_url = TRUE,
ngrams = 3, concatenator = " ")
#Generating the sparse document feature matrix from the tokens
trigFreq<-dfm(trigram)
#From the dfm, create a word-frequency data frame and sort it
trigFreq <- data.frame(words = featnames(trigFreq), freq= colSums(trigFreq),
row.names = NULL, stringsAsFactors = FALSE)
trigFreq<- trigFreq[order(trigFreq$freq,decreasing = TRUE),]
#Plot the top 20 most frequent trigrams
plotTrig <- ggplot(trigFreq[1:20,], aes(reorder(words, -freq), freq)) +
labs(x = "tri-gram", y = "Frequency") +
theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
geom_bar(stat = "identity")
#Let's generate the quad-grams, re-applying the removal of some terms to ensure that our corpus is clean
quadram<-quanteda::tokens(x=corpusq, what =c("word"), remove_numbers = TRUE,
remove_punct = TRUE, remove_symbols = TRUE, remove_separators = TRUE,
remove_twitter = TRUE, remove_hyphens = TRUE, remove_url = TRUE,
ngrams = 4, concatenator = " ")
#Generating the sparse document feature matrix from the tokens
quadFreq<-dfm(quadram)
#From the dfm, create a word-frequency data frame and sort it
quadFreq <- data.frame(words = featnames(quadFreq), freq = colSums(quadFreq),
                       row.names = NULL, stringsAsFactors = FALSE)
quadFreq<- quadFreq[order(quadFreq$freq,decreasing = TRUE),]
#Plot the top 20 most frequent quadgrams
plotQuad <- ggplot(quadFreq[1:20,], aes(reorder(words, -freq), freq)) +
labs(x = "quad-gram", y = "Frequency") +
theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
geom_bar(stat = "identity")
#enabling multiplot with ggplot
source("multiplotFunction.R")
#visualizing the results
multiplot(plotBig,plotTrig,plotQuad,cols=2)
#Let's save the 3 n-gram frequency tables
saveRDS(bigFreq,"bigram.RData")
saveRDS(trigFreq,"trigram.RData")
saveRDS(quadFreq,"quadgram.RData")
The different steps, from getting and cleaning the data to creating the n-grams, are highly resource-consuming, as is typical of NLP work. That is why we decided to sample the different data sources, to enable smooth processing of the text data.
There is still some room to improve the quality of the cleaning and tokenization, which will be critical for the accuracy of the prediction.
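As one possible refinement, sketched here only as an assumption and not applied above, the sampled text could be stripped of non-ASCII characters before building the corpus, so that emoji and mis-encoded symbols do not end up as tokens:
#Possible additional cleaning step (illustrative only): drop non-ASCII characters
sampleTotal <- iconv(sampleTotal, from = "UTF-8", to = "ASCII", sub = "")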