Maxime Vergès, 13th May 2019
This report is the week 2 milestone report for the Data Science Capstone from the Coursera Data Science Specialization. The challenge is to develop a prediction algorithm that suggests the next word in a sequence of words. This report covers how the data are obtained, cleaned and processed, and presents an exploratory data analysis of some patterns in the text.
We load the data as below:
setwd("C:/Users/maxim/Desktop/Coursera-SwiftKey/final/en_US")
# read the three raw English text files
blogs <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
The packages rJava, knitr, NLP, tm, RWekajars, RWeka, ggplot2, stringi, RColorBrewer, wordcloud, ngram, slam, htmlTable, xtable and dplyr have to be loaded.
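A minimal chunk loading them all (assuming each package is already installed):
library(rJava)
library(knitr)
library(NLP)
library(tm)
library(RWekajars)
library(RWeka)
library(ggplot2)
library(stringi)
library(RColorBrewer)
library(wordcloud)
library(ngram)
library(slam)
library(htmlTable)
library(xtable)
library(dplyr)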
An overview is needed to understand the study, so summary information has been gathered for the three datasets blogs, news and twitter. The table gives, for each dataset, the file name, the size (in MB), the number of lines, the number of characters, and the length of the longest entry.
overview <- data.frame(
  file_name = c("blogs", "news", "twitter"),
  file_size = sapply(list(blogs, news, twitter), function(x) format(object.size(x), "MB")),
  number_of_lines = sapply(list(blogs, news, twitter), length),
  number_of_characters = sapply(list(blogs, news, twitter), function(x) sum(nchar(x))),
  longest_entry = sapply(list(blogs, news, twitter), function(x) max(nchar(x)))
)
kable(overview, caption = "The main datasets")
| file_name | file_size | number_of_lines | number_of_characters | longest_entry |
|---|---|---|---|---|
| blogs | 255.4 Mb | 899288 | 206824505 | 40833 |
| news | 19.8 Mb | 77259 | 15639408 | 5760 |
| twitter | 319 Mb | 2360148 | 162096031 | 140 |
As each dataset is very large, we sample 0.5% of each one to create a corpus. We then clean the corpus by removing non-ASCII characters, punctuation, numbers and redundant white space, by converting all words to lowercase, and by storing the documents in plain text format.
# set a seed so the sampling is reproducible
set.seed(12345)
b_subset <- sample(blogs, length(blogs) * 0.005)
n_subset <- sample(news, length(news) * 0.005)
t_subset <- sample(twitter, length(twitter) * 0.005)
# remove non-ASCII characters
blogs_subset <- iconv(b_subset, "UTF-8", "ASCII", sub="")
news_subset <- iconv(n_subset, "UTF-8", "ASCII", sub="")
twitter_subset <- iconv(t_subset, "UTF-8", "ASCII", sub="")
data_subset <- c(blogs_subset,news_subset,twitter_subset)
# function returning the cleaned corpus
corpus_processing <- function(x) {
  object <- VCorpus(VectorSource(x))
  object <- tm_map(object, content_transformer(tolower)) # keeps documents as PlainTextDocument
  object <- tm_map(object, stripWhitespace)
  object <- tm_map(object, removeNumbers)
  object <- tm_map(object, removePunctuation)
  object
}
corpus <- corpus_processing(data_subset)
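To sanity-check the cleaning, one can print the first document of the corpus (a quick inspection step, not part of the original pipeline):
# show the first cleaned document
writeLines(as.character(corpus[[1]]))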
The RWeka tokenizers, combined with the tm package, make it easy to tokenize the sample and build term-document matrices of unigrams, bigrams, and trigrams.
# one tokenizer per n-gram order
corpus_uni_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
corpus_bi_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
corpus_tri_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
corpus_uni_matrix <- TermDocumentMatrix(corpus, control = list(tokenize = corpus_uni_tokenizer))
corpus_bi_matrix <- TermDocumentMatrix(corpus, control = list(tokenize = corpus_bi_tokenizer))
corpus_tri_matrix <- TermDocumentMatrix(corpus, control = list(tokenize = corpus_tri_tokenizer))
# keep only n-grams appearing at least 10 times
corpus_uni <- findFreqTerms(corpus_uni_matrix, lowfreq = 10)
corpus_bi <- findFreqTerms(corpus_bi_matrix, lowfreq = 10)
corpus_tri <- findFreqTerms(corpus_tri_matrix, lowfreq = 10)
# build a (word, frequency) table for each n-gram order
corpus_uni_frame <- rowSums(as.matrix(corpus_uni_matrix[corpus_uni, ]))
corpus_uni_frame <- data.frame(word = names(corpus_uni_frame), frequency = corpus_uni_frame)
corpus_bi_frame <- rowSums(as.matrix(corpus_bi_matrix[corpus_bi, ]))
corpus_bi_frame <- data.frame(word = names(corpus_bi_frame), frequency = corpus_bi_frame)
corpus_tri_frame <- rowSums(as.matrix(corpus_tri_matrix[corpus_tri, ]))
corpus_tri_frame <- data.frame(word = names(corpus_tri_frame), frequency = corpus_tri_frame)
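Before plotting, a quick peek at the five most frequent trigrams checks that the tables behave as expected (an inspection step added here for illustration):
# top 5 trigrams by frequency
head(corpus_tri_frame[order(-corpus_tri_frame$frequency), ], 5)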
First, we draw three bar charts showing the top 20 unigrams, 2-grams and 3-grams, a useful starting point for the exploratory data analysis.
graph_uni <- ggplot(data = corpus_uni_frame[order(-corpus_uni_frame$frequency),][1:20,], aes(x = reorder(word, -frequency), y = frequency))+
geom_bar(stat="identity", fill = "darkred", colour = "black", width = 1.1) +
labs(x = "unigrams", y = "Frequency", title = "Top 20 of unigrams") +
theme(axis.text.x=element_text(angle=90))
graph_uni
graph_bi <- ggplot(data = corpus_bi_frame[order(-corpus_bi_frame$frequency),][1:20,], aes(x = reorder(word, -frequency), y = frequency))+
geom_bar(stat="identity", fill = "darkred", colour = "black", width = 1.1) +
labs(x = "2-grams", y = "Frequency", title = "Top 20 of 2-grams") +
theme(axis.text.x=element_text(angle=90))
graph_bi
graph_tri <- ggplot(data = corpus_tri_frame[order(-corpus_tri_frame$frequency),][1:20,], aes(x = reorder(word, -frequency), y = frequency))+
geom_bar(stat="identity", fill = "darkred", colour = "black", width = 1.1) +
labs(x = "3-grams", y = "Frequency", title = "Top 20 of 3-grams") +
theme(axis.text.x=element_text(angle=90))
graph_tri
Word clouds give a more visual summary of the same frequency tables.
wordcloud(corpus_uni_frame$word, corpus_uni_frame$frequency, scale = c(3, 1), max.words = 50, random.order = FALSE, rot.per = 0, fixed.asp = TRUE, use.r.layout = FALSE, colors = brewer.pal(12, "Paired"))
wordcloud(corpus_bi_frame$word, corpus_bi_frame$frequency, scale = c(3, 1), max.words = 50, random.order = FALSE, rot.per = 0, fixed.asp = TRUE, use.r.layout = FALSE, colors = brewer.pal(12, "Paired"))
wordcloud(corpus_tri_frame$word, corpus_tri_frame$frequency, scale = c(3, 1), max.words = 50, random.order = FALSE, rot.per = 0, fixed.asp = TRUE, use.r.layout = FALSE, colors = brewer.pal(12, "Paired"))
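To hint at where these frequency tables are headed, here is a toy next-word lookup over the trigram counts. The helper predict_next is hypothetical and added only for illustration; the final algorithm will need smoothing and a back-off strategy on top of it:
# hypothetical helper: return the most frequent word following the pair (w1, w2)
predict_next <- function(w1, w2, tri = corpus_tri_frame) {
  prefix <- paste(w1, w2)
  # keep trigrams whose first two words match the prefix
  hits <- tri[startsWith(as.character(tri$word), paste0(prefix, " ")), ]
  if (nrow(hits) == 0) return(NA_character_)
  best <- as.character(hits$word[which.max(hits$frequency)])
  # the prediction is the last word of the winning trigram
  tail(strsplit(best, " ")[[1]], 1)
}
predict_next("one", "of")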