Introduction

The capstone project for the Coursera Data Science Specialization is related to text data mining: guessing or suggesting the next word that will follow a previous sequence of words. This milestone report covers the exploratory data analysis part of the project.

The files used to create the dictionary and build the word prediction come from HC Corpora (www.corpora.heliohost.org) and were downloaded from the following link: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip.

The zip file contains files in multiple languages, but for the project I chose the English version. For each language there are three files with data from different sources, styles and formats.

Data load and summary

Load the files in R and compute some basic statistics about the files and their content.

To automate the file statistics summary I wrote a short R script that loads all the files from the current directory, deletes the non-printable characters, and then calculates and displays the following summary information for each file: file name, file size, number of lines, length of the longest line, and total number of words.

library(knitr)
library(NLP)
library(tm)
library(tau)
library(slam)
library(RColorBrewer)
library(wordcloud)

# count the words in one line by splitting on spaces
wc_l <- function(x) length(unlist(strsplit(x, " ")))

files_dir <- "./final/"
files_list <- list.files(files_dir)
files_info <- file.info(paste(files_dir, files_list, sep = ""))
files_stats <- data.frame(FileName = character(), FileSize_Bytes = numeric(),
               NoOfLines = numeric(), MaxLineSize = numeric(), TotalWords = numeric(),
               stringsAsFactors = FALSE)
for (file_name in files_list) {
    # open in binary mode and skip embedded nulls so no lines are lost
    file_con <- file(paste(files_dir, file_name, sep = ""), "rb")
    f_lines <- readLines(file_con, encoding = "UTF-8", skipNul = TRUE)
    close(file_con)
    # remove non-printable characters before counting
    f_lines <- gsub("[^[:print:]]", "", f_lines)
    files_stats <- rbind(files_stats, data.frame(
        FileName = file_name,
        FileSize_Bytes = files_info[paste(files_dir, file_name, sep = ""), "size"],
        NoOfLines = length(f_lines),
        MaxLineSize = max(nchar(f_lines)),
        TotalWords = sum(sapply(f_lines, wc_l)),
        stringsAsFactors = FALSE))
}
files_stats
##            FileName FileSize_Bytes NoOfLines MaxLineSize TotalWords
## 1   en_US.blogs.txt      210160014    899288       40832   37332758
## 2    en_US.news.txt      205811889   1010242       11384   34372090
## 3 en_US.twitter.txt      167105338   2360148         140   30353488
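
The file sizes above are in bytes, as reported by file.info(). An optional convenience step (a minimal sketch using the files_stats frame built above) converts them to megabytes for easier reading:

# derived column: file size in megabytes (1 MB = 2^20 bytes)
files_stats$FileSize_MB <- round(files_stats$FileSize_Bytes / 2^20, 1)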

Document corpus creation from sampled data and transformation

Since the full data set is large, and creating the term-document matrix from it would use a lot of computing resources and time, the data is sampled before the corpus is created. The sample size used is 5%.

SampleSize <- 0.05
set.seed(1234)  # fix the random seed so the sampling is reproducible
sample_lines <- character(0)
for (file_name in files_list) {
    file_con <- file(paste(files_dir, file_name, sep = ""), "rb")
    f_lines <- readLines(file_con, encoding = "UTF-8", skipNul = TRUE)
    close(file_con)
    # sample 5% of each file's lines, strip non-printable characters,
    # and collect the samples from all three files
    f_sample_lines <- sample(f_lines, floor(length(f_lines) * SampleSize))
    sample_lines <- c(sample_lines, gsub("[^[:print:]]", "", f_sample_lines))
}
doc_vs <- VectorSource(sample_lines)
docs <- Corpus(doc_vs)
# free the large intermediate objects
rm(f_lines)
rm(f_sample_lines)
rm(sample_lines)
docs <- tm_map(docs, content_transformer(tolower))       # lower-case everything
docs <- tm_map(docs, removeNumbers)                      # drop digits
docs <- tm_map(docs, removePunctuation)                  # drop punctuation
docs <- tm_map(docs, stripWhitespace)                    # collapse repeated spaces
docs <- tm_map(docs, removeWords, stopwords("english"))  # drop common English stopwords
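
To sanity-check the transformations, it helps to peek at the cleaned corpus (a minimal sketch using tm's standard accessors):

# number of documents in the sampled corpus, and the text of the first one,
# to confirm the cleaning steps behaved as expected
length(docs)
as.character(docs[[1]])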

Document analysis using n-grams and graphical presentation

In the document analysis, statistics on the frequencies of words and word combinations (n-grams) are calculated for the provided documents. These word statistics will make it possible to categorize future documents, guess the next word, or implement word corrections.

# unigram term-document matrix, collapsed to overall term frequencies
tdm_1gram <- TermDocumentMatrix(docs)
tdm_1gram_r <- rollup(tdm_1gram, 2, na.rm = TRUE, FUN = sum)  # sum the counts over all documents
tdm_1gram_Freq <- sort(as.matrix(tdm_1gram_r)[, 1], decreasing = TRUE)
tdm_1gram_df <- data.frame(word = names(tdm_1gram_Freq), freq = tdm_1gram_Freq, stringsAsFactors = FALSE)

# bigram tokenizer based on NLP::ngrams
grams2 <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
tdm_2grams <- TermDocumentMatrix(docs, control = list(tokenize = grams2, bounds = list(global = c(1, Inf))))
tdm_2gram_r <- rollup(tdm_2grams, 2, na.rm = TRUE, FUN = sum)
tdm_2gram_Freq <- sort(as.matrix(tdm_2gram_r)[, 1], decreasing = TRUE)
tdm_2gram_df <- data.frame(word = names(tdm_2gram_Freq), freq = tdm_2gram_Freq, stringsAsFactors = FALSE)

# trigram tokenizer
grams3 <- function(x) unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
tdm_3grams <- TermDocumentMatrix(docs, control = list(tokenize = grams3, bounds = list(global = c(1, Inf))))
tdm_3gram_r <- rollup(tdm_3grams, 2, na.rm = TRUE, FUN = sum)
tdm_3gram_Freq <- sort(as.matrix(tdm_3gram_r)[, 1], decreasing = TRUE)
tdm_3gram_df <- data.frame(word = names(tdm_3gram_Freq), freq = tdm_3gram_Freq, stringsAsFactors = FALSE)
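
Before building the prediction model it is also worth checking how many unique words are needed to cover a given share of all word instances in the sample. A minimal sketch, using the sorted 1-gram frequencies computed above:

# cumulative share of all word instances covered by the most frequent words
coverage <- cumsum(tdm_1gram_Freq) / sum(tdm_1gram_Freq)
which(coverage >= 0.5)[1]   # unique words needed for 50% coverage
which(coverage >= 0.9)[1]   # unique words needed for 90% coverage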

Most frequent words and word combinations (n-grams)

print("Top 10 words");
## [1] "Top 10 words"
tdm_1gram_df[1:10,c("freq"),drop=FALSE];
##        freq
## just   7368
## like   6042
## get    5493
## love   5326
## good   5008
## will   4749
## day    4519
## can    4515
## thanks 4416
## dont   4364
print("Top 10 ngrams-2");
## [1] "Top 10 ngrams-2"
tdm_2gram_df[1:10,c("freq"),drop=FALSE];
##                 freq
## cant wait        854
## right now        829
## last night       578
## looking forward  462
## happy birthday   418
## dont know        389
## im going         381
## good morning     365
## feel like        349
## looks like       344
print("Top 10 ngrams-3");
## [1] "Top 10 ngrams-3"
tdm_3gram_df[1:10,c("freq"),drop=FALSE];
##                        freq
## happy mothers day       179
## cant wait see           172
## let us know             118
## happy new year           89
## cinco de mayo            54
## im pretty sure           49
## dont even know           47
## looking forward seeing   47
## cant wait get            42
## cant wait till           39
# horizontal bar plots of the 20 most frequent terms in each n-gram table
barplot(tdm_1gram_df$freq[1:20], cex.names = 0.5, names.arg = tdm_1gram_df$word[1:20], col = "green", horiz = TRUE, main = "Word frequency", las = 2)

barplot(tdm_2gram_df$freq[1:20], cex.names = 0.5, names.arg = tdm_2gram_df$word[1:20], col = "blue", horiz = TRUE, main = "2-gram frequency", las = 2)

barplot(tdm_3gram_df$freq[1:20], cex.names = 0.5, names.arg = tdm_3gram_df$word[1:20], col = "red", horiz = TRUE, main = "3-gram frequency", las = 2)

Word cloud plots for 1-, 2- and 3-grams

wc_col <- brewer.pal(8, "Dark2")
# word clouds of the 100 most frequent terms; the word and frequency
# vectors passed to wordcloud() must have the same length
wordcloud(tdm_1gram_df$word[1:100], tdm_1gram_df$freq[1:100], random.order = FALSE, ordered.colors = FALSE, colors = wc_col)
text(x = 0.5, y = 0, "Word cloud for 1-grams")

wordcloud(tdm_2gram_df$word[1:100], tdm_2gram_df$freq[1:100], scale = c(3, .5), random.order = FALSE, ordered.colors = FALSE, colors = wc_col)
text(x = 0.5, y = 0, "Word cloud for 2-grams")

wordcloud(tdm_3gram_df$word[1:100], tdm_3gram_df$freq[1:100], scale = c(3, .5), random.order = FALSE, ordered.colors = FALSE, colors = wc_col)
text(x = 0.5, y = 0, "Word cloud for 3-grams")
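
As a preview of the prediction step, the 3-gram table already supports a naive next-word lookup: given the last two words typed, return the most frequent 3-grams that start with them. The helper below (predict_next is a hypothetical name, not part of the project code) is a minimal sketch assuming the tdm_3gram_df frame built above:

# naive next-word lookup from the trigram table (illustrative only;
# no backoff to 2-grams/1-grams and no smoothing yet)
predict_next <- function(prefix, n = 3) {
    hits <- tdm_3gram_df[grepl(paste0("^", prefix, " "), tdm_3gram_df$word), ]
    # the table is sorted by frequency, so the top rows are the best guesses;
    # keep only the third word of each matching 3-gram
    head(sapply(strsplit(hits$word, " "), `[`, 3), n)
}
predict_next("cant wait")  # given the counts above: "see", "get", "till"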

Conclusions and next steps

Based on the data exploration and analysis above, the following steps will be implemented for the next-word prediction project, and some parts will require further analysis: