Introduction

The capstone project for the Coursera Data Science Specialization is related to text data mining: guessing or suggesting the next word that will follow a previous sequence of words. This milestone report covers the exploratory data analysis part of the project.

The files used to create the dictionary and build the word prediction come from HC Corpora (www.corpora.heliohost.org) and were downloaded from the following link: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip.

The zip file contains files in multiple languages, but for the project I chose the English version. For each language there are three files with data from different sources, styles and formats.

Data load and summary

Load the files in R and compute some basic statistics about the files and their content.

To automate the file statistics summary I wrote a short R script that loads all the files from the current directory, deletes the non-printable characters, and then calculates and displays the following summary information for each file: file name, file size, number of lines, length of the longest line, and total number of words.

library(knitr)
library(NLP)
library(tm)
library(tau)
library(slam)
library(RColorBrewer)
library(wordcloud)

# count the words in one line by splitting on spaces
wc_l <- function(x) length(unlist(strsplit(x, " ")))

files_dir <- "./final/"
files_list <- list.files(files_dir)
files_info <- file.info(paste(files_dir, files_list, sep = ""))
files_stats <- data.frame(FileName = character(), FileSize_Bytes = numeric(),
               NoOfLines = numeric(), MaxLineSize = numeric(), TotalWords = numeric(),
               stringsAsFactors = FALSE)
for (file_name in files_list) {
    # open in binary mode and skip embedded nulls so no lines are lost
    file_con <- file(paste(files_dir, file_name, sep = ""), "rb")
    f_lines <- readLines(file_con, encoding = "UTF-8", skipNul = TRUE)
    close(file_con)
    # remove non-printable characters before counting
    f_lines <- gsub("[^[:print:]]", "", f_lines)
    files_stats <- rbind(files_stats, data.frame(
        FileName = file_name,
        FileSize_Bytes = files_info[paste(files_dir, file_name, sep = ""), "size"],
        NoOfLines = length(f_lines),
        MaxLineSize = max(nchar(f_lines)),
        TotalWords = sum(sapply(f_lines, wc_l)),
        stringsAsFactors = FALSE))
}
files_stats
##            FileName FileSize_Bytes NoOfLines MaxLineSize TotalWords
## 1   en_US.blogs.txt      210160014    899288       40832   37332758
## 2    en_US.news.txt      205811889   1010242       11384   34372090
## 3 en_US.twitter.txt      167105338   2360148         140   30353488
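
The file sizes above are in bytes, as reported by file.info(). An optional convenience step (a minimal sketch using the files_stats frame built above) converts them to megabytes for easier reading:

# derived column: file size in megabytes (1 MB = 2^20 bytes)
files_stats$FileSize_MB <- round(files_stats$FileSize_Bytes / 2^20, 1)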

Document corpus creation from sampled data and transformation

Since the full data set is large, and creating the term-document matrix from it would use a lot of computing resources and time, the data is sampled before the corpus is created. The sample size used is 5%.

SampleSize <- 0.05
set.seed(1234)  # fix the random seed so the sampling is reproducible
sample_lines <- character(0)
for (file_name in files_list) {
    file_con <- file(paste(files_dir, file_name, sep = ""), "rb")
    f_lines <- readLines(file_con, encoding = "UTF-8", skipNul = TRUE)
    close(file_con)
    # sample 5% of each file's lines, strip non-printable characters,
    # and collect the samples from all three files
    f_sample_lines <- sample(f_lines, floor(length(f_lines) * SampleSize))
    sample_lines <- c(sample_lines, gsub("[^[:print:]]", "", f_sample_lines))
}
doc_vs <- VectorSource(sample_lines)
docs <- Corpus(doc_vs)
# free the large intermediate objects
rm(f_lines)
rm(f_sample_lines)
rm(sample_lines)
docs <- tm_map(docs, content_transformer(tolower))       # lower-case everything
docs <- tm_map(docs, removeNumbers)                      # drop digits
docs <- tm_map(docs, removePunctuation)                  # drop punctuation
docs <- tm_map(docs, stripWhitespace)                    # collapse repeated spaces
docs <- tm_map(docs, removeWords, stopwords("english"))  # drop common English stopwords
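
To sanity-check the transformations, it helps to peek at the cleaned corpus (a minimal sketch using tm's standard accessors):

# number of documents in the sampled corpus, and the text of the first one,
# to confirm the cleaning steps behaved as expected
length(docs)
as.character(docs[[1]])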

Document analysis using n-grams and graphical presentation

In the document analysis, statistics on the frequencies of words and word combinations (n-grams) are calculated for the provided documents. These word statistics will make it possible to categorize future documents, guess the next word, or implement word corrections.

# unigram term-document matrix, collapsed to overall term frequencies
tdm_1gram <- TermDocumentMatrix(docs)
tdm_1gram_r <- rollup(tdm_1gram, 2, na.rm = TRUE, FUN = sum)  # sum the counts over all documents
tdm_1gram_Freq <- sort(as.matrix(tdm_1gram_r)[, 1], decreasing = TRUE)
tdm_1gram_df <- data.frame(word = names(tdm_1gram_Freq), freq = tdm_1gram_Freq, stringsAsFactors = FALSE)

# bigram tokenizer based on NLP::ngrams
grams2 <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
tdm_2grams <- TermDocumentMatrix(docs, control = list(tokenize = grams2, bounds = list(global = c(1, Inf))))
tdm_2gram_r <- rollup(tdm_2grams, 2, na.rm = TRUE, FUN = sum)
tdm_2gram_Freq <- sort(as.matrix(tdm_2gram_r)[, 1], decreasing = TRUE)
tdm_2gram_df <- data.frame(word = names(tdm_2gram_Freq), freq = tdm_2gram_Freq, stringsAsFactors = FALSE)

# trigram tokenizer
grams3 <- function(x) unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
tdm_3grams <- TermDocumentMatrix(docs, control = list(tokenize = grams3, bounds = list(global = c(1, Inf))))
tdm_3gram_r <- rollup(tdm_3grams, 2, na.rm = TRUE, FUN = sum)
tdm_3gram_Freq <- sort(as.matrix(tdm_3gram_r)[, 1], decreasing = TRUE)
tdm_3gram_df <- data.frame(word = names(tdm_3gram_Freq), freq = tdm_3gram_Freq, stringsAsFactors = FALSE)
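
Before building the prediction model it is also worth checking how many unique words are needed to cover a given share of all word instances in the sample. A minimal sketch, using the sorted 1-gram frequencies computed above:

# cumulative share of all word instances covered by the most frequent words
coverage <- cumsum(tdm_1gram_Freq) / sum(tdm_1gram_Freq)
which(coverage >= 0.5)[1]   # unique words needed for 50% coverage
which(coverage >= 0.9)[1]   # unique words needed for 90% coverage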

Most frequent words and word combinations (n-grams)

print("Top 10 words");
## [1] "Top 10 words"
tdm_1gram_df[1:10,c("freq"),drop=FALSE];
##        freq
## just   7368
## like   6042
## get    5493
## love   5326
## good   5008
## will   4749
## day    4519
## can    4515
## thanks 4416
## dont   4364
print("Top 10 ngrams-2");
## [1] "Top 10 ngrams-2"
tdm_2gram_df[1:10,c("freq"),drop=FALSE];
##                 freq
## cant wait        854
## right now        829
## last night       578
## looking forward  462
## happy birthday   418
## dont know        389
## im going         381
## good morning     365
## feel like        349
## looks like       344
print("Top 10 ngrams-3");
## [1] "Top 10 ngrams-3"
tdm_3gram_df[1:10,c("freq"),drop=FALSE];
##                        freq
## happy mothers day       179
## cant wait see           172
## let us know             118
## happy new year           89
## cinco de mayo            54
## im pretty sure           49
## dont even know           47
## looking forward seeing   47
## cant wait get            42
## cant wait till           39
# horizontal bar plots of the 20 most frequent terms in each n-gram table
barplot(tdm_1gram_df$freq[1:20], cex.names = 0.5, names.arg = tdm_1gram_df$word[1:20], col = "green", horiz = TRUE, main = "Word frequency", las = 2)

barplot(tdm_2gram_df$freq[1:20], cex.names = 0.5, names.arg = tdm_2gram_df$word[1:20], col = "blue", horiz = TRUE, main = "2-gram frequency", las = 2)

barplot(tdm_3gram_df$freq[1:20], cex.names = 0.5, names.arg = tdm_3gram_df$word[1:20], col = "red", horiz = TRUE, main = "3-gram frequency", las = 2)

Word cloud plots for 1-, 2- and 3-grams

wc_col <- brewer.pal(8, "Dark2")
# word clouds of the 100 most frequent terms; the word and frequency
# vectors passed to wordcloud() must have the same length
wordcloud(tdm_1gram_df$word[1:100], tdm_1gram_df$freq[1:100], random.order = FALSE, ordered.colors = FALSE, colors = wc_col)
text(x = 0.5, y = 0, "Word cloud for 1-grams")

wordcloud(tdm_2gram_df$word[1:100], tdm_2gram_df$freq[1:100], scale = c(3, .5), random.order = FALSE, ordered.colors = FALSE, colors = wc_col)
text(x = 0.5, y = 0, "Word cloud for 2-grams")

wordcloud(tdm_3gram_df$word[1:100], tdm_3gram_df$freq[1:100], scale = c(3, .5), random.order = FALSE, ordered.colors = FALSE, colors = wc_col)
text(x = 0.5, y = 0, "Word cloud for 3-grams")
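
As a preview of the prediction step, the 3-gram table already supports a naive next-word lookup: given the last two words typed, return the most frequent 3-grams that start with them. The helper below (predict_next is a hypothetical name, not part of the project code) is a minimal sketch assuming the tdm_3gram_df frame built above:

# naive next-word lookup from the trigram table (illustrative only;
# no backoff to 2-grams/1-grams and no smoothing yet)
predict_next <- function(prefix, n = 3) {
    hits <- tdm_3gram_df[grepl(paste0("^", prefix, " "), tdm_3gram_df$word), ]
    # the table is sorted by frequency, so the top rows are the best guesses;
    # keep only the third word of each matching 3-gram
    head(sapply(strsplit(hits$word, " "), `[`, 3), n)
}
predict_next("cant wait")  # given the counts above: "see", "get", "till"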

Conclusions and next steps

Based on the data exploration and analysis above, the following steps will be implemented for the next-word prediction project, and some parts will require further analysis: