Synopsis

This project is a collaboration between Coursera and SwiftKey and aims to create a Shiny app that predicts, in real time, the next English word given a short context. This DS Capstone Project milestone report presents some basic exploratory analysis of the given data, which is provided in .txt file format.

Contents

  1. Data Loading into the RStudio Environment.
  2. Extraction of General Info about the Original Text Files.
  3. Selection and Saving of a Random Representative Sample of the Data.
  4. Data Cleaning & Preprocessing.
  5. Computation of Term Document Matrices for N-grams.
  6. Exploratory Analysis: Plotting of the Most Frequent N-grams in the Given English Corpus.
  7. Calculation of General Corpus Coverage by Unique Words.
  8. Some Thoughts and Plans about the Way Ahead…

Getting, Sampling and Cleaning Data

1. Data Loading into the RStudio Environment

# Load the raw data (text files) into the workspace:
con1 <- file("final/en_US/complete/en_US.blogs.txt");
con2 <- file("final/en_US/complete/en_US.news.txt");
con3 <- file("final/en_US/complete/en_US.twitter.txt");

  blogs_dat <- readLines(con1, n=-1, warn=TRUE, skipNul = TRUE, encoding="UTF-8");
   news_dat <- readLines(con2, n=-1, warn=TRUE, skipNul = TRUE, encoding="UTF-8");
twitter_dat <- readLines(con3, n=-1, warn=TRUE, skipNul = TRUE, encoding="UTF-8");

close(con1);
close(con2);
close(con3);
remove(con1,con2,con3);
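
Since these three character vectors are fairly large, a quick look at their in-memory footprint helps motivate the sampling step in section 3 (an optional sketch; object.size reports the approximate size of an R object):

# Optional check (illustrative): approximate in-memory size of each loaded vector.
sapply(list(blogs = blogs_dat, news = news_dat, twitter = twitter_dat),
       function(x) format(object.size(x), units = "MB"));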

2. Extraction of General Info about the Original Text Files.

library(stringi);

blogs_word_count   <- stri_count_words(blogs_dat);
news_word_count    <- stri_count_words(news_dat);
twitter_word_count <- stri_count_words(twitter_dat);

size_blogs   <- file.info("final/en_US/complete/en_US.blogs.txt")$size/1024^2
size_news    <- file.info("final/en_US/complete/en_US.news.txt")$size/1024^2
size_twitter <- file.info("final/en_US/complete/en_US.twitter.txt")$size/1024^2

generic_file_stats <- data.frame(filename = c("blogs","news","twitter"),
                      file_size_in_MB = c(size_blogs, size_news, size_twitter),
                      line_count = c(length(blogs_dat),length(news_dat),length(twitter_dat)),
                      word_count = c(sum(blogs_word_count),sum(news_word_count),sum(twitter_word_count)),
                      mean_num_words = c(mean(blogs_word_count),mean(news_word_count),mean(twitter_word_count)));

generic_file_stats
##   filename file_size_in_MB line_count word_count mean_num_words
## 1    blogs        200.4242     899288   37546246       41.75108
## 2     news        196.2775    1010242   34762395       34.40997
## 3  twitter        159.3641    2360148   30093410       12.75065
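
One further figure that could be added to this summary (an optional sketch using stringi; not part of the stats above) is the length, in characters, of the longest line in each file:

# Optional check (illustrative): number of characters in the longest line of each file.
sapply(list(blogs = blogs_dat, news = news_dat, twitter = twitter_dat),
       function(x) max(stri_length(x)));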

3. Selection and Saving of a Random Representative Sample of the Data.

set.seed(133);
# We start by taking a random perc_sel sample of the lines in each text data file.
perc_sel = 0.01; # i.e. randomly select 1% of the given data.
blogs_dat_red <- blogs_dat[sample(1:length(blogs_dat), round(perc_sel*length(blogs_dat)),replace = FALSE)];
remove(blogs_dat);

news_dat_red <- news_dat[sample(1:length(news_dat), round(perc_sel*length(news_dat)),replace = FALSE)];
remove(news_dat);

twitter_dat_red <- twitter_dat[sample(1:length(twitter_dat), round(perc_sel*length(twitter_dat)),replace = FALSE)];
remove(twitter_dat);

# Save Reduced Size Text Data in .txt format:
writeLines(  blogs_dat_red, con = "final/en_US/reduced_tmp/blogs_dat_red2.txt"  , sep = "\n");
writeLines(   news_dat_red, con = "final/en_US/reduced_tmp/news_dat_red2.txt"   , sep = "\n");
writeLines(twitter_dat_red, con = "final/en_US/reduced_tmp/twitter_dat_red2.txt", sep = "\n");
remove(blogs_dat_red, news_dat_red, twitter_dat_red);
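
As a quick sanity check (optional; file paths as written above), the reduced files can be re-read to confirm that each holds roughly perc_sel of the original line counts reported in generic_file_stats:

# Optional check (illustrative): line counts of the saved reduced-size files.
sapply(c("blogs_dat_red2.txt", "news_dat_red2.txt", "twitter_dat_red2.txt"),
       function(f) length(readLines(file.path("final/en_US/reduced_tmp", f),
                                    skipNul = TRUE, encoding = "UTF-8")));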

4. Data Cleaning & Preprocessing

library(tm)
library(RWeka)
library(ggplot2)

# Create a Corpus with the reduced size text documents.
eng_text <- Corpus(DirSource("final/en_US/reduced_tmp"),
                   readerControl = list(reader = readPlain, language = "en", load = TRUE));

# Utility Functions
Pattern2Space <- content_transformer(function(x, pattern) gsub(pattern, " ", x));
Pattern2Words <- content_transformer(function(x, old_pattern, new_pattern) gsub(old_pattern, new_pattern, x));
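# Note: Pattern2Words is defined here for completeness and is not applied in this
# report; an illustrative (commented-out) use would be mapping a pattern to a word,
# e.g.: eng_text <- tm_map(eng_text, Pattern2Words, " u ", " you ");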

# Conversion to lower case by:
eng_text <- tm_map(eng_text, content_transformer(tolower));

# Removal of punctuation marks by:
eng_text <- tm_map(eng_text, content_transformer(removePunctuation));

# Removal of Numbers by:
eng_text <- tm_map(eng_text, content_transformer(removeNumbers));

# Remove Special Characters and Symbols:
eng_text <- tm_map(eng_text, Pattern2Space, "~|`|#|@|“|”|‘|’|„|…|–+|&|%|_+|•|¦|_|>|<|\\+|\\$|—+|―+|†|‡|€");

# Remove any remaining non-alphabetic characters (keep only runs of English letters):
eng_text <- tm_map(eng_text, Pattern2Space, "[^A-Za-z]+");

# Removing Extra Whitespaces
eng_text <- tm_map(eng_text, stripWhitespace);

# Profanity Filtering:
con4 <- file("badwords.txt");
bad_mouth <- readLines(con4, n=-1, warn=TRUE, skipNul = TRUE, encoding="UTF-8");
close(con4); remove(con4);

# Removal of bad words:
eng_text <- tm_map(eng_text, removeWords, bad_mouth);
remove(bad_mouth);

Utility Function definitions:

# The following function returns a data frame with the frequency counts 
# of each term given the Term Document Matrix.
tdm2freqframe <- function(tdm){
   freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE);
   freqframe <- data.frame(word=names(freq), freq=freq);
   return(freqframe);
}

# Create Tokenizer functions:
   BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2));
  TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3));
 QuadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=4, max=4));
QuintgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=5, max=5));
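
As a quick illustration of how these tokenizers behave (example sentence chosen for illustration), NGramTokenizer splits a character string into overlapping word n-grams:

# Example (illustrative): split a short sentence into overlapping bigrams;
# this should return the six consecutive word pairs of the sentence.
BigramTokenizer("this is just a short test sentence");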

5. Computation of Term Document Matrices for N-grams.

Term_Doc_Mat_uni1 <- TermDocumentMatrix(eng_text, control=list(wordLengths=c(1,Inf)));
 Term_Doc_Mat_uni <- removeSparseTerms(Term_Doc_Mat_uni1, 0.9);
   Freq_Frame_uni <- tdm2freqframe(Term_Doc_Mat_uni);
remove(Term_Doc_Mat_uni,Term_Doc_Mat_uni1);   

Term_Doc_Mat_bi1 <- TermDocumentMatrix(eng_text, control=list(tokenize=BigramTokenizer))
 Term_Doc_Mat_bi <- removeSparseTerms(Term_Doc_Mat_bi1, 0.9);
   Freq_Frame_bi <- tdm2freqframe(Term_Doc_Mat_bi);
remove(Term_Doc_Mat_bi,Term_Doc_Mat_bi1);

Term_Doc_Mat_tri1 <- TermDocumentMatrix(eng_text, control=list(tokenize=TrigramTokenizer))
 Term_Doc_Mat_tri <- removeSparseTerms(Term_Doc_Mat_tri1, 0.9);
   Freq_Frame_tri <- tdm2freqframe(Term_Doc_Mat_tri);
remove(Term_Doc_Mat_tri,Term_Doc_Mat_tri1);

Term_Doc_Mat_quad1 <- TermDocumentMatrix(eng_text, control=list(tokenize=QuadgramTokenizer))
 Term_Doc_Mat_quad <- removeSparseTerms(Term_Doc_Mat_quad1, 0.9);
   Freq_Frame_quad <- tdm2freqframe(Term_Doc_Mat_quad);
remove(Term_Doc_Mat_quad,Term_Doc_Mat_quad1);

Term_Doc_Mat_quint1 <- TermDocumentMatrix(eng_text, control=list(tokenize=QuintgramTokenizer))
 Term_Doc_Mat_quint <- removeSparseTerms(Term_Doc_Mat_quint1, 0.9);
   Freq_Frame_quint <- tdm2freqframe(Term_Doc_Mat_quint);
remove(Term_Doc_Mat_quint,Term_Doc_Mat_quint1);
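
Since these frequency frames are the main input for the prediction models discussed in section 8, it may be worth persisting them for later reuse (a sketch; the output directory is an assumption and not part of the original workflow):

# Sketch (assumed path): save the n-gram frequency frames for reuse by the app.
dir.create("final/en_US/ngram_freqs", showWarnings = FALSE);
saveRDS(Freq_Frame_uni,   "final/en_US/ngram_freqs/freq_uni.rds");
saveRDS(Freq_Frame_bi,    "final/en_US/ngram_freqs/freq_bi.rds");
saveRDS(Freq_Frame_tri,   "final/en_US/ngram_freqs/freq_tri.rds");
saveRDS(Freq_Frame_quad,  "final/en_US/ngram_freqs/freq_quad.rds");
saveRDS(Freq_Frame_quint, "final/en_US/ngram_freqs/freq_quint.rds");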

6. Exploratory Analysis: Plotting of the Most Frequent N-grams in the Given English Corpus.

par(mfrow=c(1,2), mar = c(5, 3, 2, 1), cex.axis = 0.7, cex.lab = 0.85, cex.main = 0.9, las = 1)

barplot(Freq_Frame_uni$freq[1:20],names.arg = Freq_Frame_uni$word[1:20],col="cyan",
        main = "Top 20 Most frequent Unigrams in Corpus", 
        xlab = "Frequency", horiz = T);

barplot(Freq_Frame_bi$freq[1:20],names.arg = Freq_Frame_bi$word[1:20],col="magenta",
        main = "Top 20 Most frequent Bigrams in Corpus", 
        xlab = "Frequency", horiz = T);

par(mfrow=c(1,2), mar = c(5, 5, 2, 1), cex.axis = 0.7, cex.lab = 0.85, cex.main = 0.9, las = 1)

barplot(Freq_Frame_tri$freq[1:20],names.arg = Freq_Frame_tri$word[1:20],col="green",
        main = "Top 20 Most frequent Trigrams in Corpus", 
        xlab = "Frequency", horiz = T);

barplot(Freq_Frame_quad$freq[1:20],names.arg = Freq_Frame_quad$word[1:20],col="red",
        main = "Top 20 Most frequent Quadgrams in Corpus", 
        xlab = "Frequency", horiz = T);

7. Calculation of General Corpus Coverage by Unique Words.

# i.e. how many unique words from a frequency-sorted dictionary are required to
# cover a given percentage of all word instances in the corpus:
total_word_instances = sum(Freq_Frame_uni$freq);
percentage = seq(0.025,0.975,by=0.025);
NumberOfUniqueWords = vector(mode = "numeric", length(percentage));

for (k in 1:length(percentage)){
    # Start with the single most frequent word and add words until the cumulative
    # count reaches the target percentage of all word instances.
    n = 1;
    while (sum(Freq_Frame_uni$freq[1:n]) < percentage[k]*total_word_instances){
          n = n+1;
    }
    NumberOfUniqueWords[k] = n;
}
  
qplot(NumberOfUniqueWords,100*percentage,xlab="Number of Unique Words" 
                         ,ylab="% of Corpus Covered",main="Corpus Language Coverage");
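
The same coverage curve can also be computed without the explicit loop (an equivalent, vectorized sketch using cumsum; it should reproduce NumberOfUniqueWords):

# Vectorized alternative (illustrative): cumulative coverage of the sorted unigrams.
coverage_fraction <- cumsum(Freq_Frame_uni$freq) / total_word_instances;
NumberOfUniqueWords_vec <- sapply(percentage, function(p) which(coverage_fraction >= p)[1]);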

8. Some Thoughts and Plans about the Way Ahead…

  • I plan to create a Shiny app that predicts the next word using the following methods (a minimal sketch of the MLE baseline follows this list):
    • Maximum Likelihood Estimation (MLE)
    • Katz Back-off Smoothing
    • Kneser-Ney Interpolated Smoothing
  • If the code can be optimized to run in real time with acceptable computing resources, the app will display each model’s top 5 predicted next words in descending order of probability.

  • Also, I plan to develop a testing procedure for each model in order to measure its accuracy.
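
As a first illustration of the MLE baseline mentioned above, the sketch below ranks candidate next words for a single context word by their bigram counts in Freq_Frame_bi from section 5. predict_next_mle is a hypothetical helper written for illustration only and is not part of the final app:

# Minimal MLE sketch (illustrative): P(w2 | w1) is estimated as the bigram count
# of "w1 w2" divided by the total count of bigrams starting with w1.
predict_next_mle <- function(context_word, bigram_freqs, top_n = 5){
    # keep bigrams whose first token equals the context word
    matches <- bigram_freqs[grepl(paste0("^", context_word, " "), bigram_freqs$word), ];
    if (nrow(matches) == 0) return(matches);
    matches$prob <- matches$freq / sum(matches$freq);
    # drop the context word and keep the predicted next word with its MLE probability
    head(data.frame(next_word = sub("^\\S+\\s+", "", matches$word),
                    prob      = matches$prob), top_n);
}

# Example call (context word chosen arbitrarily): top 5 MLE predictions after "of".
predict_next_mle("of", Freq_Frame_bi);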