This project is a joint venture of Coursera and SwiftKey and aims at creating a Shiny app that predicts the next English word from a short context in real time. This Data Science Capstone Project milestone report presents some basic exploratory analysis of the given data, which is provided in .txt file format.
Contents
1. Data Loading into the RStudio Environment
2. Extraction of General Info about the Original Text Files
3. Selection and Saving of a Random Representative Sample of the Data
4. Data Cleaning & Preprocessing
5. Computation of Term Document Matrices for N-grams
6. Exploratory Analysis - Plotting of the Most Frequent N-grams in the Given English Corpus
7. Calculation of General Corpus Coverage by Unique Words
8. Some Thoughts and Plans about the Way Ahead
1. Data Loading into the RStudio Environment
# Load the raw data (text files) into the workspace:
con1 <- file("final/en_US/complete/en_US.blogs.txt");
con2 <- file("final/en_US/complete/en_US.news.txt");
con3 <- file("final/en_US/complete/en_US.twitter.txt");
blogs_dat <- readLines(con1, n=-1, warn=TRUE, skipNul = TRUE, encoding="UTF-8");
news_dat <- readLines(con2, n=-1, warn=TRUE, skipNul = TRUE, encoding="UTF-8");
twitter_dat <- readLines(con3, n=-1, warn=TRUE, skipNul = TRUE, encoding="UTF-8");
close(con1);
close(con2);
close(con3);
remove(con1,con2,con3);
2. Extraction of General Info about the Original Text Files.
library(stringi);
blogs_word_count <- stri_count_words(blogs_dat);
news_word_count <- stri_count_words(news_dat);
twitter_word_count <- stri_count_words(twitter_dat);
size_blogs <- file.info("final/en_US/complete/en_US.blogs.txt")$size/1024^2
size_news <- file.info("final/en_US/complete/en_US.news.txt")$size/1024^2
size_twitter <- file.info("final/en_US/complete/en_US.twitter.txt")$size/1024^2
generic_file_stats <- data.frame(filename = c("blogs","news","twitter"),
file_size_in_MB = c(size_blogs, size_news, size_twitter),
line_count = c(length(blogs_dat),length(news_dat),length(twitter_dat)),
word_count = c(sum(blogs_word_count),sum(news_word_count),sum(twitter_word_count)),
mean_num_words = c(mean(blogs_word_count),mean(news_word_count),mean(twitter_word_count)));
generic_file_stats
## filename file_size_in_MB line_count word_count mean_num_words
## 1 blogs 200.4242 899288 37546246 41.75108
## 2 news 196.2775 1010242 34762395 34.40997
## 3 twitter 159.3641 2360148 30093410 12.75065
3. Selection and Saving of a Random Representative Sample of the Data.
set.seed(133);
# We start by taking a random sample of a perc_sel fraction of each text data file.
perc_sel = 0.01; # i.e. randomly select 1% of the given data.
blogs_dat_red <- blogs_dat[sample(1:length(blogs_dat), round(perc_sel*length(blogs_dat)),replace = FALSE)];
remove(blogs_dat);
news_dat_red <- news_dat[sample(1:length(news_dat), round(perc_sel*length(news_dat)),replace = FALSE)];
remove(news_dat);
twitter_dat_red <- twitter_dat[sample(1:length(twitter_dat), round(perc_sel*length(twitter_dat)),replace = FALSE)];
remove(twitter_dat);
# Save Reduced Size Text Data in .txt format:
writeLines( blogs_dat_red, con = "final/en_US/reduced_tmp/blogs_dat_red2.txt" , sep = "\n");
writeLines( news_dat_red, con = "final/en_US/reduced_tmp/news_dat_red2.txt" , sep = "\n");
writeLines(twitter_dat_red, con = "final/en_US/reduced_tmp/twitter_dat_red2.txt", sep = "\n");
remove(blogs_dat_red, news_dat_red, twitter_dat_red);
4. Data Cleaning & Preprocessing
library(tm)
library(RWeka)
library(ggplot2)
# Create a Corpus with the reduced size text documents.
eng_text <- Corpus(DirSource("final/en_US/reduced_tmp"),
                   readerControl = list(reader = readPlain, language = "en", load = TRUE));
# Utility Functions
Pattern2Space <- content_transformer(function(x, pattern) gsub(pattern, " ", x));
Pattern2Words <- content_transformer(function(x, old_pattern, new_pattern) gsub(old_pattern, new_pattern, x));
# Conversion to lower case by:
eng_text <- tm_map(eng_text, content_transformer(tolower));
# Removal of punctuation marks by:
eng_text <- tm_map(eng_text, content_transformer(removePunctuation));
# Removal of Numbers by:
eng_text <- tm_map(eng_text, content_transformer(removeNumbers));
# Remove Special Characters and Symbols:
eng_text <- tm_map(eng_text, Pattern2Space, "~|`|#|@|“|”|‘|’|„|…|–+|&|%|_+|•|¦|_|>|<|\\+|\\$|—+|―+|†|‡|€");
# Replace any remaining non-alphabetic characters (anything other than A-Z or a-z) with a space:
eng_text <- tm_map(eng_text, Pattern2Space, "[^A-Za-z]+");
# Removing Extra Whitespaces
eng_text <- tm_map(eng_text, stripWhitespace);
# Profanity Filtering:
con4 <- file("badwords.txt");
bad_mouth <- readLines(con4, n=-1, warn=TRUE, skipNul = TRUE, encoding="UTF-8");
close(con4); remove(con4);
# Removal of bad words:
eng_text <- tm_map(eng_text, removeWords, bad_mouth);
remove(bad_mouth);
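As a quick sanity check of the cleaning pipeline, one can peek at the beginning of one of the cleaned documents (the snippet below is only an illustrative inspection step, not part of the processing itself):
# Illustrative check: show the first 200 characters of the first cleaned document.
substr(paste(as.character(eng_text[[1]]), collapse = " "), 1, 200);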
Utility Function definitions:
# The following function returns a data frame with the frequency counts
# of each term given the Term Document Matrix.
tdm2freqframe <- function(tdm){
    freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE);
    freqframe <- data.frame(word=names(freq), freq=freq);
    return(freqframe);
}
# Create Tokenizer functions:
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2));
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3));
QuadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=4, max=4));
QuintgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=5, max=5));
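As a quick illustrative check (not part of the report's pipeline), the tokenizers can be applied directly to a short character string:
# For example, the bigram tokenizer should split a sentence into overlapping word pairs:
BigramTokenizer("this is a simple test");
# expected output (approximately): "this is" "is a" "a simple" "simple test"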
5. Computation of Term Document Matrices for N-grams.
Term_Doc_Mat_uni1 <- TermDocumentMatrix(eng_text, control=list(wordLengths=c(1,Inf)));
Term_Doc_Mat_uni <- removeSparseTerms(Term_Doc_Mat_uni1, 0.9);
Freq_Frame_uni <- tdm2freqframe(Term_Doc_Mat_uni);
remove(Term_Doc_Mat_uni,Term_Doc_Mat_uni1);
Term_Doc_Mat_bi1 <- TermDocumentMatrix(eng_text, control=list(tokenize=BigramTokenizer))
Term_Doc_Mat_bi <- removeSparseTerms(Term_Doc_Mat_bi1, 0.9);
Freq_Frame_bi <- tdm2freqframe(Term_Doc_Mat_bi);
remove(Term_Doc_Mat_bi,Term_Doc_Mat_bi1);
Term_Doc_Mat_tri1 <- TermDocumentMatrix(eng_text, control=list(tokenize=TrigramTokenizer))
Term_Doc_Mat_tri <- removeSparseTerms(Term_Doc_Mat_tri1, 0.9);
Freq_Frame_tri <- tdm2freqframe(Term_Doc_Mat_tri);
remove(Term_Doc_Mat_tri,Term_Doc_Mat_tri1);
Term_Doc_Mat_quad1 <- TermDocumentMatrix(eng_text, control=list(tokenize=QuadgramTokenizer))
Term_Doc_Mat_quad <- removeSparseTerms(Term_Doc_Mat_quad1, 0.9);
Freq_Frame_quad <- tdm2freqframe(Term_Doc_Mat_quad);
remove(Term_Doc_Mat_quad,Term_Doc_Mat_quad1);
Term_Doc_Mat_quint1 <- TermDocumentMatrix(eng_text, control=list(tokenize=QuintgramTokenizer))
Term_Doc_Mat_quint <- removeSparseTerms(Term_Doc_Mat_quint1, 0.9);
Freq_Frame_quint <- tdm2freqframe(Term_Doc_Mat_quint);
remove(Term_Doc_Mat_quint,Term_Doc_Mat_quint1);
6. Exploratory Analysis - Plotting of the Most Frequent N-grams in the Given English Corpus.
par(mfrow=c(1,2), mar = c(5, 3, 2, 1), cex.axis = 0.7, cex.lab = 0.85, cex.main = 0.9, las = 1)
barplot(Freq_Frame_uni$freq[1:20],names.arg = Freq_Frame_uni$word[1:20],col="cyan",
main = "Top 20 Most frequent Unigrams in Corpus",
xlab = "Frequency", horiz = T);
barplot(Freq_Frame_bi$freq[1:20],names.arg = Freq_Frame_bi$word[1:20],col="magenta",
main = "Top 20 Most frequent Bigrams in Corpus",
xlab = "Frequency", horiz = T);
par(mfrow=c(1,2), mar = c(5, 5, 2, 1), cex.axis = 0.7, cex.lab = 0.85, cex.main = 0.9, las = 1)
barplot(Freq_Frame_tri$freq[1:20],names.arg = Freq_Frame_tri$word[1:20],col="green",
main = "Top 20 Most frequent Trigrams in Corpus",
xlab = "Frequency", horiz = T);
barplot(Freq_Frame_quad$freq[1:20],names.arg = Freq_Frame_quad$word[1:20],col="red",
main = "Top 20 Most frequent Quadgrams in Corpus",
xlab = "Frequency", horiz = T);
7. Calculation of General Corpus Coverage by Unique Words.
# i.e. how many unique words are required to cover a given percent of a frequency sorted
# dictionary:
total_word_instances = sum(Freq_Frame_uni$freq);
percentage = seq(0.025,0.975,by=0.025);
NumberOfUniqueWords = vector(mode = "numeric", length(percentage));
for (k in 1:length(percentage)){
    n = 0;
    # seq_len(n) yields an empty index when n = 0 (1:n would wrongly give c(1, 0)).
    while (sum(Freq_Frame_uni$freq[seq_len(n)]) < percentage[k]*total_word_instances){
        n = n+1;
    }
    NumberOfUniqueWords[k] = n;
}
qplot(NumberOfUniqueWords, 100*percentage, xlab = "Number of Unique Words",
      ylab = "% of Corpus Covered", main = "Corpus Language Coverage");
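Since Freq_Frame_uni is already sorted by decreasing frequency, the same coverage counts could also be obtained without the explicit loops; a vectorized sketch (the _vec suffix is just an illustrative name):
# Equivalent vectorized computation of the coverage counts via a cumulative sum:
cumulative_cover <- cumsum(Freq_Frame_uni$freq) / total_word_instances;
NumberOfUniqueWords_vec <- sapply(percentage, function(p) which(cumulative_cover >= p)[1]);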
8. Some Thoughts and Plans about the Way Ahead…
If the code can be optimized enough for acceptable real-time execution with the available computing resources, the app will provide each model's five most probable next words in descending order of probability, along the lines of the sketch below.
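For instance, a minimal sketch of how the bigram frequency frame could back such a lookup for a one-word context (the function name, the regex-based matching and the top_n default are illustrative assumptions, not the final app code):
# Illustrative sketch: return up to top_n most frequent continuations of last_word,
# relying on Freq_Frame_bi being sorted by decreasing frequency.
predict_next_bigram <- function(last_word, top_n = 5){
    pattern <- paste0("^", last_word, " ");
    hits <- Freq_Frame_bi[grepl(pattern, Freq_Frame_bi$word), ];
    candidates <- sub(pattern, "", head(as.character(hits$word), top_n));
    return(candidates);
}
# Example call (the output depends on the sampled corpus):
predict_next_bigram("thanks");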
I also plan to develop a testing procedure for each model in order to measure its accuracy; a possible outline follows.
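A rough sketch of such a test, assuming a held-out set of sentences and a generic predict_fun(context) that returns up to five candidate words (all names below are illustrative):
# Illustrative sketch of a top-5 accuracy measure on held-out sentences:
# for every word position, check whether the true next word is among the predictions.
top5_accuracy <- function(test_sentences, predict_fun){
    hits <- 0; total <- 0;
    for (s in test_sentences){
        words <- unlist(strsplit(tolower(s), "[^a-z]+"));
        words <- words[words != ""];
        if (length(words) < 2) next;
        for (i in 1:(length(words) - 1)){
            total <- total + 1;
            if (words[i + 1] %in% predict_fun(words[i])) hits <- hits + 1;
        }
    }
    return(hits / total);
}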