Summary

This report briefly summarises the exploration done on the sample of text that was provided. Using the insights from the text relationships observed, we propose to build a practical application, as a Shiny app, that helps predict the next word based on the user's input.

Data Exploration

Loading libraries

library(R.utils)
library(dplyr)
library(tm)
library(stringi) 
library(tokenizers)
library(profr)

Source Data

Downloading the source files and unzipping them.
Downloading the profanity list file.

setwd("E://Coursera/capstone")
zipUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
filepath <- "./data/Coursera-SwiftKey.zip"
if(!file.exists(filepath)){
        message("Downloading Text corpus ....")
        download.file(url = zipUrl,destfile = filepath ,mode="wb") 
        unzip(zipfile = filepath)
} else{
        print("Text corpus present, skip downloading")
}
## [1] "Text corpus present, skip downloading"
Url <- "http://www.bannedwordlist.com/lists/swearWords.txt"
filepath <- "./data/swearWords.txt"
if(!file.exists(filepath)){
        message("Downloading profanity list file ....")
        download.file(url = Url,destfile = filepath ,mode="wb") 
} else{
        print("profanity list file present, skip downloading")  
}
## [1] "profanity list file present, skip downloading"

For this report we are selecting the en_US text corpus.
Display all files present in the target directory along with their line counts.

fsrc <- "./data/Coursera-SwiftKey/final/en_US/"
file_list <- dir(fsrc)
file_path_src <- unlist(lapply(fsrc,paste,file_list,sep=""))
line_counts_src <-lapply(file_path_src,countLines)
file_info <- paste("(",file_list," - ",line_counts_src,")",sep="")

The following files are present, with their line counts: (en_US.blogs.txt - 899288), (en_US.news.txt - 1010242), (en_US.twitter.txt - 2360148)
That is a lot of lines to process, so we take a small random sample of lines (roughly 2%, via rbinom with prob = 0.02).

file_samp <- function(fname,Path_src, Path_dest){
        InFile <-paste(Path_src,fname,sep="")
        OutFile<-paste(Path_dest,"samp_",fname,sep="")
        Total_lines <- countLines(InFile)
        con <- file(InFile, "r")
        Block_cnt <- 100
        Block_size <- round(Total_lines[1]/Block_cnt)
        for(i in 1:Block_cnt){
                #Read the next block from the file
                data_in <- readLines(con, Block_size)
                #keep each line with probability 0.02 (~2% sample)
                Index <-1:length(data_in)
                keep_rec<-which(rbinom(Index,1,prob = 0.02)==1)
                #write the sampled lines to the output file
                write(data_in[keep_rec],file =OutFile,append=TRUE)
                #cat("reading block i=",i,"\n") #printing progress
        }
        close(con)
}
fsrc <- "./data/Coursera-SwiftKey/final/en_US/"
fdest <- "./data/text/"
file_list <- dir(fsrc)  #create list of all files in that directory
if(!file.exists(paste(fdest,"samp_",file_list[1],sep=""))){
        lapply(file_list,FUN=file_samp,Path_src=fsrc,Path_dest=fdest)        
}
file_list_samp <- dir(fdest)
file_path_samp <- unlist(lapply(fdest,paste,file_list_samp,sep=""))
line_counts_samp <-lapply(file_path_samp,countLines)
file_info_samp <- paste("(",file_list_samp," - ",line_counts_samp,")",sep="")

Sample files names & their line counts: (samp_en_US.blogs.txt - 35754), (samp_en_US.news.txt - 3372), (samp_en_US.twitter.txt - 94026)

Text Preprocessing - cleaning the data

We now load the data using the tm package and perform the following cleanup.

  • Load data using UTF-8 conversion
  • Remove punctuation (reduces combination overhead)
  • Remove numbers (numbers are hard to predict, so they are dropped)
  • Attempt a sensible translation of Latin characters to ASCII
  • Remove non-character text (strange-looking symbols etc.)
  • Convert all text to lower case (reduces combination overhead)
  • Remove profanity words (list sourced from www.bannedwordlist.com)
fsrc <- "./data/text/"
docs <- Corpus(DirSource(fsrc,encoding = "UTF-8"))
#load swear words
pwords <- "./data/swearWords.txt"
con <- file(pwords, "r")
list_swearWords <- readLines(con)
## Warning in readLines(con): incomplete final line found on './data/
## swearWords.txt'
close(con)
list_swearWords <-stri_trans_tolower(list_swearWords, locale = NULL)
#Text cleaning
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
#Replace and remove Latin/symbol characters
for(j in seq(docs)){
        docs[[j]]$content <- docs[[j]]$content %>% 
                #stri_trans_general("latin-ascii") %>% #sensible text conversion to ascii
                stri_replace_all_regex("\\p{S}|\\p{N}|\\p{C}", " ") %>% #remove symbols, numbers and pesky \uxxxx characters
                stri_trans_tolower() #lower case conversion
        docs[[j]]$content <- gsub('[^a-z ]', '', docs[[j]]$content, perl = TRUE) #keep only letters and spaces
}
docs <- tm_map(docs, removeWords, list_swearWords)

It is worth noting that, when creating unigrams, stop words do not provide much information. Hence we split the processing as follows:

  • For unigrams we remove the stop words.
  • For bigrams and higher n-grams, stop words are retained.
  • Extra white space is stripped in both cases.
docs_uni <- tm_map(docs, removeWords, stopwords("en"))

docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, PlainTextDocument)

docs_uni <- tm_map(docs_uni, stripWhitespace)
docs_uni <- tm_map(docs_uni, PlainTextDocument)

Statistics - Unigrams

Let's explore the most frequently appearing words in the text. To optimize processing, the per-file frequency tables were combined and the total frequencies recalculated.

#unigrams
lines <- list()
words <- list()
#wd_count(): stemmed word frequency table for document j, sorted by decreasing frequency
wd_count <- function(docs,j){
        lines <- docs[[j]]$content
        words<- unlist(tokenize_word_stems(lines))
        wd_tb <- table(words)
        wd_tb <- wd_tb[order(-wd_tb)]
        return(as.data.frame(wd_tb,stringsAsFactors = FALSE))
}
wd_tb1 <- wd_count(docs_uni,1)
wd_tb2 <- wd_count(docs_uni,2)
wd_tb3 <- wd_count(docs_uni,3)

wd_tb_all <- rbind(wd_tb1,wd_tb2,wd_tb3) %>%
        group_by(words) %>%
        summarise(Cnt = sum(Freq)) %>%
        arrange(desc(Cnt))
## Warning: failed to assign NativeSymbolInfo for env since env is already
## defined in the 'lazyeval' namespace
rm(list=c("lines","words","wd_tb1","wd_tb2","wd_tb3"))

Plot the frequencies of unigrams

word_cnt <- wd_tb_all$Cnt
names(word_cnt) <- wd_tb_all$words
barplot(word_cnt[1:15] , main="Most frequent word Distribution", 
        xlab="Stemmed words", ylab="Occurrence")  

Calculating relative percentages

Number of words needed to account for 50% and 90% of all word occurrences.

word_cumsum <- cumsum(wd_tb_all$Cnt)
word_cumsum_relfreq <- word_cumsum/sum(wd_tb_all$Cnt)
words_total <- nrow(wd_tb_all)
words_half  <- wd_tb_all$words[which(word_cumsum_relfreq <= 0.5)] #cover half of word occurrences
words_ninety <- wd_tb_all$words[which(word_cumsum_relfreq <= 0.9)] #cover 90% of word occurrences

Number of words to include for 50% total coverage - 332 out of 50208 total words
Number of words to include for 90% total coverage - 5916 out of 50208 total words
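
The counts above follow directly from the objects computed in the previous chunk; a minimal sketch of the equivalent calculation (assuming words_half, words_ninety and words_total are still in memory):

length(words_half)    #words needed to cover 50% of occurrences
length(words_ninety)  #words needed to cover 90% of occurrences
words_total           #total number of distinct stemmed words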

Bigram & Trigram Word distributions

Calculations

#ngram_count(): n-gram frequency table for document j, sorted by decreasing frequency
ngram_count <- function(docs,j,ngram){
    lines <- docs[[j]]$content
    lines <- gsub('[^a-z ]', '',lines, perl = TRUE)
    words<- unlist(tokenize_ngrams(lines,n=ngram,n_min=ngram))
    wd_tb <- table(words)
    wd_tb <- wd_tb[order(-wd_tb)]
    return(as.data.frame(wd_tb,stringsAsFactors = FALSE))
}
#bigram calculations
ngram_tb1 <- ngram_count(docs,1,2)
ngram_tb2 <- ngram_count(docs,2,2)
ngram_tb3 <- ngram_count(docs,3,2)
bigram_tb_all <- rbind(ngram_tb1,ngram_tb2,ngram_tb3) %>%
    group_by(words) %>%
        summarise(Cnt = sum(Freq)) %>%
        arrange(desc(Cnt))
## Warning: failed to assign NativeSymbolInfo for env since env is already
## defined in the 'lazyeval' namespace
rm(list=c("ngram_tb1","ngram_tb2","ngram_tb3")) 
#trigram calculations
ngram_tb1 <- ngram_count(docs,1,3)
ngram_tb2 <- ngram_count(docs,2,3)
ngram_tb3 <- ngram_count(docs,3,3)
trigram_tb_all <- rbind(ngram_tb1,ngram_tb2,ngram_tb3) %>%
    group_by(words) %>%
        summarise(Cnt = sum(Freq)) %>%
        arrange(desc(Cnt))
rm(list=c("ngram_tb1","ngram_tb2","ngram_tb3")) 

Number of non-unique bigrams: 2
Number of non-unique trigrams: 2

word_cnt <- bigram_tb_all$Cnt
names(word_cnt) <- bigram_tb_all$words
barplot(word_cnt[1:15] ,main="Most frequent word pair Distribution",
xlab="Word Pairs", ylab="Occurrence",las=2)

word_cnt <- trigram_tb_all$Cnt
names(word_cnt) <- trigram_tb_all$words
barplot(word_cnt[1:15] ,main="Most frequent word triplet Distribution",
xlab="Word Triplets", ylab="Occurrence",las=2)

Ideas for Text Prediction

I intend to take the n-grams above (uni-, bi- and trigrams) and turn them into Markov-style or hash dictionaries for lookups.
Only combinations of up to three words will be used, due to computational limitations.
I also intend to append a list of English words to the unigram dictionary. The proposed flow of the algorithm is as follows:
for a multi-word input, the next word is fetched by looking up the last words of the input (up to two, for the trigram table); if the lookup fails at any point, the model backs off to the bigram table, and if that also fails, the prediction falls back to the unigram frequencies. I am still open on which statistical prediction method to use.
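
As a rough illustration of this backoff idea, below is a minimal sketch in R. It assumes the trigram_tb_all, bigram_tb_all and wd_tb_all tables built above (n-grams stored as space-separated strings in the words column, counts in Cnt); the helper name predict_next_word and the plain frequency ranking are placeholders, not the final method.

#Sketch of a backoff-style next-word lookup over the n-gram tables built above
#predict_next_word() is a hypothetical helper, not part of any package
predict_next_word <- function(input, trigrams, bigrams, unigrams, top_n = 3){
        #clean the input roughly the same way the corpus was cleaned
        tokens <- unlist(stri_extract_all_words(stri_trans_tolower(input)))
        tokens <- tokens[!is.na(tokens)]
        n <- length(tokens)
        #trigram lookup: match on the last two words of the input
        if(n >= 2){
                prefix <- paste(tokens[n-1], tokens[n])
                hits <- trigrams %>%
                        filter(stri_startswith_fixed(words, paste0(prefix, " "))) %>%
                        arrange(desc(Cnt)) %>% head(top_n)
                if(nrow(hits) > 0) return(stri_extract_last_words(hits$words))
        }
        #back off to the bigram lookup: match on the last word only
        if(n >= 1){
                hits <- bigrams %>%
                        filter(stri_startswith_fixed(words, paste0(tokens[n], " "))) %>%
                        arrange(desc(Cnt)) %>% head(top_n)
                if(nrow(hits) > 0) return(stri_extract_last_words(hits$words))
        }
        #final fallback: the most frequent unigrams
        head(unigrams$words, top_n)
}
#example call (assumes the tables from the chunks above are in memory)
predict_next_word("thanks for the", trigram_tb_all, bigram_tb_all, wd_tb_all)

In practice the unigram fallback would probably use a table built with stop words retained (wd_tb_all above is stemmed and has stop words removed), and the frequency ranking could be swapped for whichever statistical method is finally chosen (e.g. stupid backoff or Kneser-Ney smoothing).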