This report is a brief summary of the exploration done on the provided sample of text. Using the insights from the text relationships observed, we propose to build a practical application as a Shiny app that helps predict the next word from the words entered so far.
library(R.utils)      # countLines()
library(dplyr)        # grouping and summarising frequency tables
library(tm)           # corpus handling and text cleaning
library(stringi)      # string processing
library(tokenizers)   # word, stem and n-gram tokenization
library(profr)        # profiling
Download the source text files and unzip them.
Download the profanity word list.
setwd("E://Coursera/capstone")
zipUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
filepath <- "./data/Coursera-SwiftKey.zip"
if(!dir.exists("./data")) dir.create("./data")
if(!file.exists(filepath)){
  message("Downloading Text corpus ....")
  download.file(url = zipUrl, destfile = filepath, mode = "wb")
  # Extract into the directory referenced later in the report
  unzip(zipfile = filepath, exdir = "./data/Coursera-SwiftKey")
} else{
  print("Text corpus present, skip downloading")
}
## [1] "Text corpus present, skip downloading"
Url <- "http://www.bannedwordlist.com/lists/swearWords.txt"
filepath <- "./data/swearWords.txt"
if(!file.exists(filepath)){
  message("Downloading profanity list file ....")
  download.file(url = Url, destfile = filepath, mode = "wb")
} else{
  print("profanity list file present, skip downloading")
}
## [1] "profanity list file present, skip downloading"
For this report we select the en_US text corpus.
Display all files present in the target directory along with their line counts and sizes.
fsrc <- "./data/Coursera-SwiftKey/final/en_US/"
file_list <- dir(fsrc)
file_path_src <- paste0(fsrc, file_list)
line_counts_src <- lapply(file_path_src, countLines)
file_info <- paste0("(", file_list, " - ", line_counts_src, ")")
The following files are present, with their line counts: (en_US.blogs.txt - 899288), (en_US.news.txt - 1010242), (en_US.twitter.txt - 2360148)
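The summary above only reports line counts; a minimal sketch of how file sizes could be added as well, reusing the file_path_src vector built above (file.size() is base R and returns bytes; the name file_info_sz is illustrative):
size_mb_src <- round(file.size(file_path_src)/1024^2, 1)   # size of each file in MB
file_info_sz <- paste0("(", file_list, " - ", line_counts_src, " lines, ", size_mb_src, " MB)")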
That is a lot of lines to process, so we take a small random sample of each file using rbinom (the code below keeps roughly 2% of the lines).
file_samp <- function(fname, Path_src, Path_dest){
  InFile  <- paste(Path_src, fname, sep = "")
  OutFile <- paste(Path_dest, "samp_", fname, sep = "")
  Total_lines <- countLines(InFile)
  con <- file(InFile, "r")
  Block_cnt <- 100
  Block_size <- ceiling(Total_lines[1]/Block_cnt)
  for(i in 1:Block_cnt){
    # Read the next block from the file
    data_in <- readLines(con, Block_size)
    # Keep roughly 2% of the lines in this block
    keep_rec <- which(rbinom(length(data_in), 1, prob = 0.02) == 1)
    # Append the selected lines to the sample file
    write(data_in[keep_rec], file = OutFile, append = TRUE)
    # cat("reading block i=", i, "\n")  # print progress
  }
  close(con)
}
fsrc  <- "./data/Coursera-SwiftKey/final/en_US/"
fdest <- "./data/text/"
file_list <- dir(fsrc)   # list of all files in the source directory
if(!dir.exists(fdest)) dir.create(fdest, recursive = TRUE)
# Sample each file only if the sample files are not already present
if(!file.exists(paste(fdest, "samp_", file_list[1], sep = ""))){
  lapply(file_list, FUN = file_samp, Path_src = fsrc, Path_dest = fdest)
}
file_list_samp <- dir(fdest)
file_path_samp <- paste0(fdest, file_list_samp)
line_counts_samp <- lapply(file_path_samp, countLines)
file_info_samp <- paste0("(", file_list_samp, " - ", line_counts_samp, ")")
Sample file names and their line counts: (samp_en_US.blogs.txt - 35754), (samp_en_US.news.txt - 3372), (samp_en_US.twitter.txt - 94026)
We now load the data using the tm package and perform the following cleanup.
fsrc <- "./data/text/"
docs <- Corpus(DirSource(fsrc,encoding = "UTF-8"))
#load swear words
pwords <- "./data/swearWords.txt"
con <- file(pwords, "r")
list_swearWords <- readLines(con)
## Warning in readLines(con): incomplete final line found on './data/
## swearWords.txt'
close(con)
list_swearWords <-stri_trans_tolower(list_swearWords, locale = NULL)
#Text cleaning
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
# Remove symbols, numbers and control/Unicode characters, lowercase,
# then strip anything that is not a plain letter or space
for(j in seq(docs)){
  cleaned <- docs[[j]]$content %>%
    #stri_trans_general("latin-ascii") %>%  # sensible text conversion to ASCII
    stri_replace_all_regex("\\p{S}|\\p{N}|\\p{C}", " ") %>%  # remove pesky \uxxxx characters
    stri_trans_tolower()
  docs[[j]]$content <- gsub('[^a-z ]', '', cleaned, perl = TRUE)
}
docs <- tm_map(docs, removeWords, list_swearWords)
It is worth noting that stop words do not provide much information for unigrams. Hence we split the processing: stop words are removed only for the unigram corpus (docs_uni), while the full corpus (docs) is kept for bigrams and trigrams.
docs_uni <- tm_map(docs, removeWords, stopwords("en"))
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, PlainTextDocument)
docs_uni <- tm_map(docs_uni, stripWhitespace)
docs_uni <- tm_map(docs_uni, PlainTextDocument)
Let's explore the most frequently appearing words in the text. To optimize processing, the per-file frequency tables are joined and the total frequencies recalculated.
#unigrams
lines <- list()
words <- list()
wd_count <- function(docs, j){
  lines <- docs[[j]]$content
  words <- unlist(tokenize_word_stems(lines))
  wd_tb <- table(words)
  wd_tb <- wd_tb[order(-wd_tb)]
  return(as.data.frame(wd_tb, stringsAsFactors = FALSE))
}
wd_tb1 <- wd_count(docs_uni,1)
wd_tb2 <- wd_count(docs_uni,2)
wd_tb3 <- wd_count(docs_uni,3)
wd_tb_all <- rbind(wd_tb1,wd_tb2,wd_tb3) %>%
group_by(words) %>%
summarise(Cnt = sum(Freq)) %>%
arrange(desc(Cnt))
## Warning: failed to assign NativeSymbolInfo for env since env is already
## defined in the 'lazyeval' namespace
rm(list=c("lines","words","wd_tb1","wd_tb2","wd_tb3"))
Plot the frequencies of the most common unigrams.
word_cnt <- wd_tb_all$Cnt
names(word_cnt) <- wd_tb_all$words
barplot(word_cnt[1:15], main = "Most frequent word distribution",
        xlab = "Stemmed words", ylab = "Occurrences")
Number of words needed to account for 50% and 90% of all word occurrences.
word_cumsum <- cumsum(wd_tb_all$Cnt)
word_cumsum_relfreq <- word_cumsum/sum(wd_tb_all$Cnt)
words_total <- nrow(wd_tb_all)
words_half  <- wd_tb_all$words[which(word_cumsum_relfreq <= 0.5)]  # cover 50% of word occurrences
words_ninty <- wd_tb_all$words[which(word_cumsum_relfreq <= 0.9)]  # cover 90% of word occurrences
Number of words to include for 50% total coverage - 332 out of 50208 total words
Number of words to include for 90% total coverage - 5916 out of 50208 total words
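A brief sketch of how these coverage figures can be read off the objects computed above (assuming the quoted numbers were taken from these lengths):
length(words_half)    # words covering 50% of occurrences (reported as 332)
length(words_ninty)   # words covering 90% of occurrences (reported as 5916)
words_total           # total distinct stemmed words (reported as 50208)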
N-gram calculations
ngram_count <- function(docs, j, ngram){
  lines <- docs[[j]]$content
  lines <- gsub('[^a-z ]', '', lines, perl = TRUE)
  words <- unlist(tokenize_ngrams(lines, n = ngram, n_min = ngram))
  wd_tb <- table(words)
  wd_tb <- wd_tb[order(-wd_tb)]
  return(as.data.frame(wd_tb, stringsAsFactors = FALSE))
}
#bigram calculations
ngram_tb1 <- ngram_count(docs,1,2)
ngram_tb2 <- ngram_count(docs,2,2)
ngram_tb3 <- ngram_count(docs,3,2)
bigram_tb_all <- rbind(ngram_tb1,ngram_tb2,ngram_tb3) %>%
group_by(words) %>%
summarise(Cnt = sum(Freq)) %>%
arrange(desc(Cnt))
## Warning: failed to assign NativeSymbolInfo for env since env is already
## defined in the 'lazyeval' namespace
rm(list=c("ngram_tb1","ngram_tb2","ngram_tb3"))
#trigram calculations
ngram_tb1 <- ngram_count(docs,1,3)
ngram_tb2 <- ngram_count(docs,2,3)
ngram_tb3 <- ngram_count(docs,3,3)
trigram_tb_all <- rbind(ngram_tb1,ngram_tb2,ngram_tb3) %>%
group_by(words) %>%
summarise(Cnt = sum(Freq)) %>%
arrange(desc(Cnt))
rm(list=c("ngram_tb1","ngram_tb2","ngram_tb3"))
Number of non-unique bigrams: 2
Number of non-unique trigrams: 2
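These figures were presumably produced inline from the combined tables; one way they could be computed, assuming "non-unique" means n-grams observed more than once in the sample:
sum(bigram_tb_all$Cnt > 1)    # bigrams that occur more than once
sum(trigram_tb_all$Cnt > 1)   # trigrams that occur more than once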
word_cnt <- bigram_tb_all$Cnt
names(word_cnt) <- bigram_tb_all$words
barplot(word_cnt[1:15], main = "Most frequent word pair distribution",
        xlab = "Word pairs", ylab = "Occurrences", las = 2)
word_cnt <- trigram_tb_all$Cnt
names(word_cnt) <- trigram_tb_all$words
barplot(word_cnt[1:15], main = "Most frequent three-word combination distribution",
        xlab = "Three-word combinations", ylab = "Occurrences", las = 2)
I intend to turn the above n-grams (uni-, bi- and tri-grams) into Markov or hash dictionaries for lookups.
Only combinations of up to three words are used, due to computational limitations.
I also intend to append a list of English words to the unigram dictionary. The proposed flow of the algorithm is as follows:
for a multi-word input, the next word is fetched by looking up the last two words of the input in the trigram table; if that lookup fails, the algorithm backs off to the bigram table (last word only), and if that also fails, the prediction falls back to the unigram frequencies. I am still open on which statistical prediction method to use.
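A minimal sketch of this backoff lookup, assuming the combined trigram and bigram tables above are split into a prefix (all words but the last) and the predicted next word; the names split_ngrams, predict_next, prefix and nextw are illustrative and not part of the code above:
# Split an n-gram table into (prefix, next word) lookup columns
split_ngrams <- function(tb){
  parts <- stri_split_fixed(tb$words, " ")
  tb$prefix <- vapply(parts, function(p) paste(head(p, -1), collapse = " "), character(1))
  tb$nextw  <- vapply(parts, function(p) tail(p, 1), character(1))
  tb[order(-tb$Cnt), ]
}
tri_lkp <- split_ngrams(as.data.frame(trigram_tb_all))
bi_lkp  <- split_ngrams(as.data.frame(bigram_tb_all))
# Backoff: try the last two words in the trigram table, then the last word
# in the bigram table, then fall back to the most frequent unigram
predict_next <- function(input){
  w <- unlist(stri_split_fixed(stri_trans_tolower(input), " "))
  w <- w[w != ""]
  if(length(w) >= 2){
    hit <- tri_lkp$nextw[tri_lkp$prefix == paste(tail(w, 2), collapse = " ")]
    if(length(hit) > 0) return(hit[1])
  }
  if(length(w) >= 1){
    hit <- bi_lkp$nextw[bi_lkp$prefix == tail(w, 1)]
    if(length(hit) > 0) return(hit[1])
  }
  wd_tb_all$words[1]   # most frequent (stemmed) unigram as a last resort
}
predict_next("thanks for the")   # example lookup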