Purpose

The goal of this report is to demonstrate familiarity with the data and to outline a path toward the prediction algorithm.

Exploratory analysis consists of:

  1. Getting and reading the data into R
  2. Cleaning the data: removing all non-letter symbols, blank strings and stop words (handled inside helper functions)
  3. Summary statistics
  4. Discussion of the prediction algorithm
  5. Plan for the Shiny app

Getting and reading the data into R

# Load packages used in this chunk (readr for read_lines)
# Other packages used later: dplyr, ggplot2, gridExtra, tm, RWeka, data.table, scales
library(readr)

# path2data points to the directory containing the en_US.* files
# Open connections
con.twitter<-file(paste0(path2data,"en_US.twitter.txt")) 
con.news<-file(paste0(path2data,"en_US.news.txt")) 
con.blogs<-file(paste0(path2data,"en_US.blogs.txt")) 

# Read files
twitter<-read_lines(con.twitter)
news<-read_lines(con.news)
blogs<-read_lines(con.blogs)
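
A quick sanity check (an extra step, not part of the original chunk) confirms the three character vectors loaded; object.size and substr are base R:

# Approximate size of each object in memory and the start of the first tweet
sapply(list(twitter=twitter,news=news,blogs=blogs),function(x) format(object.size(x),units="Mb"))
substr(twitter[1],1,60)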

Summary Statistics

  • Counts of lines (documents) and words by file
summary.counts<-data.frame(category=c("twitter","news","blogs"),lines=NA,words=NA)
summary.counts$lines<-sapply(list(twitter,news,blogs),length)
summary.counts$words<-sapply(list(twitter,news,blogs),count_words)
##   category   lines    words
## 1  twitter 2360148 30373543
## 2     news 1010242 34372530
## 3    blogs  899288 37334131
[Figure: lines and words by file]
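
The counts above also give a rough words-per-line figure (a derived illustration, not part of the original analysis), which anticipates the first observation further below:

# Average words per line (document) for each source
round(summary.counts$words/summary.counts$lines,1)
# ~12.9 for twitter, ~34.0 for news, ~41.5 for blogs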

  • A 5% sample of documents was drawn from each file; these samples were used to determine:

  • Total words in a sample
  • Number of words required to cover 50% of all words in a sample
  • Number of words required to cover 90% of all words in a sample
  • Unique words in a sample (100% coverage)

# Generate a list in which each element corresponds to a unique word and holds one entry
# per occurrence of that word: the number of elements gives the unique-word count, and the
# length of an element gives the count of that word.
# Sampled at 5% of each file; stop words, special characters and numbers are removed.

uni.twitter<-unique_list(twitter,0.05)
uni.news<-unique_list(news,0.05)
uni.blogs<-unique_list(blogs,0.05)

# All words, 50% and 90% coverage, 100% (unique counts)
uni.coverage<-data.frame(category=c("twitter","news","blogs"),Sample5Pct=NA,cover50=NA,cover90=NA,cover100=NA)

uni.coverage$Sample5Pct<-comma(sapply(list(uni.twitter,uni.news,uni.blogs),"[[",2))
uni.coverage$cover50<-comma(sapply(list(uni.twitter,uni.news,uni.blogs),coverage_pct,cov_pct=.5))
uni.coverage$cover90<-comma(sapply(list(uni.twitter,uni.news,uni.blogs),coverage_pct,cov_pct=.9))
uni.coverage$cover100<-comma(sapply(list(uni.twitter,uni.news,uni.blogs),coverage_pct,cov_pct=1))
print(uni.coverage)
##   category Sample5Pct cover50 cover90 cover100
## 1  twitter    926,490     605  28,434  121,083
## 2     news  1,026,130   1,322  25,286  115,756
## 3    blogs  1,000,355   1,007  19,442   97,222
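
These figures come from the coverage_pct helper listed under Functions below; an equivalent cumulative-sum formulation (a sketch only, coverage_cumsum is not used elsewhere in this report) may make the calculation clearer:

# Coverage: the smallest number of distinct words whose summed counts reach the target
# share of all words, counting from the most frequent word down
coverage_cumsum<-function(word_vector,cov_pct=0.5){
  freq<-sort(table(word_vector),decreasing=TRUE)  # word frequencies, most common first
  which(cumsum(freq)>=sum(freq)*cov_pct)[1]       # first rank reaching the target
}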
  • Top 10 most frequent words and 2-word and 3-word combinations. Two approaches were considered for the exploratory analysis:

  • As an experiment, the Twitter file (2% sample) was unlisted into a single word vector and the most frequent 1-, 2- and 3-word combinations were identified

# uni_top10 generates a list of the top 10 n-grams (1-, 2- and 3-word combinations) from a sample

top<-list() #list to store results of uni_top10
all_files<-c("twitter","news","blogs")

for (i in 1:3) {          # loop over files
  for (j in 1:3) {        # loop over n-gram sizes: 1, 2 and 3
    top[[paste0(all_files[i],"_n",j)]]<-uni_top10(get(all_files[i]),.02, gram=j)
  }
}
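
For reference, the loop fills top with nine named vectors, one per file and n-gram size; each holds the counts of that sample's 10 most frequent n-grams.

names(top)   # twitter_n1, twitter_n2, twitter_n3, news_n1, ..., blogs_n3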

# Use twitter file for illustration

# Top 1 for Twitter
twitter.top1<-data.frame(Top=names(top$twitter_n1),Freq=top$twitter_n1)
twitter.top1$Top<-factor(twitter.top1$Top,levels=twitter.top1$Top[order(twitter.top1$Freq,decreasing = F)])

Top1<-ggplot(twitter.top1,aes(x=Top,y=Freq,fill=Top))+geom_bar(stat="identity",color="black")+
  coord_flip()+labs(title="Top 1 Word Terms",x="Freq", y = "Term")+
  scale_fill_manual(values=c('#fff7ec','#ffffcc','#ffeda0','#fed976','#feb24c','#fd8d3c','#fc4e2a','#e31a1c','#bd0026','#800026'))+
  theme(legend.position="none")


# Top 2 for Twitter
twitter.top2<-data.frame(Top=names(top$twitter_n2),Freq=top$twitter_n2)
twitter.top2$Top<-factor(twitter.top2$Top,levels=twitter.top2$Top[order(twitter.top2$Freq,decreasing = F)])

Top2<-ggplot(twitter.top2,aes(x=Top,y=Freq,fill=Top))+geom_bar(stat="identity",color="black")+
  coord_flip()+labs(title="Top 2 Word Terms",x="Freq", y = "Term")+
  scale_fill_manual(values=c('#fff7ec','#ffffcc','#ffeda0','#fed976','#feb24c','#fd8d3c','#fc4e2a','#e31a1c','#bd0026','#800026'))+
  theme(legend.position="none")

# Top 3 for Twitter
twitter.top3<-data.frame(Top=names(top$twitter_n3),Freq=top$twitter_n3)
twitter.top3$Top<-factor(twitter.top3$Top,levels=twitter.top3$Top[order(twitter.top3$Freq,decreasing = F)])

Top3<-ggplot(twitter.top3,aes(x=Top,y=Freq,fill=Top))+geom_bar(stat="identity",color="black")+
  coord_flip()+labs(title="Top 3 Word Terms",x="Freq", y = "Term")+
  scale_fill_manual(values=c('#fff7ec','#ffffcc','#ffeda0','#fed976','#feb24c','#fd8d3c','#fc4e2a','#e31a1c','#bd0026','#800026'))+
  theme(legend.position="none")
grid.arrange(Top1, Top2, Top3, nrow = 1,top="Use R List Functionality to Accumulate Grams")
[Figure: top 10 1-, 2- and 3-word terms from the list-based approach (Twitter 2% sample)]

  • Use the “tm” package on the Twitter file (2% sample) to generate a term-document matrix for the most frequent 1-, 2- and 3-word combinations (n-grams):

Creation of Corpora and cleanup

# Create Corpus
Corpus.twitter<-VCorpus(VectorSource(twitter[rbinom(length(twitter),1,.02)>0]))

# Cleanup

#1. keep only letters and spaces
removeNumSpecial <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
Corpus.twitter<- tm_map(Corpus.twitter, content_transformer(removeNumSpecial))


#2. lower case
Corpus.twitter<-tm_map(Corpus.twitter, content_transformer(tolower))

#3. remove stopwords
Stopwords<-c(stopwords("en"),"rt")
Corpus.twitter<- tm_map(Corpus.twitter, removeWords, Stopwords)

#4. remove whitespace
Corpus.twitter<- tm_map(Corpus.twitter, stripWhitespace)
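
A quick check of the cleanup (an extra step, not in the original chunks) is to look at one transformed document:

# First cleaned tweet: lower case, letters only, stop words removed, single spaces
content(Corpus.twitter[[1]])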

Create charts

# Use tm to chart the most frequent 1-, 2- and 3-gram terms. Function forPlotTop10 returns a ggplot object
Top1.twitter<-forPlotTop10(corpora=Corpus.twitter,ngram=1)
Top2.twitter<-forPlotTop10(corpora=Corpus.twitter,ngram=2)
Top3.twitter<-forPlotTop10(corpora=Corpus.twitter,ngram=3)
grid.arrange(Top1.twitter, Top2.twitter, Top3.twitter, nrow = 1, top="Use 'tm' package to generate Terms/N-grams")
[Figure: top 10 1-, 2- and 3-grams from the tm term-document matrix (Twitter sample)]

Observations:

  • Lines (documents) vs. words: as expected, blogs have the most words per document and Twitter has the fewest
  • Twitter requires relatively few words to cover 50% of all words, yet has the most unique words in the sample
  • Using lists to accumulate 1- and 2-grams produced results comparable to the “tm” package

Plans for the Prediction Algorithm

N-gram models are the most commonly used approach for word prediction. The model looks at a large collection of documents and estimates the relative frequency of the next word given the previous one (bigram) or two (trigram) words. The underlying assumption is that the probability of a word depends only on a few preceding words, i.e. the process is approximately Markov.
The next step is to take the transformed bigram term matrix and return a table of the most likely next words given a first word. The test words are “cant”, “right” and “last”:

# Create dataframe that contains bigrams and their frequency counts
tdm.summary2<-Tdm.biGram(Corpus.twitter)
biGram(tdm.summary2,"cant")
##        Next Freq  Prop
##  1:    wait  345 31.5%
##  2: believe   60  5.5%
##  3:     get   40  3.6%
##  4:    even   28  2.6%
##  5:   sleep   27  2.5%
##  6:    stop   21  1.9%
##  7:   stand   19  1.7%
##  8:      go   18  1.6%
##  9:    help   18  1.6%
## 10:     see   17  1.6%
biGram(tdm.summary2,"right")
##       Next Freq  Prop
##  1:    now  308 40.5%
##  2:   back   18  2.4%
##  3:     im    7  0.9%
##  4:   next    7  0.9%
##  5:  thing    5  0.7%
##  6: middle    5  0.7%
##  7:   just    4  0.5%
##  8:    lol    4  0.5%
##  9:   side    4  0.5%
## 10:    one    4  0.5%
biGram(tdm.summary2,"last")
##       Next Freq  Prop
##  1:  night  223 30.7%
##  2:   year   46  6.3%
##  3:   week   42  5.8%
##  4:    day   31  4.3%
##  5:   time   29  4.0%
##  6:  tweet   16  2.2%
##  7:  years   16  2.2%
##  8: nights   15  2.1%
##  9:   name   13  1.8%
## 10: minute   13  1.8%
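
The Prop column is effectively a maximum-likelihood estimate of P(next word | first word) over the observed bigrams: the count of the bigram divided by the total count of bigrams that start with the first word. A stand-alone toy illustration of the same idea (bigram_mle is a hypothetical helper, not part of the pipeline above):

# Maximum-likelihood bigram estimate: P(next | first) = count(first next) / count(first .)
bigram_mle<-function(words,first){
  bigrams<-paste(head(words,-1),tail(words,-1))   # consecutive word pairs
  starts<-bigrams[head(words,-1)==first]          # bigrams beginning with 'first'
  sort(prop.table(table(sub(paste0("^",first," "),"",starts))),decreasing=TRUE)
}

toy<-c("cant","wait","to","see","you","cant","believe","it","cant","wait")
bigram_mle(toy,"cant")   # wait ~0.67, believe ~0.33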

Shiny App

The Shiny app will consist of a text input field (box). Upon completion of a word (after the user hits “space”), the user will be presented with the 3 most likely next words in a separate area below the main input field, similar to many keyboard apps on today’s mobile platforms.
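
A minimal sketch of that interface, assuming a predict_next() helper (a hypothetical stand-in here for the bigram/trigram lookup described above):

library(shiny)

# Placeholder prediction: in the real app this would call the bigram/trigram lookup
predict_next<-function(text) c("the","to","a")

ui<-fluidPage(
  textInput("txt","Type your text:"),
  # suggestions shown below the input box, as on mobile keyboards
  verbatimTextOutput("suggestions")
)

server<-function(input,output){
  output$suggestions<-renderText({
    # suggest only after a word is completed (trailing space)
    if (grepl("\\s$",input$txt)) paste(predict_next(input$txt),collapse=" | ") else ""
  })
}

shinyApp(ui,server)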

Functions

# Count words: split every line on spaces ("\40" is the octal escape for a space) and count the tokens
count_words<-function(x) length(unlist(strsplit(x,"\40")))
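
A small usage example:

count_words(c("hello world","foo"))   # two lines, three words in total -> 3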

# Create a list whose elements are unique words; count or sort the elements to get unique-word statistics

unique_list<-function(file2read,sample_size){
  file2read<-file2read[rbinom(length(file2read),1,sample_size)>0]
  words<-unlist(strsplit(file2read,"\40"))
  #keeps only letters
  words<-tolower(gsub("([^a-zA-Z])","",words))
  #remove stopwords
  words<-words[!words %in% stopwords("en")]
  #total number of words after exclusions (returned as the second list element)
  len_words<-length(words)
  
  #empty list to store values
  wlist<-list()
  
  # accumulate one entry per occurrence under each word's name
  for (i in words){
    wlist[[i]]<-c(wlist[[i]],i)
  }
  return(list(wlist,len_words))
}
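
A toy call (sample_size = 1 keeps every line, so the result is deterministic):

ul<-unique_list("the cat sat on the mat and the cat slept",1)
sapply(ul[[1]],length)   # per-word counts after stop-word removal: cat 2, sat 1, mat 1, slept 1
ul[[2]]                  # total words after stop-word removal: 5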




# Number of most frequent words needed to cover cov_pct of all words in a sample

coverage_pct<-function(uni_list,cov_pct=.5){
  tt.ordered<-uni_list[[1]][order(sapply(uni_list[[1]],length),decreasing = T)]
  
  coverage_count<-0
  i<-1
  coverage_limit<-uni_list[[2]]*cov_pct
  while(TRUE){
    coverage_count<-coverage_count+length(tt.ordered[[i]])
    if (coverage_count>=coverage_limit) break
    i<-i+1
    
  }
  return(i)
}
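
Continuing the toy example (re-created here so the call is self-contained):

ul<-unique_list("the cat sat on the mat and the cat slept",1)
coverage_pct(ul,cov_pct=0.5)   # 2: "cat" plus one more word covers half of the 5 words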

# Produce the counts of the 10 most frequent n-grams for a sample of an input file

uni_top10<-function(file2read,sample_size, gram=1){
  file2read<-file2read[rbinom(length(file2read),1,sample_size)>0]
  words<-unlist(strsplit(file2read,"\40+"))
  #keeps only letters
  words<-tolower(gsub("([^a-zA-Z])","",words))
  #remove stopwords
  words<-words[!words %in% c(stopwords("en"),"","g","rt","ass")]
  # number of words after exclusions (kept for reference, not used below)
  len_words<-length(words)
  
  # build the n-gram vector by pairing each word with the word(s) that follow it
  # (lead() is from dplyr; the final element(s) pair with NA)
  if (gram==1) {
    uni_lag_vector<-words
  } else if (gram==2) {
    uni_lag_vector<-paste(words,lead(words))
  } else if (gram==3) {
    uni_lag_vector<-paste(words,lead(words),lead(words,2))
  }
  
  # accumulate one entry per occurrence under each n-gram's name
  wlist<-list()
  
  for (i in uni_lag_vector){
    wlist[[i]]<-c(wlist[[i]],i)
  }
  
  wlist<-wlist[order(sapply(wlist,length),decreasing = T)]
  top_list<-sapply(wlist[1:10],length)
  
  return(top_list)
}


#function: take a corpus and return a ggplot object of the top 10 n-gram (1, 2 or 3) terms

forPlotTop10<-function(corpora, ngram){
  tdm.temp<-TermDocumentMatrix(corpora, control=list(tokenize = function(x) NGramTokenizer(x, Weka_control(min=ngram, max=ngram)), wordLengths = c(3,Inf)))
  #data.table of term indices: tdm.temp$i has one entry per (term, document) pair,
  #so .N below counts the number of documents containing each term
  tdm.summary<-data.table(Term.ind=tdm.temp$i)
  #positions of the Top 10 most frequent terms
  Top10.tdm<-tdm.summary[,.("Freq"=.N),by=Term.ind][order(-Freq)][1:10,]
  #dataframe for ggplot
  forplot<-data.frame(Terms=Terms(tdm.temp)[Top10.tdm$Term.ind],Freq=Top10.tdm$Freq)
  #reorder from highest to lowest for ggplot
  forplot$Terms<-factor(forplot$Terms,levels=forplot$Terms[order(forplot$Freq,decreasing = F)])
  #ggplot object
  gplotTop10<-ggplot(forplot,aes(x=Terms,y=Freq,fill=Terms))+geom_bar(stat="identity",color="black")+
    coord_flip()+labs(title=paste0("Top ",ngram, " Word Terms"),x="Term", y = "Freq")+
    scale_fill_manual(values=c('#fff7ec','#ffffcc','#ffeda0','#fed976','#feb24c','#fd8d3c','#fc4e2a','#e31a1c','#bd0026','#800026'))+
    theme(legend.position="none")
  
  return(gplotTop10)
}

# Create the input for biGram: a data.table of bigrams and their frequencies
Tdm.biGram<-function(corpora, ngram=2){
  tdm.temp<-TermDocumentMatrix(corpora, control=list(tokenize = function(x) NGramTokenizer(x, Weka_control(min=ngram, max=ngram)),
                                                            wordLengths = c(3,Inf)))
  #data.table for term matrix aggregation
  tdm.summary<-data.table(Term.ind=tdm.temp$i)
  #frequency of every bigram, ordered from most to least frequent
  tdm.summary<-tdm.summary[,.("Freq"=.N),by=Term.ind][order(-Freq)]
  #data.table of bigrams and their frequencies
  tdm.summary2<-data.table(Terms=Terms(tdm.temp)[tdm.summary$Term.ind],Freq=tdm.summary$Freq,Term.ind=tdm.summary$Term.ind)
  
  #return dataframe that has bigrams and their frequency
  return(tdm.summary2)
}


# Take the data.table from Tdm.biGram and a word; return a table of the most likely next words
biGram<-function(df,word){
  return(df[grep(paste0("(^",word,")(\\s)"),df$Terms)
            ][,`:=`(Prop=percent(prop.table(Freq)),Next=gsub(paste0(word," "),"",Terms))
              ][1:10,.(Next,Freq,Prop)])
}