Purpose

The goal of this report is to demonstrate familiarity with the data and to outline a path toward the prediction algorithm.

Exploratory analysis consists of:

  1. Getting and reading the data into R
  2. Cleaning the data: removing all non-letter symbols, blank strings and stop words (handled inside helper functions)
  3. Summary statistics
  4. Discussion of the prediction algorithm
  5. Plan for the Shiny app

Getting and reading the data into R

# Load packages used in this chunk (readr for read_lines)
# Other packages used later: dplyr, ggplot2, gridExtra, tm, RWeka, data.table, scales
library(readr)

# path2data points to the directory containing the en_US.* files
# Open connections
con.twitter<-file(paste0(path2data,"en_US.twitter.txt")) 
con.news<-file(paste0(path2data,"en_US.news.txt")) 
con.blogs<-file(paste0(path2data,"en_US.blogs.txt")) 

# Read files
twitter<-read_lines(con.twitter)
news<-read_lines(con.news)
blogs<-read_lines(con.blogs)
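
A quick sanity check (an extra step, not part of the original chunk) confirms the three character vectors loaded; object.size and substr are base R:

# Approximate size of each object in memory and the start of the first tweet
sapply(list(twitter=twitter,news=news,blogs=blogs),function(x) format(object.size(x),units="Mb"))
substr(twitter[1],1,60)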

Summary Statistics

  • Counts of lines (documents) and words by file
summary.counts<-data.frame(category=c("twitter","news","blogs"),lines=NA,words=NA)
summary.counts$lines<-sapply(list(twitter,news,blogs),length)
summary.counts$words<-sapply(list(twitter,news,blogs),count_words)
##   category   lines    words
## 1  twitter 2360148 30373543
## 2     news 1010242 34372530
## 3    blogs  899288 37334131
[Figure: lines and words by file]
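
The counts above also give a rough words-per-line figure (a derived illustration, not part of the original analysis), which anticipates the first observation further below:

# Average words per line (document) for each source
round(summary.counts$words/summary.counts$lines,1)
# ~12.9 for twitter, ~34.0 for news, ~41.5 for blogs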

  • A 5% sample of documents was drawn from each file; these samples were used to determine:

  • Total words in a sample
  • Number of words required to cover 50% of all words in a sample
  • Number of words required to cover 90% of all words in a sample
  • Unique words in a sample (100% coverage)

# Generate a list in which each element corresponds to a unique word and holds one entry
# per occurrence of that word: the number of elements gives the unique-word count, and the
# length of an element gives the count of that word.
# Sampled at 5% of each file; stop words, special characters and numbers are removed.

uni.twitter<-unique_list(twitter,0.05)
uni.news<-unique_list(news,0.05)
uni.blogs<-unique_list(blogs,0.05)

# All words, 50% and 90% coverage, 100% (unique counts)
uni.coverage<-data.frame(category=c("twitter","news","blogs"),Sample5Pct=NA,cover50=NA,cover90=NA,cover100=NA)

uni.coverage$Sample5Pct<-comma(sapply(list(uni.twitter,uni.news,uni.blogs),"[[",2))
uni.coverage$cover50<-comma(sapply(list(uni.twitter,uni.news,uni.blogs),coverage_pct,cov_pct=.5))
uni.coverage$cover90<-comma(sapply(list(uni.twitter,uni.news,uni.blogs),coverage_pct,cov_pct=.9))
uni.coverage$cover100<-comma(sapply(list(uni.twitter,uni.news,uni.blogs),coverage_pct,cov_pct=1))
print(uni.coverage)
##   category Sample5Pct cover50 cover90 cover100
## 1  twitter    926,490     605  28,434  121,083
## 2     news  1,026,130   1,322  25,286  115,756
## 3    blogs  1,000,355   1,007  19,442   97,222
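
These figures come from the coverage_pct helper listed under Functions below; an equivalent cumulative-sum formulation (a sketch only, coverage_cumsum is not used elsewhere in this report) may make the calculation clearer:

# Coverage: the smallest number of distinct words whose summed counts reach the target
# share of all words, counting from the most frequent word down
coverage_cumsum<-function(word_vector,cov_pct=0.5){
  freq<-sort(table(word_vector),decreasing=TRUE)  # word frequencies, most common first
  which(cumsum(freq)>=sum(freq)*cov_pct)[1]       # first rank reaching the target
}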
  • Top 10 most frequent words and 2-word and 3-word combinations. Two approaches were considered for the exploratory analysis:

  • As an experiment, the Twitter file (2% sample) was unlisted into a single word vector and the most frequent 1-, 2- and 3-word combinations were identified

# uni_top10 generates a list of the top 10 n-grams (1-, 2- and 3-word combinations) from a sample

top<-list() #list to store results of uni_top10
all_files<-c("twitter","news","blogs")

for (i in 1:3) {          # loop over files
  for (j in 1:3) {        # loop over n-gram sizes: 1, 2 and 3
    top[[paste0(all_files[i],"_n",j)]]<-uni_top10(get(all_files[i]),.02, gram=j)
  }
}
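
For reference, the loop fills top with nine named vectors, one per file and n-gram size; each holds the counts of that sample's 10 most frequent n-grams.

names(top)   # twitter_n1, twitter_n2, twitter_n3, news_n1, ..., blogs_n3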

# Use twitter file for illustration

# Top 1 for Twitter
twitter.top1<-data.frame(Top=names(top$twitter_n1),Freq=top$twitter_n1)
twitter.top1$Top<-factor(twitter.top1$Top,levels=twitter.top1$Top[order(twitter.top1$Freq,decreasing = F)])

Top1<-ggplot(twitter.top1,aes(x=Top,y=Freq,fill=Top))+geom_bar(stat="identity",color="black")+
  coord_flip()+labs(title="Top 1 Word Terms",x="Freq", y = "Term")+
  scale_fill_manual(values=c('#fff7ec','#ffffcc','#ffeda0','#fed976','#feb24c','#fd8d3c','#fc4e2a','#e31a1c','#bd0026','#800026'))+
  theme(legend.position="none")


# Top 2 for Twitter
twitter.top2<-data.frame(Top=names(top$twitter_n2),Freq=top$twitter_n2)
twitter.top2$Top<-factor(twitter.top2$Top,levels=twitter.top2$Top[order(twitter.top2$Freq,decreasing = F)])

Top2<-ggplot(twitter.top2,aes(x=Top,y=Freq,fill=Top))+geom_bar(stat="identity",color="black")+
  coord_flip()+labs(title="Top 2 Word Terms",x="Freq", y = "Term")+
  scale_fill_manual(values=c('#fff7ec','#ffffcc','#ffeda0','#fed976','#feb24c','#fd8d3c','#fc4e2a','#e31a1c','#bd0026','#800026'))+
  theme(legend.position="none")

# Top 3 for Twitter
twitter.top3<-data.frame(Top=names(top$twitter_n3),Freq=top$twitter_n3)
twitter.top3$Top<-factor(twitter.top3$Top,levels=twitter.top3$Top[order(twitter.top3$Freq,decreasing = F)])

Top3<-ggplot(twitter.top3,aes(x=Top,y=Freq,fill=Top))+geom_bar(stat="identity",color="black")+
  coord_flip()+labs(title="Top 3 Word Terms",x="Freq", y = "Term")+
  scale_fill_manual(values=c('#fff7ec','#ffffcc','#ffeda0','#fed976','#feb24c','#fd8d3c','#fc4e2a','#e31a1c','#bd0026','#800026'))+
  theme(legend.position="none")
grid.arrange(Top1, Top2, Top3, nrow = 1,top="Use R List Functionality to Accumulate Grams")
[Figure: top 10 1-, 2- and 3-word terms from the list-based approach (Twitter 2% sample)]

  • Use the “tm” package on the Twitter file (2% sample) to generate a term-document matrix for the most frequent 1-, 2- and 3-word combinations (n-grams):

Creation of Corpora and cleanup

# Create Corpus
Corpus.twitter<-VCorpus(VectorSource(twitter[rbinom(length(twitter),1,.02)>0]))

# Cleanup

#1. keep only letters and spaces
removeNumSpecial <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
Corpus.twitter<- tm_map(Corpus.twitter, content_transformer(removeNumSpecial))


#2. lower case
Corpus.twitter<-tm_map(Corpus.twitter, content_transformer(tolower))

#3. remove stopwords
Stopwords<-c(stopwords("en"),"rt")
Corpus.twitter<- tm_map(Corpus.twitter, removeWords, Stopwords)

#4. remove whitespace
Corpus.twitter<- tm_map(Corpus.twitter, stripWhitespace)
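
A quick check of the cleanup (an extra step, not in the original chunks) is to look at one transformed document:

# First cleaned tweet: lower case, letters only, stop words removed, single spaces
content(Corpus.twitter[[1]])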

Create charts

# Use tm to chart the most frequent 1-, 2- and 3-gram terms. Function forPlotTop10 returns a ggplot object
Top1.twitter<-forPlotTop10(corpora=Corpus.twitter,ngram=1)
Top2.twitter<-forPlotTop10(corpora=Corpus.twitter,ngram=2)
Top3.twitter<-forPlotTop10(corpora=Corpus.twitter,ngram=3)
grid.arrange(Top1.twitter, Top2.twitter, Top3.twitter, nrow = 1, top="Use 'tm' package to generate Terms/N-grams")
[Figure: top 10 1-, 2- and 3-grams from the tm term-document matrix (Twitter sample)]

Observations:

  • Lines (documents) vs. words: as expected, blogs have the most words per document and Twitter has the fewest
  • Twitter requires relatively few words to cover 50% of all words, yet has the most unique words in the sample
  • Using lists to accumulate 1- and 2-grams produced results comparable to the “tm” package

Plans for the Prediction Algorithm

N-gram models are the most commonly used approach for word prediction. The model looks at a large collection of documents and estimates the relative frequency of the next word given the previous one (bigram) or two (trigram) words. The underlying assumption is that the probability of a word depends only on a few preceding words, i.e. the process is approximately Markov.
The next step is to take the transformed bigram term matrix and return a table of the most likely next words given a first word. The test words are “cant”, “right” and “last”:

# Create dataframe that contains bigrams and their frequency counts
tdm.summary2<-Tdm.biGram(Corpus.twitter)
biGram(tdm.summary2,"cant")
##        Next Freq  Prop
##  1:    wait  345 31.5%
##  2: believe   60  5.5%
##  3:     get   40  3.6%
##  4:    even   28  2.6%
##  5:   sleep   27  2.5%
##  6:    stop   21  1.9%
##  7:   stand   19  1.7%
##  8:      go   18  1.6%
##  9:    help   18  1.6%
## 10:     see   17  1.6%
biGram(tdm.summary2,"right")
##       Next Freq  Prop
##  1:    now  308 40.5%
##  2:   back   18  2.4%
##  3:     im    7  0.9%
##  4:   next    7  0.9%
##  5:  thing    5  0.7%
##  6: middle    5  0.7%
##  7:   just    4  0.5%
##  8:    lol    4  0.5%
##  9:   side    4  0.5%
## 10:    one    4  0.5%
biGram(tdm.summary2,"last")
##       Next Freq  Prop
##  1:  night  223 30.7%
##  2:   year   46  6.3%
##  3:   week   42  5.8%
##  4:    day   31  4.3%
##  5:   time   29  4.0%
##  6:  tweet   16  2.2%
##  7:  years   16  2.2%
##  8: nights   15  2.1%
##  9:   name   13  1.8%
## 10: minute   13  1.8%
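
The Prop column is effectively a maximum-likelihood estimate of P(next word | first word) over the observed bigrams: the count of the bigram divided by the total count of bigrams that start with the first word. A stand-alone toy illustration of the same idea (bigram_mle is a hypothetical helper, not part of the pipeline above):

# Maximum-likelihood bigram estimate: P(next | first) = count(first next) / count(first .)
bigram_mle<-function(words,first){
  bigrams<-paste(head(words,-1),tail(words,-1))   # consecutive word pairs
  starts<-bigrams[head(words,-1)==first]          # bigrams beginning with 'first'
  sort(prop.table(table(sub(paste0("^",first," "),"",starts))),decreasing=TRUE)
}

toy<-c("cant","wait","to","see","you","cant","believe","it","cant","wait")
bigram_mle(toy,"cant")   # wait ~0.67, believe ~0.33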

Shiny App

The Shiny app will consist of a text input field (box). Upon completion of a word (after the user hits “space”), the user will be presented with the 3 most likely next words in a separate area below the main input field, similar to many keyboard apps on today’s mobile platforms.
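
A minimal sketch of that interface, assuming a predict_next() helper (a hypothetical stand-in here for the bigram/trigram lookup described above):

library(shiny)

# Placeholder prediction: in the real app this would call the bigram/trigram lookup
predict_next<-function(text) c("the","to","a")

ui<-fluidPage(
  textInput("txt","Type your text:"),
  # suggestions shown below the input box, as on mobile keyboards
  verbatimTextOutput("suggestions")
)

server<-function(input,output){
  output$suggestions<-renderText({
    # suggest only after a word is completed (trailing space)
    if (grepl("\\s$",input$txt)) paste(predict_next(input$txt),collapse=" | ") else ""
  })
}

shinyApp(ui,server)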

Functions

# Count words: split every line on spaces ("\40" is the octal escape for a space) and count the tokens
count_words<-function(x) length(unlist(strsplit(x,"\40")))
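
A small usage example:

count_words(c("hello world","foo"))   # two lines, three words in total -> 3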

# Create a list whose elements are unique words; count or sort the elements to get unique-word statistics

unique_list<-function(file2read,sample_size){
  file2read<-file2read[rbinom(length(file2read),1,sample_size)>0]
  words<-unlist(strsplit(file2read,"\40"))
  #keeps only letters
  words<-tolower(gsub("([^a-zA-Z])","",words))
  #remove stopwords
  words<-words[!words %in% stopwords("en")]
  #total number of words after exclusions (returned as the second list element)
  len_words<-length(words)
  
  #empty list to store values
  wlist<-list()
  
  # accumulate one entry per occurrence under each word's name
  for (i in words){
    wlist[[i]]<-c(wlist[[i]],i)
  }
  return(list(wlist,len_words))
}
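
A toy call (sample_size = 1 keeps every line, so the result is deterministic):

ul<-unique_list("the cat sat on the mat and the cat slept",1)
sapply(ul[[1]],length)   # per-word counts after stop-word removal: cat 2, sat 1, mat 1, slept 1
ul[[2]]                  # total words after stop-word removal: 5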




# Number of most frequent words needed to cover cov_pct of all words in a sample

coverage_pct<-function(uni_list,cov_pct=.5){
  tt.ordered<-uni_list[[1]][order(sapply(uni_list[[1]],length),decreasing = T)]
  
  coverage_count<-0
  i<-1
  coverage_limit<-uni_list[[2]]*cov_pct
  while(TRUE){
    coverage_count<-coverage_count+length(tt.ordered[[i]])
    if (coverage_count>=coverage_limit) break
    i<-i+1
    
  }
  return(i)
}
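
Continuing the toy example (re-created here so the call is self-contained):

ul<-unique_list("the cat sat on the mat and the cat slept",1)
coverage_pct(ul,cov_pct=0.5)   # 2: "cat" plus one more word covers half of the 5 words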

# Produce the counts of the 10 most frequent n-grams for a sample of an input file

uni_top10<-function(file2read,sample_size, gram=1){
  file2read<-file2read[rbinom(length(file2read),1,sample_size)>0]
  words<-unlist(strsplit(file2read,"\40+"))
  #keeps only letters
  words<-tolower(gsub("([^a-zA-Z])","",words))
  #remove stopwords
  words<-words[!words %in% c(stopwords("en"),"","g","rt","ass")]
  # number of words after exclusions (kept for reference, not used below)
  len_words<-length(words)
  
  # build the n-gram vector by pairing each word with the word(s) that follow it
  # (lead() is from dplyr; the final element(s) pair with NA)
  if (gram==1) {
    uni_lag_vector<-words
  } else if (gram==2) {
    uni_lag_vector<-paste(words,lead(words))
  } else if (gram==3) {
    uni_lag_vector<-paste(words,lead(words),lead(words,2))
  }
  
  # accumulate one entry per occurrence under each n-gram's name
  wlist<-list()
  
  for (i in uni_lag_vector){
    wlist[[i]]<-c(wlist[[i]],i)
  }
  
  wlist<-wlist[order(sapply(wlist,length),decreasing = T)]
  top_list<-sapply(wlist[1:10],length)
  
  return(top_list)
}


#function: take a corpus and return a ggplot object of the top 10 n-gram (1, 2 or 3) terms

forPlotTop10<-function(corpora, ngram){
  tdm.temp<-TermDocumentMatrix(corpora, control=list(tokenize = function(x) NGramTokenizer(x, Weka_control(min=ngram, max=ngram)), wordLengths = c(3,Inf)))
  #data.table of term indices: tdm.temp$i has one entry per (term, document) pair,
  #so .N below counts the number of documents containing each term
  tdm.summary<-data.table(Term.ind=tdm.temp$i)
  #positions of the Top 10 most frequent terms
  Top10.tdm<-tdm.summary[,.("Freq"=.N),by=Term.ind][order(-Freq)][1:10,]
  #dataframe for ggplot
  forplot<-data.frame(Terms=Terms(tdm.temp)[Top10.tdm$Term.ind],Freq=Top10.tdm$Freq)
  #reorder from highest to lowest for ggplot
  forplot$Terms<-factor(forplot$Terms,levels=forplot$Terms[order(forplot$Freq,decreasing = F)])
  #ggplot object
  gplotTop10<-ggplot(forplot,aes(x=Terms,y=Freq,fill=Terms))+geom_bar(stat="identity",color="black")+
    coord_flip()+labs(title=paste0("Top ",ngram, " Word Terms"),x="Term", y = "Freq")+
    scale_fill_manual(values=c('#fff7ec','#ffffcc','#ffeda0','#fed976','#feb24c','#fd8d3c','#fc4e2a','#e31a1c','#bd0026','#800026'))+
    theme(legend.position="none")
  
  return(gplotTop10)
}

# Create the input for biGram: a data.table of bigrams and their frequencies
Tdm.biGram<-function(corpora, ngram=2){
  tdm.temp<-TermDocumentMatrix(corpora, control=list(tokenize = function(x) NGramTokenizer(x, Weka_control(min=ngram, max=ngram)),
                                                            wordLengths = c(3,Inf)))
  #data.table for term matrix aggregation
  tdm.summary<-data.table(Term.ind=tdm.temp$i)
  #frequency of every bigram, ordered from most to least frequent
  tdm.summary<-tdm.summary[,.("Freq"=.N),by=Term.ind][order(-Freq)]
  #data.table of bigrams and their frequencies
  tdm.summary2<-data.table(Terms=Terms(tdm.temp)[tdm.summary$Term.ind],Freq=tdm.summary$Freq,Term.ind=tdm.summary$Term.ind)
  
  #return dataframe that has bigrams and their frequency
  return(tdm.summary2)
}


# Take the data.table from Tdm.biGram and a word; return a table of the most likely next words
biGram<-function(df,word){
  return(df[grep(paste0("(^",word,")(\\s)"),df$Terms)
            ][,`:=`(Prop=percent(prop.table(Freq)),Next=gsub(paste0(word," "),"",Terms))
              ][1:10,.(Next,Freq,Prop)])
}