The goal of this project is to demonstrate familiarity with the data and to outline the path toward the word prediction algorithm.
# Packages used throughout the report
library(readr); library(dplyr); library(scales)
library(tm); library(RWeka); library(data.table)
library(ggplot2); library(gridExtra)
# Open connections to the three source files
con.twitter<-file(paste0(path2data,"en_US.twitter.txt"))
con.news<-file(paste0(path2data,"en_US.news.txt"))
con.blogs<-file(paste0(path2data,"en_US.blogs.txt"))
# Read files
twitter<-read_lines(con.twitter)
news<-read_lines(con.news)
blogs<-read_lines(con.blogs)
summary.counts<-data.frame(category=c("twitter","news","blogs"),lines=NA,words=NA)
summary.counts$lines<-sapply(list(twitter,news,blogs),length)
summary.counts$words<-sapply(list(twitter,news,blogs),count_words)
##   category   lines    words
## 1  twitter 2360148 30373543
## 2     news 1010242 34372530
## 3    blogs  899288 37334131
Sampling was done to extract 5% of the documents from the data; these samples were used to determine:
Unique words in a sample (100% coverage)
# Generate lists where each element is a unique word. Each element contains one entry per
# occurrence of that word, so the number of elements corresponds to unique words and the
# length of an element corresponds to the count of that word.
# Sampled at 5% of the lines
# Stopwords removed
# All special characters and numbers removed
uni.twitter<-unique_list(twitter,0.05)
uni.news<-unique_list(news,0.05)
uni.blogs<-unique_list(blogs,0.05)
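# Illustration (not part of the original code): the 5% sampling inside unique_list
# keeps each line of a file independently with probability 0.05
set.seed(123)                                              # reproducibility; not in the original
twitter.sample<-twitter[rbinom(length(twitter),1,0.05)>0]  # Bernoulli draw per line
length(twitter.sample)/length(twitter)                     # approximately 0.05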
# All words, 50% and 90% coverage, 100% (unique counts)
uni.coverage<-data.frame(category=c("twitter","news","blogs"),Sample5Pct=NA,cover50=NA,cover90=NA,cover100=NA)
uni.coverage$Sample5Pct<-comma(sapply(list(uni.twitter,uni.news,uni.blogs),"[[",2))
uni.coverage$cover50<-comma(sapply(list(uni.twitter,uni.news,uni.blogs),coverage_pct,cov_pct=.5))
uni.coverage$cover90<-comma(sapply(list(uni.twitter,uni.news,uni.blogs),coverage_pct,cov_pct=.9))
uni.coverage$cover100<-comma(sapply(list(uni.twitter,uni.news,uni.blogs),coverage_pct,cov_pct=1))
print(uni.coverage)
##   category Sample5Pct cover50 cover90 cover100
## 1  twitter    926,490     605  28,434  121,083
## 2     news  1,026,130   1,322  25,286  115,756
## 3    blogs  1,000,355   1,007  19,442   97,222
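Here Sample5Pct is the total number of words in the 5% sample (after stopword removal), cover50 and cover90 are the numbers of most frequent unique words whose combined occurrences reach 50% and 90% of those words, and cover100 equals the total number of unique words. An illustrative sketch of the coverage idea on toy data (not the report's coverage_pct, which is listed in the appendix):
# toy data: how many of the most frequent words cover 50% of all occurrences?
word_counts<-sort(table(c("a","a","a","b","b","c")), decreasing = TRUE)
which(cumsum(word_counts) >= 0.5*sum(word_counts))[1]   # 1: "a" alone covers 50%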
Top 10 most frequent words, 2-word and 3-word combinations. Two approaches were considered for the exploratory analysis:
As an experiment, the Twitter file (a 2% sample) was unlisted into a single word vector and the most frequent 1-, 2- and 3-word combinations were identified
# uni_top10 (appendix) generates the top 10 grams (1-, 2- and 3-word combinations) from a sample
top<-list() #list to store results of uni_top10
all_files<-c("twitter","news","blogs")
for (i in 1:3){                 # loop through the three source files
  for (j in c(1,2,3)){          # loop through grams: 1, 2 and 3
    top[[paste0(all_files[i],"_n",j)]]<-uni_top10(eval(as.symbol(all_files[i])),.02, gram=j)
  }
}
# Use twitter file for illustration
# Top 1 for Twitter
twitter.top1<-data.frame(Top=names(top$twitter_n1),Freq=top$twitter_n1)
twitter.top1$Top<-factor(twitter.top1$Top,levels=twitter.top1$Top[order(twitter.top1$Freq,decreasing = F)])
Top1<-ggplot(twitter.top1,aes(x=Top,y=Freq,fill=Top))+geom_bar(stat="identity",color="black")+
coord_flip()+labs(title="Top 1 Word Terms",x="Freq", y = "Term")+
scale_fill_manual(values=c('#fff7ec','#ffffcc','#ffeda0','#fed976','#feb24c','#fd8d3c','#fc4e2a','#e31a1c','#bd0026','#800026'))+
theme(legend.position="none")
# Top 2 for Twitter
twitter.top2<-data.frame(Top=names(top$twitter_n2),Freq=top$twitter_n2)
twitter.top2$Top<-factor(twitter.top2$Top,levels=twitter.top2$Top[order(twitter.top2$Freq,decreasing = F)])
Top2<-ggplot(twitter.top2,aes(x=Top,y=Freq,fill=Top))+geom_bar(stat="identity",color="black")+
coord_flip()+labs(title="Top 2 Word Terms",x="Freq", y = "Term")+
scale_fill_manual(values=c('#fff7ec','#ffffcc','#ffeda0','#fed976','#feb24c','#fd8d3c','#fc4e2a','#e31a1c','#bd0026','#800026'))+
theme(legend.position="none")
# Top 3 for Twitter
twitter.top3<-data.frame(Top=names(top$twitter_n3),Freq=top$twitter_n3)
twitter.top3$Top<-factor(twitter.top3$Top,levels=twitter.top3$Top[order(twitter.top3$Freq,decreasing = F)])
Top3<-ggplot(twitter.top3,aes(x=Top,y=Freq,fill=Top))+geom_bar(stat="identity",color="black")+
coord_flip()+labs(title="Top 3 Word Terms",x="Freq", y = "Term")+
scale_fill_manual(values=c('#fff7ec','#ffffcc','#ffeda0','#fed976','#feb24c','#fd8d3c','#fc4e2a','#e31a1c','#bd0026','#800026'))+
theme(legend.position="none")
grid.arrange(Top1, Top2, Top3, nrow = 1,top="Use R List Functionality to Accumulate Grams")
(Figure omitted: top 10 1-, 2- and 3-word terms for the Twitter sample, titled "Use R List Functionality to Accumulate Grams".)
# Create Corpus
Corpus.twitter<-VCorpus(VectorSource(twitter[rbinom(length(twitter),1,.02)>0]))
# Cleanup
#1. keep only letters and spaces
removeNumSpecial <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
Corpus.twitter<- tm_map(Corpus.twitter, content_transformer(removeNumSpecial))
#2. lower case
Corpus.twitter<-tm_map(Corpus.twitter, content_transformer(tolower))
#3. remove stopwords
Stopwords<-c(stopwords("en"),"rt")
Corpus.twitter<- tm_map(Corpus.twitter, removeWords, Stopwords)
#4. remove whitespace
Corpus.twitter<- tm_map(Corpus.twitter, stripWhitespace)
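# Illustration (not part of the original report): the same cleanup applied to a single string;
# tm's removeWords and stripWhitespace also accept plain character vectors
example<-"RT Can't wait 4 the weekend!!!"
example<-removeNumSpecial(example)          # "RT Cant wait  the weekend"
example<-tolower(example)                   # "rt cant wait  the weekend"
example<-removeWords(example, Stopwords)    # " cant wait   weekend"
stripWhitespace(example)                    # " cant wait weekend"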
# Use tm to create a chart of the most frequent 1-, 2- and 3-gram terms. Function forPlotTop10 (appendix) returns a ggplot object
Top1.twitter<-forPlotTop10(corpora=Corpus.twitter,ngram=1)
Top2.twitter<-forPlotTop10(corpora=Corpus.twitter,ngram=2)
Top3.twitter<-forPlotTop10(corpora=Corpus.twitter,ngram=3)
grid.arrange(Top1.twitter, Top2.twitter, Top3.twitter, nrow = 1, top="Use 'tm' package to generate Terms/N-grams")
(Figure omitted: top 10 1-, 2- and 3-gram terms for the Twitter corpus, titled "Use 'tm' package to generate Terms/N-grams".)
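For reference, the RWeka tokenizer used inside forPlotTop10 and Tdm.biGram (see the appendix) splits text into overlapping n-grams of the requested length; a quick illustration (not part of the original report):
library(RWeka)
NGramTokenizer("thanks for the follow", Weka_control(min = 2, max = 2))
# returns the bigrams "thanks for", "for the", "the follow"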
N-gram models are the most frequently used approach for word prediction. The algorithm looks at a large collection of documents and estimates the relative frequency of the next word given one (bigram) or two (trigram) preceding words. The assumption is that the probability of a word depends only on a few preceding words, similar to a Markov process.
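For a bigram model this corresponds to the maximum likelihood estimate P(next | previous) = count(previous, next) / count(previous). A minimal sketch of that calculation (illustrative only, not the report's code):
# illustrative helper, not part of the report's code
bigram_prob<-function(words, prev, nxt){
  pairs<-paste(head(words,-1), tail(words,-1))              # adjacent word pairs
  sum(pairs==paste(prev,nxt)) / sum(head(words,-1)==prev)   # count(prev,next) / count(prev)
}
bigram_prob(c("right","now","or","right","here"), "right", "now")   # 0.5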
The next step is to take the transformed bigram term matrix and return a table of the most likely next words given a first word. The test words are “cant”, “right” and “last”:
# Create dataframe that contains bigrams and their frequency counts
tdm.summary2<-Tdm.biGram(Corpus.twitter)
biGram(tdm.summary2,"cant")
## Next Freq Prop
## 1: wait 345 31.5%
## 2: believe 60 5.5%
## 3: get 40 3.6%
## 4: even 28 2.6%
## 5: sleep 27 2.5%
## 6: stop 21 1.9%
## 7: stand 19 1.7%
## 8: go 18 1.6%
## 9: help 18 1.6%
## 10: see 17 1.6%
biGram(tdm.summary2,"right")
## Next Freq Prop
## 1: now 308 40.5%
## 2: back 18 2.4%
## 3: im 7 0.9%
## 4: next 7 0.9%
## 5: thing 5 0.7%
## 6: middle 5 0.7%
## 7: just 4 0.5%
## 8: lol 4 0.5%
## 9: side 4 0.5%
## 10: one 4 0.5%
biGram(tdm.summary2,"last")
## Next Freq Prop
## 1: night 223 30.7%
## 2: year 46 6.3%
## 3: week 42 5.8%
## 4: day 31 4.3%
## 5: time 29 4.0%
## 6: tweet 16 2.2%
## 7: years 16 2.2%
## 8: nights 15 2.1%
## 9: name 13 1.8%
## 10: minute 13 1.8%
The Shiny app will consist of a text input field (box). Upon completion of a word (after the user hits “space”), the user will be presented with the 3 most likely next words in a separate window below the main input field, similar to many keyboard apps on today’s mobile platforms.
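A minimal sketch of that layout, assuming a hypothetical predict_next() function that will return the three most likely next words (illustrative only, not the final app):
library(shiny)
ui<-fluidPage(
  textInput("phrase","Type your text:"),
  verbatimTextOutput("suggestions")                 # window below the main input field
)
server<-function(input,output){
  output$suggestions<-renderText({
    # react only once the last character is a space, i.e. a word was completed;
    # predict_next() is a placeholder for the n-gram lookup still to be built
    if (grepl("\\s$", input$phrase)) paste(predict_next(input$phrase), collapse=" | ") else ""
  })
}
# shinyApp(ui, server)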
# count words; "\40" is the octal escape for the space character,
# e.g. count_words(c("hello world","one two three")) returns 5
count_words<-function(x) length(unlist(strsplit(x,"\40")))
# Create a named list where each element name is a unique word and the element holds
# one entry per occurrence, so the element length gives the word count
unique_list<-function(file2read,sample_size){
  # keep each line with probability sample_size
  file2read<-file2read[rbinom(length(file2read),1,sample_size)>0]
  words<-unlist(strsplit(file2read,"\40"))
  # keep only letters
  words<-tolower(gsub("([^a-zA-Z])","",words))
  # remove stopwords
  words<-words[!words %in% stopwords("en")]
  # number of words after exclusions
  len_words<-length(words)
  # empty list to store values
  wlist<-list()
  # append each occurrence to the element named after the word
  for (i in words){
    wlist[[i]]<-c(wlist[[i]],i)
  }
  return(list(wlist,len_words))
}
# Number of the most frequent unique words needed to cover cov_pct of all
# word occurrences in the sample produced by unique_list
coverage_pct<-function(uni_list,cov_pct=.5){
  # order unique words by their counts, most frequent first
  tt.ordered<-uni_list[[1]][order(sapply(uni_list[[1]],length),decreasing = T)]
  coverage_count<-0
  i<-1
  coverage_limit<-uni_list[[2]]*cov_pct
  # accumulate counts until the coverage threshold is reached
  while(TRUE){
    coverage_count<-coverage_count+length(tt.ordered[[i]])
    if (coverage_count>=coverage_limit) break
    i<-i+1
  }
  return(i)
}
# Produce the 10 most frequent n-gram terms for a sample of an input file
uni_top10<-function(file2read,sample_size, gram=1){
  # keep each line with probability sample_size
  file2read<-file2read[rbinom(length(file2read),1,sample_size)>0]
  words<-unlist(strsplit(file2read,"\40+"))
  # keep only letters
  words<-tolower(gsub("([^a-zA-Z])","",words))
  # remove stopwords and junk tokens
  words<-words[!words %in% c(stopwords("en"),"","g","rt","ass")]
  # number of words after exclusions
  len_words<-length(words)
  # build n-grams by pasting each word with the word(s) that follow it
  if (gram==1) {
    uni_lag_vector<-words
  } else if (gram==2) {
    uni_lag_vector<-paste(words,lead(words,1))
  } else if (gram==3) {
    uni_lag_vector<-paste(words,lead(words,1),lead(words,2))
  }
  # accumulate occurrences of each term in a named list
  wlist<-list()
  for (i in uni_lag_vector){
    wlist[[i]]<-c(wlist[[i]],i)
  }
  # sort by frequency and keep the 10 most frequent terms
  wlist<-wlist[order(sapply(wlist,length),decreasing = T)]
  top_list<-sapply(wlist[1:10],length)
  return(top_list)
}
# Function: take a corpus and return a ggplot object of the top 10 n-gram terms
forPlotTop10<-function(corpora, ngram){
  tdm.temp<-TermDocumentMatrix(corpora, control=list(tokenize = function(x) NGramTokenizer(x, Weka_control(min=ngram, max=ngram)),
                                                     wordLengths = c(3,Inf)))
  # data.table for term matrix aggregation
  tdm.summary<-data.table(Term.ind=tdm.temp$i)
  # determine the positions of the 10 most frequent terms
  Top10.tdm<-tdm.summary[,.("Freq"=.N),by=Term.ind][order(-Freq)][1:10,]
  # dataframe for ggplot
  forplot<-data.frame(Terms=Terms(tdm.temp)[Top10.tdm$Term.ind],Freq=Top10.tdm$Freq)
  # reorder from highest to lowest for ggplot
  forplot$Terms<-factor(forplot$Terms,levels=forplot$Terms[order(forplot$Freq,decreasing = F)])
  # ggplot object
  gplotTop10<-ggplot(forplot,aes(x=Terms,y=Freq,fill=Terms))+geom_bar(stat="identity",color="black")+
    coord_flip()+labs(title=paste0("Top ",ngram, " Word Terms"),x="Term", y = "Freq")+
    scale_fill_manual(values=c('#fff7ec','#ffffcc','#ffeda0','#fed976','#feb24c','#fd8d3c','#fc4e2a','#e31a1c','#bd0026','#800026'))+
    theme(legend.position="none")
  return(gplotTop10)
}
# Function to create the input for biGram: a data.table of bigrams and their frequencies
Tdm.biGram<-function(corpora, ngram=2){
  tdm.temp<-TermDocumentMatrix(corpora, control=list(tokenize = function(x) NGramTokenizer(x, Weka_control(min=ngram, max=ngram)),
                                                     wordLengths = c(3,Inf)))
  # data.table for term matrix aggregation
  tdm.summary<-data.table(Term.ind=tdm.temp$i)
  # frequency of each term, ordered from most to least frequent
  tdm.summary<-tdm.summary[,.("Freq"=.N),by=Term.ind][order(-Freq)]
  # attach the term text to each term index
  tdm.summary2<-data.table(Terms=Terms(tdm.temp)[tdm.summary$Term.ind],Freq=tdm.summary$Freq,Term.ind=tdm.summary$Term.ind)
  # return data.table with bigrams and their frequencies
  return(tdm.summary2)
}
# Take the data.table from Tdm.biGram and a first word; return the 10 most likely next words
biGram<-function(df,word){
  return(df[grep(paste0("(^",word,")(\\s)"),df$Terms)      # bigrams starting with the given word
            ][,`:=`(Prop=percent(prop.table(Freq)),Next=gsub(paste0(word," "),"",Terms))
            ][1:10,.(Next,Freq,Prop)])
}